Skip to content

Commit c148699

Browse files
authored
Merge pull request #137 from bact/dev
Update Peter Norvig's spell checker to suggest words based on probability
2 parents 301fc30 + 0f315b9 commit c148699

File tree

8 files changed

+325
-145
lines changed

8 files changed

+325
-145
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -58,6 +58,7 @@ target/
5858

5959
# Jupyter Notebook
6060
.ipynb_checkpoints
61+
Untitled*.ipynb
6162

6263
# IDE files
6364
.idea

examples/spell.py

Lines changed: 16 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1,8 +1,21 @@
11
# -*- coding: utf-8 -*-
22

33
from pythainlp.spell import spell
4+
from pythainlp.spell.pn import spell as pn_tnc_spell
5+
from pythainlp.spell.pn import correct as pn_tnc_correct
6+
from pythainlp.spell.pn import NorvigSpellChecker
7+
from pythainlp.corpus import ttc
48

5-
a = spell("สี่เหลียม")
6-
print(a) # ['สี่เหลี่ยม']
9+
# checker from pythainlp.spell module (generic)
10+
spell("สี่เหลียม") # ['สี่เหลี่ยม']
11+
# spell("สี่เหลียม", engine="hunspell") # available in some Linux systems
712

8-
# a = spell("สี่เหลียม", engine="hunspell") # available in some Linux systems
13+
# checker from pythainlp.spell.pn module (specified algorithm - Peter Norvig's)
14+
pn_tnc_spell("เหลืยม")
15+
pn_tnc_correct("เหลืยม")
16+
17+
# checker from pythainlp.spell.pn module (specified algorithm, custom dictionary)
18+
ttc_word_freqs = ttc.get_word_frequency_all()
19+
pn_ttc_spell_checker = NorvigSpellChecker(custom_dict=ttc_word_freqs)
20+
pn_ttc_spell_checker.spell("เหลืยม")
21+
pn_ttc_spell_checker.correct("เหลืยม")

pythainlp/corpus/__init__.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -3,16 +3,16 @@
33
import os
44

55
import requests
6-
from future.moves.urllib.request import urlopen
76
from pythainlp.tools import get_path_data, get_path_db
87
from tinydb import Query, TinyDB
98
from tqdm import tqdm
9+
from urllib.request import urlopen
1010

1111
CORPUS_DB_URL = (
1212
"https://raw.github.com/PyThaiNLP/pythainlp-corpus/master/db.json"
1313
)
1414

15-
# __all__ = ["thaipos", "thaiword","alphabet","tone","country","wordnet"]
15+
# __all__ = ["thaipos", "thaiword", "alphabet", "tone", "country", "wordnet"]
1616
path_db_ = get_path_db()
1717

1818

pythainlp/corpus/tnc.py

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,7 @@
11
# -*- coding: utf-8 -*-
22
"""
3-
Word frequency from Thai National Corpus
3+
Thai National Corpus word frequency
4+
45
Credit: Korakot Chaovavanich‎
56
https://www.facebook.com/photo.php?fbid=363640477387469&set=gm.434330506948445&type=3&permPage=1
67
"""
@@ -57,6 +58,6 @@ def get_word_frequency_all():
5758
listword = []
5859
for line in lines:
5960
listindata = line.split(" ")
60-
listword.append((listindata[0], listindata[1]))
61+
listword.append((listindata[0], int(listindata[1])))
6162

6263
return listword

pythainlp/corpus/ttc.py

Lines changed: 4 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,7 @@
11
# -*- coding: utf-8 -*-
22
"""
3-
TTC Thai word frequency
3+
Thai Textbook Corpus (TTC) word frequency
4+
45
Credit: Korakot Chaovavanich‎
56
https://www.facebook.com/photo.php?fbid=363640477387469&set=gm.434330506948445&type=3&permPage=1
67
"""
@@ -13,7 +14,7 @@
1314

1415
def get_word_frequency_all():
1516
"""
16-
ดึงข้อมูลความถี่คำของ TTC มาใช้งาน
17+
ดึงข้อมูลความถี่คำของ Thai Textbook Corpus (TTC) มาใช้งาน
1718
โดยมีรูปแบบข้อมูลเป็น List[Tuple] [(word, frequency), ...]
1819
"""
1920
path = os.path.join(os.path.expanduser("~"), "pythainlp-data")
@@ -34,6 +35,6 @@ def get_word_frequency_all():
3435
listword = []
3536
for line in lines:
3637
listindata = line.split(" ")
37-
listword.append((listindata[0], listindata[1]))
38+
listword.append((listindata[0], int(listindata[1])))
3839

3940
return listword

pythainlp/ner/__init__.py

Lines changed: 24 additions & 30 deletions
Original file line number | Diff line number | Diff line change
@@ -5,6 +5,7 @@
55
from pythainlp.corpus import download, get_file, stopwords
66
from pythainlp.tag import pos_tag
77
from pythainlp.tokenize import word_tokenize
8+
from pythainlp.util import is_thaiword
89

910
try:
1011
import sklearn_crfsuite
@@ -22,62 +23,55 @@
2223
_STOPWORDS = stopwords.words("thai")
2324

2425

25-
def _is_thaichar(ch): # เป็นอักษรไทยหรือไม่
26-
ch_val = ord(ch)
27-
if ch_val >= 3584 and ch_val <= 3711:
28-
return True
29-
return False
30-
31-
32-
def _is_thaiword(word): # เป็นคำที่มีแต่อักษรไทยหรือไม่
33-
for ch in word:
34-
if ch != "." and not _is_thaichar(ch):
35-
return False
36-
return True
37-
38-
3926
def _is_stopword(word): # เช็คว่าเป็นคำฟุ่มเฟือย
4027
return word in _STOPWORDS
4128

4229

4330
def _doc2features(doc, i):
4431
word = doc[i][0]
4532
postag = doc[i][1]
33+
4634
# Features from current word
4735
features = {
4836
"word.word": word,
4937
"word.stopword": _is_stopword(word),
50-
"word.isthai": _is_thaiword(word),
38+
"word.isthai": is_thaiword(word),
5139
"word.isspace": word.isspace(),
5240
"postag": postag,
5341
"word.isdigit()": word.isdigit(),
5442
}
55-
5643
if word.isdigit() and len(word) == 5:
5744
features["word.islen5"] = True
5845

46+
# Features from previous word
5947
if i > 0:
6048
prevword = doc[i - 1][0]
61-
postag1 = doc[i - 1][1]
62-
features["word.prevword"] = prevword
63-
features["word.previsspace"] = prevword.isspace()
64-
features["word.previsthai"] = _is_thaiword(prevword)
65-
features["word.prevstopword"] = _is_stopword(prevword)
66-
features["word.prepostag"] = postag1
67-
features["word.prevwordisdigit"] = prevword.isdigit()
49+
prevpostag = doc[i - 1][1]
50+
prev_features = {
51+
"word.prevword": prevword,
52+
"word.previsspace": prevword.isspace(),
53+
"word.previsthai": is_thaiword(prevword),
54+
"word.prevstopword": _is_stopword(prevword),
55+
"word.prevpostag": prevpostag,
56+
"word.prevwordisdigit": prevword.isdigit(),
57+
}
58+
features.update(prev_features)
6859
else:
6960
features["BOS"] = True # Special "Beginning of Sequence" tag
7061

7162
# Features from next word
7263
if i < len(doc) - 1:
7364
nextword = doc[i + 1][0]
74-
postag1 = doc[i + 1][1]
75-
features["word.nextword"] = nextword
76-
features["word.nextisspace"] = nextword.isspace()
77-
features["word.nextpostag"] = postag1
78-
features["word.nextisthai"] = _is_thaiword(nextword)
79-
features["word.nextstopword"] = _is_stopword(nextword)
80-
features["word.nextwordisdigit"] = nextword.isdigit()
65+
nextpostag = doc[i + 1][1]
66+
next_features = {
67+
"word.nextword": nextword,
68+
"word.nextisspace": nextword.isspace(),
69+
"word.nextpostag": nextpostag,
70+
"word.nextisthai": is_thaiword(nextword),
71+
"word.nextstopword": _is_stopword(nextword),
72+
"word.nextwordisdigit": nextword.isdigit(),
73+
}
74+
features.update(next_features)
8175
else:
8276
features["EOS"] = True # Special "End of Sequence" tag
8377

0 commit comments

Comments
 (0)