Merge pull request #137 from bact/dev

wannaphong · web-flow · commit c1486995af78 · 2018-10-29T11:17:43.000+07:00
Update Peter Norvig's spell checker to suggest words based on probability
diff --git a/.gitignore b/.gitignore
@@ -58,6 +58,7 @@ target/
 
 # Jupyter Notebook
 .ipynb_checkpoints
+Untitled*.ipynb
 
 # IDE files
 .idea
diff --git a/examples/spell.py b/examples/spell.py
@@ -1,8 +1,21 @@
 # -*- coding: utf-8 -*-
 
 from pythainlp.spell import spell
+from pythainlp.spell.pn import spell as pn_tnc_spell
+from pythainlp.spell.pn import correct as pn_tnc_correct
+from pythainlp.spell.pn import NorvigSpellChecker
+from pythainlp.corpus import ttc
 
-a = spell("สี่เหลียม")
-print(a)  # ['สี่เหลี่ยม']
+# Generic checker from the pythainlp.spell module
+spell("สี่เหลียม")  # ['สี่เหลี่ยม']
+# spell("สี่เหลียม", engine="hunspell")  # available on some Linux systems
 
-# a = spell("สี่เหลียม", engine="hunspell")  # available in some Linux systems
+# Checker from the pythainlp.spell.pn module (Peter Norvig's algorithm)
+pn_tnc_spell("เหลืยม")
+pn_tnc_correct("เหลืยม")
+
+# Peter Norvig's checker from pythainlp.spell.pn, built with a custom dictionary
+ttc_word_freqs = ttc.get_word_frequency_all()
+pn_ttc_spell_checker = NorvigSpellChecker(custom_dict=ttc_word_freqs)
+pn_ttc_spell_checker.spell("เหลืยม")
+pn_ttc_spell_checker.correct("เหลืยม")
diff --git a/pythainlp/corpus/__init__.py b/pythainlp/corpus/__init__.py
@@ -3,16 +3,16 @@
 import os
 
 import requests
-from future.moves.urllib.request import urlopen
 from pythainlp.tools import get_path_data, get_path_db
 from tinydb import Query, TinyDB
 from tqdm import tqdm
+from urllib.request import urlopen
 
 CORPUS_DB_URL = (
     "https://raw.github.com/PyThaiNLP/pythainlp-corpus/master/db.json"
 )
 
-# __all__ = ["thaipos", "thaiword","alphabet","tone","country","wordnet"]
+# __all__ = ["thaipos", "thaiword", "alphabet", "tone", "country", "wordnet"]
 path_db_ = get_path_db()
 
 
diff --git a/pythainlp/corpus/tnc.py b/pythainlp/corpus/tnc.py
@@ -1,6 +1,7 @@
 # -*- coding: utf-8 -*-
 """
-Word frequency from Thai National Corpus
+Thai National Corpus word frequency
+
 Credit: Korakot Chaovavanich‎
 https://www.facebook.com/photo.php?fbid=363640477387469&set=gm.434330506948445&type=3&permPage=1
 """
@@ -57,6 +58,6 @@ def get_word_frequency_all():
     listword = []
     for line in lines:
         listindata = line.split("	")
-        listword.append((listindata[0], listindata[1]))
+        listword.append((listindata[0], int(listindata[1])))
 
     return listword
diff --git a/pythainlp/corpus/ttc.py b/pythainlp/corpus/ttc.py
@@ -1,6 +1,7 @@
 # -*- coding: utf-8 -*-
 """
-TTC Thai word frequency
+Thai Textbook Corpus (TTC) word frequency
+
 Credit: Korakot Chaovavanich‎
 https://www.facebook.com/photo.php?fbid=363640477387469&set=gm.434330506948445&type=3&permPage=1
 """
@@ -13,7 +14,7 @@
 
 def get_word_frequency_all():
     """
-    ดึงข้อมูลความถี่คำของ TTC มาใช้งาน
+    ดึงข้อมูลความถี่คำของ Thai Textbook Corpus (TTC) มาใช้งาน
     โดยมีรูปแบบข้อมูลเป็น List[Tuple] [(word, frequency), ...]
     """
     path = os.path.join(os.path.expanduser("~"), "pythainlp-data")
@@ -34,6 +35,6 @@ def get_word_frequency_all():
     listword = []
     for line in lines:
         listindata = line.split("	")
-        listword.append((listindata[0], listindata[1]))
+        listword.append((listindata[0], int(listindata[1])))
 
     return listword
diff --git a/pythainlp/ner/__init__.py b/pythainlp/ner/__init__.py
@@ -5,6 +5,7 @@
 from pythainlp.corpus import download, get_file, stopwords
 from pythainlp.tag import pos_tag
 from pythainlp.tokenize import word_tokenize
+from pythainlp.util import is_thaiword
 
 try:
     import sklearn_crfsuite
@@ -22,62 +23,55 @@
 _STOPWORDS = stopwords.words("thai")
 
 
-def _is_thaichar(ch):  # เป็นอักษรไทยหรือไม่
-    ch_val = ord(ch)
-    if ch_val >= 3584 and ch_val <= 3711:
-        return True
-    return False
-
-
-def _is_thaiword(word):  # เป็นคำที่มีแต่อักษรไทยหรือไม่
-    for ch in word:
-        if ch != "." and not _is_thaichar(ch):
-            return False
-    return True
-
-
 def _is_stopword(word):  # เช็คว่าเป็นคำฟุ่มเฟือย
     return word in _STOPWORDS
 
 
 def _doc2features(doc, i):
     word = doc[i][0]
     postag = doc[i][1]
+
     # Features from current word
     features = {
         "word.word": word,
         "word.stopword": _is_stopword(word),
-        "word.isthai": _is_thaiword(word),
+        "word.isthai": is_thaiword(word),
         "word.isspace": word.isspace(),
         "postag": postag,
         "word.isdigit()": word.isdigit(),
     }
-
     if word.isdigit() and len(word) == 5:
         features["word.islen5"] = True
 
+    # Features from previous word
     if i > 0:
         prevword = doc[i - 1][0]
-        postag1 = doc[i - 1][1]
-        features["word.prevword"] = prevword
-        features["word.previsspace"] = prevword.isspace()
-        features["word.previsthai"] = _is_thaiword(prevword)
-        features["word.prevstopword"] = _is_stopword(prevword)
-        features["word.prepostag"] = postag1
-        features["word.prevwordisdigit"] = prevword.isdigit()
+        prevpostag = doc[i - 1][1]
+        prev_features = {
+            "word.prevword": prevword,
+            "word.previsspace": prevword.isspace(),
+            "word.previsthai": is_thaiword(prevword),
+            "word.prevstopword": _is_stopword(prevword),
+            "word.prevpostag": prevpostag,
+            "word.prevwordisdigit": prevword.isdigit(),
+        }
+        features.update(prev_features)
     else:
         features["BOS"] = True  # Special "Beginning of Sequence" tag
 
     # Features from next word
     if i < len(doc) - 1:
         nextword = doc[i + 1][0]
-        postag1 = doc[i + 1][1]
-        features["word.nextword"] = nextword
-        features["word.nextisspace"] = nextword.isspace()
-        features["word.nextpostag"] = postag1
-        features["word.nextisthai"] = _is_thaiword(nextword)
-        features["word.nextstopword"] = _is_stopword(nextword)
-        features["word.nextwordisdigit"] = nextword.isdigit()
+        nextpostag = doc[i + 1][1]
+        next_features = {
+            "word.nextword": nextword,
+            "word.nextisspace": nextword.isspace(),
+            "word.nextpostag": nextpostag,
+            "word.nextisthai": is_thaiword(nextword),
+            "word.nextstopword": _is_stopword(nextword),
+            "word.nextwordisdigit": nextword.isdigit(),
+        }
+        features.update(next_features)
     else:
         features["EOS"] = True  # Special "End of Sequence" tag
 
diff --git a/pythainlp/spell/pn.py b/pythainlp/spell/pn.py
diff --git a/pythainlp/util/__init__.py b/pythainlp/util/__init__.py