From e2785c40a6ce3593439ced164f9d5758fca09ddf Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=8C=85=E6=9C=88=E7=90=B3?= <1913640604@qq.com>
Date: Wed, 10 May 2023 19:34:35 +0800
Subject: [PATCH] =?UTF-8?q?=E6=9B=B4=E6=96=B0=20'app/difficulty.py'?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

增加一个功能，使生词表中的单词的变形例如复数被识别为同一个单词，准确评级用户的level（使其在判定单词时不受单词的变形影响而错判为高等级词汇）。 把同一个单词的不同形式看作是同一个单词。
源代码对于单词不同形式的处理过于简单，一些复杂形式的过去式变化无法识别出。因此我们引入Python的nltk模块，从而实现对单词的形式变换。
---
 app/difficulty.py | 31 +++++++++++++++++++++++++++++++
 1 file changed, 31 insertions(+)

diff --git a/app/difficulty.py b/app/difficulty.py
index 50aa179..f7c7ae6 100644
--- a/app/difficulty.py
+++ b/app/difficulty.py
@@ -7,6 +7,9 @@
 
 import pickle
 import math
+from nltk import word_tokenize,pos_tag
+from nltk.corpus import wordnet
+from nltk.stem import WordNetLemmatizer
 from wordfreqCMD import remove_punctuation, freq, sort_in_descending_order, sort_in_ascending_order
 
 
@@ -74,6 +77,33 @@ def revert_dict(d):
                 d2[date].append(k)
     return d2
 
+def get_wordnet_pos(tag):
+    """Map a part-of-speech tag string to the matching WordNet POS constant.
+
+    Returns None when the tag does not start with 'J', 'V', 'N' or 'R'.
+    """
+    prefix_to_pos = (('J', wordnet.ADJ), ('V', wordnet.VERB),
+                     ('N', wordnet.NOUN), ('R', wordnet.ADV))
+    for prefix, pos in prefix_to_pos:
+        if tag.startswith(prefix):
+            return pos
+    return None
+
+
+def combine_words_through_grammar(lst, d):
+    """Replace inflected words in lst by their lemma when the lemma is in d.
+
+    lst is a list of words and d a container of known words (tested with
+    `in`). Each word is POS-tagged and lemmatized with WordNet (treated as a
+    noun when the tag has no WordNet equivalent). Returns a new list; lst is
+    left unmodified, and words whose lemma is not in d are kept as they are.
+    """
+    wnl = WordNetLemmatizer()
+    result = []
+    for word, tag in pos_tag(lst):  # tag each word with its part of speech
+        lemma = wnl.lemmatize(word, pos=get_wordnet_pos(tag) or wordnet.NOUN)
+        result.append(lemma if lemma in d else word)  # O(1) lookup for dicts
+    return result
 
 def user_difficulty_level(d_user, d):
     d_user2 = revert_dict(d_user) # key is date, and value is a list of words added in that date
@@ -81,6 +111,7 @@ def user_difficulty_level(d_user, d):
     geometric = 1
     for date in sorted(d_user2.keys(), reverse=True): # most recently added words are more important while determining user's level
         lst = d_user2[date] # a list of words
+        lst=combine_words_through_grammar(lst,d) #合并单词的不同形式
         lst2 = [] # a list of tuples, (word, difficulty level)
         for  word in lst:
             if word in d: