# Reconstructed from git patch e2785c40a6ce3593439ced164f9d5758fca09ddf
# From: <1913640604@qq.com>
# Date: Wed, 10 May 2023 19:34:35 +0800
# Subject: Update 'app/difficulty.py'
#
# Commit message (translated):
#   Add a feature so that inflected forms of a word in the new-word list
#   (for example plurals) are recognised as the same word, so that the user's
#   level is rated accurately and a word is not misjudged as high-level
#   vocabulary just because it appears in an inflected form.  Different forms
#   of the same word are treated as one word.  The previous code handled
#   inflection too simplistically and could not recognise some complex
#   past-tense forms, so Python's nltk module is introduced to convert word
#   forms.
#
# The patch touches app/difficulty.py in three places:
#   1. it adds these module-level imports after ``import math``:
#          from nltk import word_tokenize, pos_tag
#          from nltk.corpus import wordnet
#          from nltk.stem import WordNetLemmatizer
#   2. it adds the two helpers defined below after ``revert_dict``;
#   3. in ``user_difficulty_level(d_user, d)``, right after
#      ``lst = d_user2[date]``, it inserts:
#          lst = combine_words_through_grammar(lst, d)  # merge word forms
#
# In this version the nltk imports are performed inside the helpers, so they
# are only resolved when lemmatisation is actually needed.

# First letter of a POS tag -> name of the matching constant on
# ``nltk.corpus.wordnet``.
_WORDNET_ATTR_BY_TAG_PREFIX = {
    'J': 'ADJ',
    'V': 'VERB',
    'N': 'NOUN',
    'R': 'ADV',
}


def get_wordnet_pos(tag):
    """Map a POS tag to the matching WordNet part-of-speech constant.

    Args:
        tag: POS tag string such as the second element of the tuples
            returned by ``nltk.pos_tag`` (e.g. ``'NNS'``, ``'VBD'``).

    Returns:
        ``wordnet.ADJ``, ``wordnet.VERB``, ``wordnet.NOUN`` or ``wordnet.ADV``
        when the tag starts with ``J``, ``V``, ``N`` or ``R`` respectively;
        ``None`` for any other tag (including the empty string).
    """
    attr_name = _WORDNET_ATTR_BY_TAG_PREFIX.get(tag[:1])
    if attr_name is None:
        return None
    # Imported lazily: tags without a WordNet counterpart never touch nltk.
    from nltk.corpus import wordnet
    return getattr(wordnet, attr_name)


def combine_words_through_grammar(lst, d):
    """Replace inflected words by their lemma when the lemma is known in *d*.

    Every word in *lst* is POS-tagged and lemmatised with WordNet.  A word is
    replaced by its lemma only if that lemma is present in *d*; otherwise the
    word is kept unchanged.  Order and length are preserved, so two different
    forms of the same word may both map to the same lemma.

    Args:
        lst: sequence of words (strings).
        d: container of known words, typically the word -> difficulty-level
            dictionary; only membership (``lemma in d``) is used.

    Returns:
        A new list with the same length as *lst*.  The input sequence is not
        modified (the previous implementation overwrote *lst* in place and
        printed the tagged words to stdout on every call).
    """
    # Nothing to tag: skip loading the tagger and lemmatiser altogether.
    if not lst:
        return []

    from nltk import pos_tag
    from nltk.corpus import wordnet
    from nltk.stem import WordNetLemmatizer

    lemmatizer = WordNetLemmatizer()
    merged = []
    for word, tag in pos_tag(lst):  # (word, POS tag) pairs
        # Tags with no WordNet equivalent are lemmatised as nouns.
        wordnet_pos = get_wordnet_pos(tag) or wordnet.NOUN
        lemma = lemmatizer.lemmatize(word, pos=wordnet_pos)
        # Direct membership test instead of scanning every key of d.
        merged.append(lemma if lemma in d else word)
    return merged