0
0
Fork 0

更新 'app/difficulty.py'

增加一个功能,使生词表中的单词的变形例如复数被识别为同一个单词,准确评级用户的level(使其在判定单词时不受单词的变形影响而错判为高等级词汇)。 把同一个单词的不同形式看作是同一个单词。
源代码对单词不同形式的处理过于简单,一些复杂形式的过去式变化无法被识别。因此我们引入Python的nltk模块,从而实现对单词的形式变换。
Bug476-ZhangWeiHao-BaoYuelin
包月琳 2023-05-10 19:34:35 +08:00
parent 43c719b6b2
commit e2785c40a6
1 changed files with 31 additions and 0 deletions

View File

@ -7,6 +7,9 @@
import pickle import pickle
import math import math
from nltk import word_tokenize,pos_tag
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from wordfreqCMD import remove_punctuation, freq, sort_in_descending_order, sort_in_ascending_order from wordfreqCMD import remove_punctuation, freq, sort_in_descending_order, sort_in_ascending_order
@ -74,6 +77,33 @@ def revert_dict(d):
d2[date].append(k) d2[date].append(k)
return d2 return d2
def get_wordnet_pos(tag):
    """Map a part-of-speech tag to the matching WordNet POS constant.

    The tag is matched on its leading letter: 'J' -> adjective,
    'V' -> verb, 'N' -> noun, 'R' -> adverb.

    Args:
        tag: a part-of-speech tag string, such as the second element of
            the pairs returned by ``pos_tag``.

    Returns:
        The corresponding ``wordnet`` constant, or ``None`` when the tag
        does not start with any of the recognised letters.
    """
    # (leading letter, name of the wordnet attribute) checked in order.
    prefix_table = (
        ('J', 'ADJ'),
        ('V', 'VERB'),
        ('N', 'NOUN'),
        ('R', 'ADV'),
    )
    for leading_letter, attr_name in prefix_table:
        if tag.startswith(leading_letter):
            return getattr(wordnet, attr_name)
    return None
def combine_words_through_grammar(lst, d):
    """Merge inflected forms of a word into the base form known to *d*.

    Each word in *lst* is part-of-speech tagged and lemmatized. When the
    resulting lemma is present in *d*, the lemma replaces the word;
    otherwise the word is kept exactly as given.

    Args:
        lst: a list of words.
        d: a container of known words supporting ``in`` (the caller
            passes the word-to-difficulty mapping).

    Returns:
        A new list of the same length as *lst*. The input list is left
        unmodified.
    """
    tagged_words = pos_tag(lst)  # (word, part-of-speech tag) pairs
    lemmatizer = WordNetLemmatizer()
    combined = []
    for word, tag in tagged_words:
        # Tags with no WordNet equivalent are lemmatized as nouns.
        wordnet_pos = get_wordnet_pos(tag) or wordnet.NOUN
        lemma = lemmatizer.lemmatize(word, pos=wordnet_pos)
        # Only substitute the lemma when it is a known word; a direct
        # membership test replaces the former scan over every key of d.
        combined.append(lemma if lemma in d else word)
    return combined
def user_difficulty_level(d_user, d): def user_difficulty_level(d_user, d):
d_user2 = revert_dict(d_user) # key is date, and value is a list of words added in that date d_user2 = revert_dict(d_user) # key is date, and value is a list of words added in that date
@ -81,6 +111,7 @@ def user_difficulty_level(d_user, d):
geometric = 1 geometric = 1
for date in sorted(d_user2.keys(), reverse=True): # most recently added words are more important while determining user's level for date in sorted(d_user2.keys(), reverse=True): # most recently added words are more important while determining user's level
lst = d_user2[date] # a list of words lst = d_user2[date] # a list of words
lst=combine_words_through_grammar(lst,d) #合并单词的不同形式
lst2 = [] # a list of tuples, (word, difficulty level) lst2 = [] # a list of tuples, (word, difficulty level)
for word in lst: for word in lst:
if word in d: if word in d: