from nltk import pos_tag, word_tokenize
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer


def get_wordnet_pos(tag):
    """Translate a part-of-speech tag into the matching WordNet POS constant.

    Only the first letter of ``tag`` is inspected: ``J`` maps to adjective,
    ``V`` to verb, ``N`` to noun and ``R`` to adverb.

    Args:
        tag: POS tag string, e.g. the second item of a ``pos_tag`` tuple.

    Returns:
        The corresponding ``wordnet`` constant, or ``None`` when the tag
        starts with any other letter (callers choose their own fallback).
    """
    if tag.startswith('J'):
        return wordnet.ADJ
    if tag.startswith('V'):
        return wordnet.VERB
    if tag.startswith('N'):
        return wordnet.NOUN
    if tag.startswith('R'):
        return wordnet.ADV
    return None


def combine_words_through_grammar(lst, d):
    """Merge different grammatical forms of a word into its base form.

    Each word in ``lst`` is POS-tagged and lemmatized. A word is replaced by
    its lemma only when that lemma is present in ``d``; otherwise the word is
    kept exactly as given. ``user_difficulty_level`` applies this to each
    date's word list before looking the words up in ``d``.

    Args:
        lst: List of word strings.
        d: Container of known words (anything supporting ``in``).

    Returns:
        A new list with the same length and order as ``lst``. The input list
        is left unmodified.
    """
    if not lst:
        # Nothing to tag; still hand back a fresh list for a uniform contract.
        return list(lst)

    wnl = WordNetLemmatizer()
    merged = []
    for word, tag in pos_tag(lst):  # (word, POS tag) pairs
        # Tags with no WordNet equivalent are lemmatized as nouns.
        wordnet_pos = get_wordnet_pos(tag) or wordnet.NOUN
        lemma = wnl.lemmatize(word, pos=wordnet_pos)
        # Only swap in the lemma when it is a known word, so unknown words
        # are never rewritten into something that is also unknown.
        merged.append(lemma if lemma in d else word)
    return merged