diff --git a/app/difficulty.py b/app/difficulty.py
index f79ca35..8ce2577 100644
--- a/app/difficulty.py
+++ b/app/difficulty.py
@@ -7,6 +7,7 @@
 import pickle
 import math
+from nltk.stem import WordNetLemmatizer
 from wordfreqCMD import remove_punctuation, freq, sort_in_descending_order, sort_in_ascending_order
 
 
 
@@ -75,27 +76,11 @@ def revert_dict(d):
     return d2
 
 
-def combine_words_through_grammar(lst,d): #通过语法合并同一单词的不同形式
-    lst1=lst
-    for index,word in enumerate(lst):
-        change_word=''
-        if word.endswith('ies'): #语法条件匹配
-            change_word=word[:-3]+'y'
-        elif word.endswith('es'):
-            change_word=word[:-2]
-        elif word.endswith('s'):
-            change_word=word[:-1]
-        elif word.endswith('ed'):
-            change_word=word[:-2]
-        elif word.endswith('en'):
-            change_word=word[:-2] + 'an'
-        else:
-            pass
-        for word2 in d:
-            if change_word==word2:
-                lst1[index]=change_word
-                break
-    return lst1
+def stem_words(list_of_words): # It reduces words to the root word (eg. ate, eaten -> eat; leaves, leaf -> leaf)
+    wnl = WordNetLemmatizer()
+    lst1 = [wnl.lemmatize(w) for w in list_of_words]
+    return [wnl.lemmatize(w, pos='v') for w in lst1] # stem by verb: 'v' represents verb
+
 
 def user_difficulty_level(d_user, d):
     d_user2 = revert_dict(d_user) # key is date, and value is a list of words added in that date
@@ -104,7 +89,7 @@ def user_difficulty_level(d_user, d):
     for date in sorted(d_user2.keys(), reverse=True): # most recently added words are more important while determining user's level
         lst = d_user2[date] # a list of words
         #print(lst)
-        lst=combine_words_through_grammar(lst,d) #合并单词的不同形式
+        lst = stem_words(lst) # this call returns a list of words reduced to root word
         #print(lst)
         lst2 = [] # a list of tuples, (word, difficulty level)
         for word in lst:
@@ -116,7 +101,7 @@ def user_difficulty_level(d_user, d):
         for t in lst3:
             word = t[0]
             hard = t[1]
-            #print('WORD %s HARD %4.2f' % (word, hard))
+            print('WORD %s HARD %4.2f' % (word, hard))
            geometric = geometric * (hard)
             count += 1
             if count >= 10: