combine_words_through_grammar replaced with stem_words function, which uses the NLTK WordNet lemmatizer
parent
d6adc70d4f
commit
16ab48d162
|
@ -7,6 +7,7 @@
|
|||
|
||||
import pickle
|
||||
import math
|
||||
from nltk.stem import WordNetLemmatizer
|
||||
from wordfreqCMD import remove_punctuation, freq, sort_in_descending_order, sort_in_ascending_order
|
||||
|
||||
|
||||
|
@ -75,27 +76,11 @@ def revert_dict(d):
|
|||
return d2
|
||||
|
||||
|
||||
def combine_words_through_grammar(lst, d):
    """Merge inflected forms of a word into a base form that is known to ``d``.

    Each word is checked against a small table of English suffix rules
    (``-ies`` -> ``-y``, ``-es``, ``-s``, ``-ed``, ``-en`` -> ``-an``).  The
    first rule whose result is a member of ``d`` wins; a word with no
    matching rule, or whose candidates are all unknown to ``d``, is kept
    unchanged.

    Args:
        lst: iterable of words (strings) to normalise.
        d: container of known base words; only membership (``in``) is used.

    Returns:
        A new list with the same length and order as ``lst``.  The input
        list is not modified.
    """
    # (suffix, number of trailing characters to drop, text to append).
    # Order matters: more specific suffixes come first, and later rules act
    # as fallbacks (e.g. "apples" -> "appl" is unknown, so "apple" is tried).
    suffix_rules = (
        ('ies', 3, 'y'),
        ('es', 2, ''),
        ('s', 1, ''),
        ('ed', 2, ''),
        ('en', 2, 'an'),
    )

    merged = []
    for word in lst:
        base_form = word
        for suffix, cut, tail in suffix_rules:
            if not word.endswith(suffix):
                continue
            candidate = word[:-cut] + tail
            # Skip empty candidates so a word is never replaced by ''.
            if candidate and candidate in d:
                base_form = candidate
                break
        merged.append(base_form)
    return merged
|
||||
def stem_words(list_of_words):
    """Reduce every word in ``list_of_words`` to a root form.

    Each word is lemmatized twice with NLTK's ``WordNetLemmatizer``: first
    with the lemmatizer's default part of speech, then again as a verb
    (``pos='v'``).  Intended effect, per the original author's note:
    ate, eaten -> eat; leaves, leaf -> leaf.

    Returns a new list with one entry per input word, in the same order.
    """
    lemmatizer = WordNetLemmatizer()
    roots = []
    for token in list_of_words:
        default_lemma = lemmatizer.lemmatize(token)
        # Second pass treats the result as a verb: 'v' represents verb.
        roots.append(lemmatizer.lemmatize(default_lemma, pos='v'))
    return roots
|
||||
|
||||
|
||||
def user_difficulty_level(d_user, d):
|
||||
d_user2 = revert_dict(d_user) # key is date, and value is a list of words added in that date
|
||||
|
@ -104,7 +89,7 @@ def user_difficulty_level(d_user, d):
|
|||
for date in sorted(d_user2.keys(), reverse=True): # most recently added words are more important while determining user's level
|
||||
lst = d_user2[date] # a list of words
|
||||
#print(lst)
|
||||
lst=combine_words_through_grammar(lst,d) #合并单词的不同形式
|
||||
lst = stem_words(lst) # this call returns a list of words reduced to root word
|
||||
#print(lst)
|
||||
lst2 = [] # a list of tuples, (word, difficulty level)
|
||||
for word in lst:
|
||||
|
@ -116,7 +101,7 @@ def user_difficulty_level(d_user, d):
|
|||
for t in lst3:
|
||||
word = t[0]
|
||||
hard = t[1]
|
||||
#print('WORD %s HARD %4.2f' % (word, hard))
|
||||
print('WORD %s HARD %4.2f' % (word, hard))
|
||||
geometric = geometric * (hard)
|
||||
count += 1
|
||||
if count >= 10:
|
||||
|
|
Loading…
Reference in New Issue