From 4da4ec415f50f7f6e2ed106104a3a2174813b3be Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?=E5=8C=85=E6=9C=88=E7=90=B3?= <1913640604@qq.com>
Date: Wed, 10 May 2023 19:27:33 +0800
Subject: [PATCH] =?UTF-8?q?=E6=9B=B4=E6=96=B0=20'app/difficulty.py'?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 app/difficulty.py | 34 ++++++++++++++++++++++++++++++++++
 1 file changed, 34 insertions(+)

diff --git a/app/difficulty.py b/app/difficulty.py
index 50aa179..da13b55 100644
--- a/app/difficulty.py
+++ b/app/difficulty.py
@@ -7,6 +7,9 @@
 
 import pickle
 import math
+from nltk import word_tokenize,pos_tag
+from nltk.corpus import wordnet
+from nltk.stem import WordNetLemmatizer
 from wordfreqCMD import remove_punctuation, freq, sort_in_descending_order, sort_in_ascending_order
 
 
@@ -74,6 +77,36 @@ def revert_dict(d):
             d2[date].append(k)
     return d2
 
+def get_wordnet_pos(tag):
+    """Map a POS tag (as produced by pos_tag) to a WordNet POS, or None."""
+    if tag.startswith('J'):
+        return wordnet.ADJ
+    elif tag.startswith('V'):
+        return wordnet.VERB
+    elif tag.startswith('N'):
+        return wordnet.NOUN
+    elif tag.startswith('R'):
+        return wordnet.ADV
+    else:
+        return None
+
+
+def combine_words_through_grammar(lst, d):
+    """Replace each word by its lemma when the lemma is a known word.
+
+    lst: a list of words
+    d: known words; only membership (``in``) is tested
+    Returns a new list, same length as lst; lst itself is left unmodified.
+    """
+    wnl = WordNetLemmatizer()
+    result = []
+    for word, tag in pos_tag(lst):  # pos_tag yields (word, tag) pairs
+        # Tags that are not adjective/verb/noun/adverb are treated as nouns.
+        lemma = wnl.lemmatize(word, pos=get_wordnet_pos(tag) or wordnet.NOUN)
+        # Keep the original form unless its lemma is itself a known word.
+        result.append(lemma if lemma in d else word)
+    return result
+
 
 def user_difficulty_level(d_user, d):
     d_user2 = revert_dict(d_user) # key is date, and value is a list of words added in that date
@@ -81,6 +114,7 @@ def user_difficulty_level(d_user, d):
     geometric = 1
     for date in sorted(d_user2.keys(), reverse=True): # most recently added words are more important while determining user's level
         lst = d_user2[date] # a list of words
+        lst = combine_words_through_grammar(lst, d)  # merge different forms of the same word
         lst2 = [] # a list of tuples, (word, difficulty level)
         for word in lst:
             if word in d: