forked from mrlan/EnglishPal
vocabulary最终版
parent
cea015f18a
commit
a1255b2f3d
Binary file not shown.
|
@ -22,14 +22,6 @@ def is_english_word(word):
|
||||||
return bool(pattern.match(word))
|
return bool(pattern.match(word))
|
||||||
|
|
||||||
|
|
||||||
# 判断日期格式是否有效
|
|
||||||
def is_valid_datetime_string(date_string, format='%Y%m%d%H%M'):
|
|
||||||
try:
|
|
||||||
datetime.strptime(date_string, format)
|
|
||||||
return True
|
|
||||||
except ValueError:
|
|
||||||
return False
|
|
||||||
|
|
||||||
|
|
||||||
# 去除非单词字符
|
# 去除非单词字符
|
||||||
def remove_non_words(input_string):
|
def remove_non_words(input_string):
|
||||||
|
@ -41,19 +33,17 @@ def remove_non_words(input_string):
|
||||||
# 主类:词汇水平估算器
|
# 主类:词汇水平估算器
|
||||||
class VocabularyLevelEstimator:
|
class VocabularyLevelEstimator:
|
||||||
# 词汇表(单词:【"雅思","高考"...】)
|
# 词汇表(单词:【"雅思","高考"...】)
|
||||||
_test = load_record('static\words_and_tests.p') # 词汇到测试来源的映射
|
_test = load_record('static/words_and_tests.p') # 词汇到测试来源的映射
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def level(self):
|
def level(self):
|
||||||
total = 0.0 # 总评分
|
total = 0.0
|
||||||
num = 0 # 计算的单词数
|
num = 0
|
||||||
for word in self.word_lst:
|
for word in self.word_lst:
|
||||||
num += 1
|
|
||||||
if word in self._test:
|
if word in self._test:
|
||||||
print(f'{word} : {self._test[word]}') # 输出单词及其来源
|
total += self._test[word] # Assuming _test[word] returns a difficulty score
|
||||||
else:
|
num += 1
|
||||||
print(f'{word}') # 输出没有评分的单词
|
return total / num if num > 0 else 0.0
|
||||||
return total / num if num else 0.0 # 返回平均值
|
|
||||||
|
|
||||||
def get_word_level(self, word):
|
def get_word_level(self, word):
|
||||||
# 常见高频词汇列表
|
# 常见高频词汇列表
|
||||||
|
@ -91,15 +81,13 @@ class UserVocabularyLevel(VocabularyLevelEstimator):
|
||||||
self.filter_user_frequency()
|
self.filter_user_frequency()
|
||||||
|
|
||||||
def filter_user_frequency(self):
|
def filter_user_frequency(self):
|
||||||
# 过滤出最近一周的生词,用于计算用户词汇水平
|
|
||||||
stemmer = snowballstemmer.stemmer('english')
|
stemmer = snowballstemmer.stemmer('english')
|
||||||
range_datetime = (datetime.now() - timedelta(days=7)).strftime('%Y%m%d%H%M')
|
|
||||||
|
|
||||||
self.filtered_frequency = []
|
self.filtered_frequency = []
|
||||||
|
|
||||||
for word in self.d:
|
for word in self.d:
|
||||||
if is_english_word(word) and is_valid_datetime_string(self.d[word][0]):
|
if is_english_word(word) :
|
||||||
if self.d[word][0] > range_datetime and word not in self.filtered_frequency:
|
if word not in self.filtered_frequency:
|
||||||
self.filtered_frequency.append(stemmer.stemWord(word))
|
self.filtered_frequency.append(stemmer.stemWord(word))
|
||||||
|
|
||||||
@property
|
@property
|
||||||
|
@ -169,3 +157,8 @@ if __name__ == '__main__':
|
||||||
with open('test/article_test.p', 'rb') as file:
|
with open('test/article_test.p', 'rb') as file:
|
||||||
loaded_data = pickle.load(file)
|
loaded_data = pickle.load(file)
|
||||||
print(loaded_data)
|
print(loaded_data)
|
||||||
|
|
||||||
|
article1 = ArticleVocabularyLevel('source')
|
||||||
|
article2 = ArticleVocabularyLevel('open source')
|
||||||
|
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue