EnglishPal/app/vocabulary.py

137 lines
4.5 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

###########################################################################
# Copyright 2019 (C) Hui Lan <hui.lan@cantab.net>
# Written permission must be obtained from the author for commercial uses.
###########################################################################
# Purpose: compute difficulty level of a English text
import pickle
import math
from wordfreqCMD import remove_punctuation, freq, sort_in_descending_order, sort_in_ascending_order, map_percentages_to_levels
import snowballstemmer
def load_record(pickle_fname):
with open(pickle_fname, 'rb') as f:
d = pickle.load(f)
return d
ENGLISH_WORD_DIFFICULTY_DICT = {}
def convert_test_type_to_difficulty_level(d):
"""
对原本的单词库中的单词进行难度评级
:param d: 存储了单词库pickle文件中的单词的字典
:return:
"""
result = {}
L = list(d.keys()) # in d, we have test types (e.g., CET4,CET6,BBC) for each word
for k in L:
if 'CET4' in d[k]:
result[k] = 4 # CET4 word has level 4
elif 'OXFORD3000' in d[k]:
result[k] = 5
elif 'CET6' in d[k] or 'GRADUATE' in d[k]:
result[k] = 6
elif 'OXFORD5000' in d[k] or 'IELTS' in d[k]:
result[k] = 7
elif 'BBC' in d[k]:
result[k] = 8
global ENGLISH_WORD_DIFFICULTY_DICT
ENGLISH_WORD_DIFFICULTY_DICT = result
return result # {'apple': 4, ...}
def get_difficulty_level_for_user(d1, d2):
"""
d2 来自于词库的35511个已标记单词
d1 用户不会的词
在d2的后面添加单词没有新建一个新的字典
"""
# TODO: convert_test_type_to_difficulty_level() should not be called every time. Each word's difficulty level should be pre-computed.
if ENGLISH_WORD_DIFFICULTY_DICT == {}:
d2 = convert_test_type_to_difficulty_level(d2) # 根据d2的标记评级{'apple': 4, 'abandon': 4, ...}
else:
d2 = ENGLISH_WORD_DIFFICULTY_DICT
stemmer = snowballstemmer.stemmer('english')
for k in d1: # 用户的词
if k in d2: # 如果用户的词以原型的形式存在于词库d2中
continue # 无需评级,跳过
else:
stem = stemmer.stemWord(k)
if stem in d2: # 如果用户的词的词根存在于词库d2的词根库中
d2[k] = d2[stem] # 按照词根进行评级
else:
d2[k] = 3 # 如果k的词根都不在那么就当认为是3级
return d2
def revert_dict(d):
'''
In d, word is the key, and value is a list of dates.
In d2 (the returned value of this function), time is the key, and the value is a list of words picked at that time.
'''
d2 = {}
for k in d:
if type(d[k]) is list: # d[k] is a list of dates.
lst = d[k]
elif type(d[
k]) is int: # for backward compatibility. d was sth like {'word':1}. The value d[k] is not a list of dates, but a number representing how frequent this word had been added to the new word book.
freq = d[k]
lst = freq * ['2021082019'] # why choose this date? No particular reasons. I fix the bug in this date.
for time_info in lst:
date = time_info[:10] # until hour
if not date in d2:
d2[date] = [k]
else:
d2[date].append(k)
return d2
class VocabularyLevelEstimator:
_test = load_record('words_and_tests.p') # map a word to the sources where it appears
@property
def level(self):
total = 0.0 # TODO: need to compute this number
num = 1
for word in self.word_lst:
num += 1
if word in self._test:
print(f'{word} : {self._test[word]}')
else:
print(f'{word}')
return total/num
class UserVocabularyLevel(VocabularyLevelEstimator):
def __init__(self, d):
self.d = d
self.word_lst = list(d.keys())
# just look at the most recently-added words
class ArticleVocabularyLevel(VocabularyLevelEstimator):
def __init__(self, content):
self.content = content
self.word_lst = content.lower().split()
# select the 10 most difficult words
if __name__ == '__main__':
d = load_record('frequency_mrlan85.pickle')
print(d)
user = UserVocabularyLevel(d)
print(user.level) # level is a property
article = ArticleVocabularyLevel('This is an interesting article')
print(article.level)