修改后的englishpal

Bug585-chenxiao
陈晓 2025-05-29 14:49:53 +08:00
parent d9512c929b
commit 42bd77f2eb
2 changed files with 489 additions and 0 deletions

View File

@ -0,0 +1,130 @@
import pickle
import math
import snowballstemmer
from wordfreqCMD import remove_punctuation, freq, sort_in_descending_order, sort_in_ascending_order, map_percentages_to_levels
class VocabularyLevelEstimator:
def __init__(self):
self.ENGLISH_WORD_DIFFICULTY_DICT = {}
def load_record(self, pickle_fname):
with open(pickle_fname, 'rb') as f:
return pickle.load(f)
def convert_test_type_to_difficulty_level(self, d):
"""
对原本的单词库中的单词进行难度评级
:param d: 存储了单词库pickle文件中的单词的字典
:return:
"""
result = {}
L = list(d.keys())
for k in L:
if 'CET4' in d[k]:
result[k] = 4
elif 'OXFORD3000' in d[k]:
result[k] = 5
elif 'CET6' in d[k] or 'GRADUATE' in d[k]:
result[k] = 6
elif 'OXFORD5000' in d[k] or 'IELTS' in d[k]:
result[k] = 7
elif 'BBC' in d[k]:
result[k] = 8
self.ENGLISH_WORD_DIFFICULTY_DICT = result
return result
def get_difficulty_level_for_user(self, d1, d2):
if not self.ENGLISH_WORD_DIFFICULTY_DICT:
d2 = self.convert_test_type_to_difficulty_level(d2)
else:
d2 = self.ENGLISH_WORD_DIFFICULTY_DICT
stemmer = snowballstemmer.stemmer('english')
for k in d1:
if k in d2:
continue
else:
stem = stemmer.stemWord(k)
if stem in d2:
d2[k] = d2[stem]
else:
d2[k] = 3
return d2
def revert_dict(self, d):
d2 = {}
for k in d:
lst = d[k] if isinstance(d[k], list) else d[k] * ['2021082019']
for time_info in lst:
date = time_info[:10]
if date not in d2:
d2[date] = [k]
else:
d2[date].append(k)
return d2
def user_difficulty_level(self, d_user, d, calc_func=0):
if calc_func != 0:
d_user2 = self.revert_dict(d_user)
geometric = 0
count = 0
for date in sorted(d_user2.keys(), reverse=True):
lst = d_user2[date]
lst2 = [(word, d[word]) for word in lst if word in d]
lst3 = sort_in_ascending_order(lst2)
for t in lst3:
word = t[0]
hard = t[1]
geometric += math.log(hard)
count += 1
return math.exp(geometric / max(count, 1))
d_user2 = self.revert_dict(d_user)
count = {}
percentages = {}
total = 0
for date in d_user2.keys():
lst = d_user2[date]
for word in lst:
if word in d:
count[d[word]] = count.get(d[word], 0) + 1
total += 1
if total == 0:
return 1
for k in count.keys():
percentages[k] = count[k] / total
weight = map_percentages_to_levels(percentages)
sum_ = sum(weight[k] * k for k in weight.keys())
return sum_
def text_difficulty_level(self, s, d):
s = remove_punctuation(s)
L = freq(s)
lst = []
stop_words = {'the': 1, 'and': 1, 'of': 1, 'to': 1, 'what': 1, 'in': 1, 'there': 1, 'when': 1, 'them': 1, 'would': 1, 'will': 1, 'out': 1, 'his': 1, 'mr': 1, 'that': 1, 'up': 1, 'more': 1, 'your': 1, 'it': 1, 'now': 1, 'very': 1, 'then': 1, 'could': 1, 'he': 1, 'any': 1, 'some': 1, 'with': 1, 'into': 1, 'you': 1, 'our': 1, 'man': 1, 'other': 1, 'time': 1, 'was': 1, 'than': 1, 'know': 1, 'about': 1, 'only': 1, 'like': 1, 'how': 1, 'see': 1, 'is': 1, 'before': 1, 'such': 1, 'little': 1, 'two': 1, 'its': 1, 'as': 1, 'these': 1, 'may': 1, 'much': 1, 'down': 1, 'for': 1, 'well': 1, 'should': 1, 'those': 1, 'after': 1, 'same': 1, 'must': 1, 'say': 1, 'first': 1, 'again': 1, 'us': 1, 'great': 1, 'where': 1, 'being': 1, 'come': 1, 'over': 1, 'good': 1, 'himself': 1, 'am': 1, 'never': 1, 'on': 1, 'old': 1, 'here': 1, 'way': 1, 'at': 1, 'go': 1, 'upon': 1, 'have': 1, 'had': 1, 'without': 1, 'my': 1, 'day': 1, 'be': 1, 'but': 1, 'though': 1, 'from': 1, 'not': 1, 'too': 1, 'another': 1, 'this': 1, 'even': 1, 'still': 1, 'her': 1, 'yet': 1, 'under': 1, 'by': 1, 'let': 1, 'just': 1, 'all': 1, 'because': 1, 'we': 1, 'always': 1, 'off': 1, 'yes': 1, 'so': 1, 'while': 1, 'why': 1, 'which': 1, 'me': 1, 'are': 1, 'or': 1, 'no': 1, 'if': 1, 'an': 1, 'also': 1, 'thus': 1, 'who': 1, 'cannot': 1, 'she': 1, 'whether': 1}
for x in L:
word = x[0]
if word not in stop_words and word in d:
lst.append((word, d[word]))
lst2 = sort_in_descending_order(lst)
geometric = 1
count = 0
for t in lst2:
word = t[0]
hard = t[1]
geometric *= hard
count += 1
if count >= 20:
return geometric ** (1 / count)
return geometric ** (1 / max(count, 1))

359
app/test_vocabulary.py Normal file
View File

@ -0,0 +1,359 @@
import pytest
from app.VocabularyLevelEstimator import VocabularyLevelEstimator
def test_text_difficulty_level_empty_content():
    """Test the text difficulty level of empty content (expected: exactly 1.0)."""
    word_levels = {'source': 3}
    level = VocabularyLevelEstimator().text_difficulty_level('', word_levels)
    assert level == 1.0
def test_text_difficulty_level_punctuation_only():
    """Test the text difficulty level of punctuation-only text (expected: exactly 1.0)."""
    word_levels = {'source': 3}
    level = VocabularyLevelEstimator().text_difficulty_level(',', word_levels)
    assert level == 1.0
def test_text_difficulty_level_digit_only():
    """Test the text difficulty level of digit-only text (expected: exactly 1.0)."""
    word_levels = {'source': 3}
    level = VocabularyLevelEstimator().text_difficulty_level('1', word_levels)
    assert level == 1.0
def test_text_difficulty_level_single_word():
    """Test the text difficulty level of a single rated word (expected: within 2..4)."""
    word_levels = {'source': 3}
    level = VocabularyLevelEstimator().text_difficulty_level('source', word_levels)
    assert 2 <= level <= 4
def test_text_difficulty_level_subset_vs_superset():
    """Compare the difficulty level of a text with that of a superset of it.

    The superset 'open source' is expected to rate strictly higher than
    'source' alone.

    NOTE(review): the added word 'open' (level 2) is easier than 'source'
    (level 3) -- confirm the estimator is meant to rate the superset higher.
    """
    word_levels = {'source': 3, 'open': 2}
    rater = VocabularyLevelEstimator()
    subset_level = rater.text_difficulty_level('source', word_levels)
    superset_level = rater.text_difficulty_level('open source', word_levels)
    assert subset_level < superset_level
def test_text_difficulty_level_multiple_words():
    """Test the text difficulty level of a multi-word title (expected: within 2..3)."""
    word_levels = {
        'producing': 3, 'open': 2, 'source': 3,
        'software': 3, 'how': 2, 'run': 2,
        'successful': 4, 'free': 2, 'project': 3,
    }
    title = 'Producing Open Source Software - How to Run a Successful Free Software Project'
    level = VocabularyLevelEstimator().text_difficulty_level(title, word_levels)
    assert 2 <= level <= 3
def test_text_difficulty_level_short_paragraph():
    """Test the text difficulty level of a short paragraph (expected: within 2..3)."""
    word_levels = {
        'parties': 4, 'people': 2, 'work': 2, 'open': 2, 'source': 3,
        'software': 3, 'linux': 4, 'money': 2, 'tool': 3, 'monopoly': 6,
    }
    paragraph = 'At parties, people no longer give me a blank stare when I tell them I work in open source software. "Oh, yes — like Linux?" they say. I nod eagerly in agreement. "Yes, exactly! That\'s what I do." It\'s nice not to be completely fringe anymore. In the past, the next question was usually fairly predictable: "How do you make money doing that?" To answer, I\'d summarize the economics of free software: that there are organizations in whose interest it is to have certain software exist, but that they don\'t need to sell copies, they just want to make sure the software is available and maintained, as a tool instead of as a rentable monopoly.'
    level = VocabularyLevelEstimator().text_difficulty_level(paragraph, word_levels)
    assert 2 <= level <= 3
def test_text_difficulty_level_medium_paragraph():
    """Test the text difficulty level of a medium-length paragraph (expected: within 5..7)."""
    estimator = VocabularyLevelEstimator()
    # Each key appears once: the original literal repeated 'relations' with
    # the same value, so the resulting dict is unchanged.
    test_dict = {
        'considering': 5,
        'origin': 4,
        'species': 6,
        'conceivable': 7,
        'naturalist': 7,
        'reflecting': 6,
        'mutual': 5,
        'affinities': 7,
        'organic': 6,
        'embryological': 8,
        'relations': 5,
        'geographical': 7,
        'distribution': 6,
        'geological': 7,
        'succession': 6,
        'conclusion': 5,
        'independently': 6,
        'descended': 5,
        'varieties': 5,
        'nevertheless': 6,
        'unsatisfactory': 6,
        'innumerable': 7,
        'inhabiting': 6,
        'modified': 5,
        'perfection': 5,
        'structure': 5,
        'coadaptation': 8,
        'admiration': 5,
        'external': 5,
        'conditions': 4,
        'climate': 4,
        'variation': 5,
        'limited': 4,
        'preposterous': 7,
        'attribute': 5,
        'woodpecker': 7,
        'adapted': 5,
        'insects': 4,
        'bark': 4,
        'trees': 3,
        'misseltoe': 8,
        'nourishment': 6,
        'seeds': 3,
        'transported': 5,
        'birds': 3,
        'flowers': 3,
        'separate': 4,
        'sexes': 5,
        'absolutely': 5,
        'requiring': 5,
        'agency': 5,
        'pollen': 7,
        'parasite': 6,
        'distinct': 5,
        'effects': 4,
        'habit': 4,
        'volition': 7,
        'plant': 3,
        'itself': 3,
    }
    text = 'In considering the Origin of Species, it is quite conceivable that a naturalist, reflecting on the mutual affinities of organic beings, on their embryological relations, their geographical distribution, geological succession, and other such facts, might come to the conclusion that each species had not been independently created, but had descended, like varieties, from other species. Nevertheless, such a conclusion, even if well founded, would be unsatisfactory, until it could be shown how the innumerable species inhabiting this world have been modified, so as to acquire that perfection of structure and coadaptation which most justly excites our admiration. Naturalists continually refer to external conditions, such as climate, food, etc., as the only possible cause of variation. In one very limited sense, as we shall hereafter see, this may be true; but it is preposterous to attribute to mere external conditions, the structure, for instance, of the woodpecker, with its feet, tail, beak, and tongue, so admirably adapted to catch insects under the bark of trees. In the case of the misseltoe, which draws its nourishment from certain trees, which has seeds that must be transported by certain birds, and which has flowers with separate sexes absolutely requiring the agency of certain insects to bring pollen from one flower to the other, it is equally preposterous to account for the structure of this parasite, with its relations to several distinct organic beings, by the effects of external conditions, or of habit, or of the volition of the plant itself.'
    assert 5 <= estimator.text_difficulty_level(text, test_dict) <= 7
def test_text_difficulty_level_long_paragraph():
    """Test the text difficulty level of a long paragraph (expected: within 6..8)."""
    estimator = VocabularyLevelEstimator()
    # Each key appears once: the original literal repeated several keys
    # (always with the same value), so the resulting dict is unchanged.
    test_dict = {
        'facts': 3,
        'accord': 5,
        'theory': 4,
        'fixed': 4,
        'law': 3,
        'development': 5,
        'causing': 5,
        'inhabitants': 6,
        'country': 3,
        'change': 3,
        'abruptly': 6,
        'simultaneously': 7,
        'equal': 4,
        'degree': 4,
        'process': 4,
        'modification': 6,
        'extremely': 5,
        'slow': 3,
        'variability': 7,
        'independent': 5,
        'whether': 4,
        'taken': 3,
        'advantage': 4,
        'natural': 4,
        'selection': 5,
        'variations': 6,
        'accumulated': 7,
        'greater': 4,
        'lesser': 5,
        'amount': 3,
        'depends': 4,
        'complex': 5,
        'contingencies': 8,
        'beneficial': 6,
        'nature': 4,
        'power': 3,
        'intercrossing': 8,
        'rate': 3,
        'breeding': 5,
        'slowly': 4,
        'changing': 4,
        'physical': 5,
        'especially': 5,
        'competition': 5,
        'surprising': 5,
        'retain': 5,
        'identical': 6,
        'form': 3,
        'longer': 4,
        'geographical': 7,
        'distribution': 6,
        'instance': 4,
        'land': 3,
        'shells': 4,
        'coleopterous': 9,
        'insects': 4,
        'madeira': 7,
        'differ': 5,
        'considerably': 6,
        'nearest': 4,
        'allies': 6,
        'continent': 5,
        'europe': 4,
        'marine': 6,
        'birds': 3,
        'unaltered': 6,
        'understand': 4,
        'apparently': 6,
        'quicker': 5,
        'terrestrial': 8,
        'highly': 4,
        'organised': 7,
        'productions': 6,
        'compared': 5,
        'lower': 3,
        'relations': 5,
        'higher': 4,
        'beings': 4,
        'organic': 6,
        'inorganic': 7,
        'conditions': 4,
        'explained': 5,
        'former': 4,
        'chapter': 4,
        'modified': 5,
        'improved': 5,
        'principle': 5,
        'all-important': 7,
        'organism': 7,
        'liable': 6,
        'exterminated': 7,
        'region': 4,
        'intervals': 6,
        'time': 3,
        'extinct': 6,
    }
    # The text was a single-quoted literal broken across two source lines
    # (an unterminated string); it is written as two adjacent literals that
    # concatenate into one string, joined by the existing trailing space.
    text = (
        'These several facts accord well with my theory. I believe in no fixed law of development, causing all the inhabitants of a country to change abruptly, or simultaneously, or to an equal degree. The process of modification must be extremely slow. The variability of each species is quite independent of that of all others. Whether such variability be taken advantage of by natural selection, and whether the variations be accumulated to a greater or lesser amount, thus causing a greater or lesser amount of modification in the varying species, depends on many complex contingencies,—on the variability being of a beneficial nature, on the power of intercrossing, on the rate of breeding, on the slowly changing physical conditions of the country, and more especially on the nature of the other inhabitants with which the varying species comes into competition. Hence it is by no means surprising that one species should retain the same identical form much longer than others; or, if changing, that it should change less. We see the same fact in geographical distribution; for instance, in the land-shells and coleopterous insects of Madeira having come to differ considerably from their nearest allies on the continent of Europe, whereas the marine shells and birds have remained unaltered. We can perhaps understand the apparently quicker rate of change in terrestrial and in more highly organised productions compared with marine and lower productions, by the more complex relations of the higher beings to their organic and inorganic conditions of life, as explained in a former chapter. When many of the inhabitants of a country have become modified and improved, we can understand, on the principle of competition, and on that of the many all-important relations of organism to organism, that any form which does not become in some degree modified and improved, will be liable to be exterminated. '
        'Hence we can see why all the species in the same region do at last, if we look to wide enough intervals of time, become modified; for those which do not change will become extinct.'
    )
    assert 6 <= estimator.text_difficulty_level(text, test_dict) <= 8
def test_user_difficulty_level_empty_dictionary():
    """Test the user difficulty level for an empty user dictionary (expected: 1)."""
    word_levels = {'source': 3}
    level = VocabularyLevelEstimator().user_difficulty_level({}, word_levels)
    assert level == 1
def test_user_difficulty_level_one_simple_word():
    """Test the user difficulty level for one simple word (expected: in (0, 4])."""
    word_levels = {'simple': 2}
    user_words = {'simple': ['202408050930']}
    level = VocabularyLevelEstimator().user_difficulty_level(user_words, word_levels)
    assert 0 < level <= 4
def test_user_difficulty_level_invalid_word():
    """Test the user difficulty level for a word that has no rating (expected: 1)."""
    word_levels = {'source': 3}
    user_words = {'xyz': ['202408050930']}
    level = VocabularyLevelEstimator().user_difficulty_level(user_words, word_levels)
    assert level == 1
def test_user_difficulty_level_one_hard_word():
    """Test the user difficulty level for one hard word (expected: within 5..8)."""
    word_levels = {'pasture': 6}
    user_words = {'pasture': ['202408050930']}
    level = VocabularyLevelEstimator().user_difficulty_level(user_words, word_levels)
    assert 5 <= level <= 8
def test_user_difficulty_level_multiple_words():
    """Test the user difficulty level for many hard words (expected: within 6..8)."""
    word_levels = {
        'sessile': 8, 'putrid': 7, 'prodigal': 7, 'presumptuous': 8, 'prehension': 8,
        'pied': 6, 'pedunculated': 9, 'pasture': 6, 'parturition': 8, 'ovigerous': 9,
        'ova': 7, 'orifice': 7, 'obliterate': 7, 'niggard': 8, 'neuter': 6,
        'locomotion': 7, 'lineal': 7, 'glottis': 8, 'frivolous': 7, 'frena': 8,
        'flotation': 7, 'ductus': 8, 'dorsal': 7, 'crustacean': 8, 'cornea': 7,
        'contrivance': 7, 'collateral': 7, 'cirriped': 9, 'canon': 6, 'branchiae': 9,
        'auditory': 7, 'articulata': 9, 'alimentary': 8, 'adduce': 7, 'aberration': 8,
    }
    # Every rated word is recorded for the user with the same timestamp.
    user_words = {word: ['202408050930'] for word in word_levels}
    level = VocabularyLevelEstimator().user_difficulty_level(user_words, word_levels)
    assert 6 <= level <= 8
def test_user_difficulty_level_consider_only_most_recent_words_difficult_words_most_recent():
    """Test the user difficulty level when the hard words carry the newest timestamps.

    Expected: within 3..4.
    """
    word_levels = {'pasture': 6, 'putrid': 7, 'frivolous': 7, 'simple': 2, 'apple': 2}
    # Timestamps run from newest (hard words) to oldest (easy words).
    user_words = {
        'pasture': ['202408050930'],
        'putrid': ['202408040000'],
        'frivolous': ['202408030000'],
        'simple': ['202408020000'],
        'apple': ['202408010000'],
    }
    level = VocabularyLevelEstimator().user_difficulty_level(user_words, word_levels)
    assert 3 <= level <= 4
def test_user_difficulty_level_consider_only_most_recent_words_easy_words_most_recent():
    """Test the user difficulty level when the easy words carry the newest timestamps.

    Expected: within 3..4.
    """
    word_levels = {
        'simple': 2, 'apple': 2, 'happy': 2,
        'pasture': 6, 'putrid': 7, 'dearth': 6,
    }
    # Timestamps run from newest (easy words) to oldest (hard words).
    user_words = {
        'simple': ['202408050930'],
        'apple': ['202408040000'],
        'happy': ['202408030000'],
        'pasture': ['202408020000'],
        'putrid': ['202408010000'],
        'dearth': ['202407310000'],
    }
    level = VocabularyLevelEstimator().user_difficulty_level(user_words, word_levels)
    assert 3 <= level <= 4