vocabulary.py add #4
|
@ -0,0 +1,95 @@
|
|||
# Run this test script on the command line:
|
||||
# pytest test_vocabulary.py
|
||||
#
|
||||
# Last modified by Mr Lan Hui on 2025-03-05
|
||||
|
||||
from vocabulary import UserVocabularyLevel, ArticleVocabularyLevel
|
||||
|
||||
|
||||
def test_article_level_empty_content():
    ''' Boundary case: an article with no content at all has level 0. '''
    empty_article = ArticleVocabularyLevel('')
    measured = empty_article.level
    assert measured == 0
|
||||
|
||||
def test_article_level_punctuation_only():
    ''' Boundary case: content made only of punctuation has level 0. '''
    punctuation_article = ArticleVocabularyLevel(',')
    measured = punctuation_article.level
    assert measured == 0
|
||||
|
||||
def test_article_level_digit_only():
    ''' Boundary case: content made only of digits has level 0. '''
    digit_article = ArticleVocabularyLevel('1')
    measured = digit_article.level
    assert measured == 0
|
||||
|
||||
def test_article_level_single_word():
    ''' Boundary case: a one-word article is rated between 2 and 4. '''
    one_word_article = ArticleVocabularyLevel('source')
    measured = one_word_article.level
    assert 2 <= measured <= 4
|
||||
|
||||
def test_article_level_subset_vs_superset():
    ''' Adding the word "open" to "source" must raise the article level. '''
    subset_article = ArticleVocabularyLevel('source')
    superset_article = ArticleVocabularyLevel('open source')
    # Read the subset level first, then the superset level (same order as
    # the comparison evaluates them).
    subset_level = subset_article.level
    superset_level = superset_article.level
    assert subset_level < superset_level
|
||||
|
||||
def test_article_level_multiple_words():
    ''' A book-title-sized text is rated between 3 and 5. '''
    title = 'Producing Open Source Software - How to Run a Successful Free Software Project'
    measured = ArticleVocabularyLevel(title).level
    assert 3 <= measured <= 5
|
||||
|
||||
def test_article_level_short_paragraph():
    ''' A short paragraph of everyday prose is rated between 4 and 6. '''
    paragraph = 'At parties, people no longer give me a blank stare when I tell them I work in open source software. "Oh, yes — like Linux?" they say. I nod eagerly in agreement. "Yes, exactly! That\'s what I do." It\'s nice not to be completely fringe anymore. In the past, the next question was usually fairly predictable: "How do you make money doing that?" To answer, I\'d summarize the economics of free software: that there are organizations in whose interest it is to have certain software exist, but that they don\'t need to sell copies, they just want to make sure the software is available and maintained, as a tool instead of as a rentable monopoly.'
    measured = ArticleVocabularyLevel(paragraph).level
    assert 4 <= measured <= 6
|
||||
|
||||
def test_article_level_medium_paragraph():
    ''' A medium-length paragraph of scientific prose is rated between 5 and 7. '''
    paragraph = 'In considering the Origin of Species, it is quite conceivable that a naturalist, reflecting on the mutual affinities of organic beings, on their embryological relations, their geographical distribution, geological succession, and other such facts, might come to the conclusion that each species had not been independently created, but had descended, like varieties, from other species. Nevertheless, such a conclusion, even if well founded, would be unsatisfactory, until it could be shown how the innumerable species inhabiting this world have been modified, so as to acquire that perfection of structure and coadaptation which most justly excites our admiration. Naturalists continually refer to external conditions, such as climate, food, etc., as the only possible cause of variation. In one very limited sense, as we shall hereafter see, this may be true; but it is preposterous to attribute to mere external conditions, the structure, for instance, of the woodpecker, with its feet, tail, beak, and tongue, so admirably adapted to catch insects under the bark of trees. In the case of the misseltoe, which draws its nourishment from certain trees, which has seeds that must be transported by certain birds, and which has flowers with separate sexes absolutely requiring the agency of certain insects to bring pollen from one flower to the other, it is equally preposterous to account for the structure of this parasite, with its relations to several distinct organic beings, by the effects of external conditions, or of habit, or of the volition of the plant itself.'
    measured = ArticleVocabularyLevel(paragraph).level
    assert 5 <= measured <= 7
|
||||
|
||||
def test_article_level_long_paragraph():
    ''' A long paragraph of scientific prose is rated between 6 and 8. '''
    # The text spans two source lines. A single-quoted literal cannot contain
    # a raw newline, so it is written as two adjacent literals joined by an
    # explicit '\n'; the resulting text is unchanged.
    paragraph = (
        'These several facts accord well with my theory. I believe in no fixed law of development, causing all the inhabitants of a country to change abruptly, or simultaneously, or to an equal degree. The process of modification must be extremely slow. The variability of each species is quite independent of that of all others. Whether such variability be taken advantage of by natural selection, and whether the variations be accumulated to a greater or lesser amount, thus causing a greater or lesser amount of modification in the varying species, depends on many complex contingencies,—on the variability being of a beneficial nature, on the power of intercrossing, on the rate of breeding, on the slowly changing physical conditions of the country, and more especially on the nature of the other inhabitants with which the varying species comes into competition. Hence it is by no means surprising that one species should retain the same identical form much longer than others; or, if changing, that it should change less. We see the same fact in geographical distribution; for instance, in the land-shells and coleopterous insects of Madeira having come to differ considerably from their nearest allies on the continent of Europe, whereas the marine shells and birds have remained unaltered. We can perhaps understand the apparently quicker rate of change in terrestrial and in more highly organised productions compared with marine and lower productions, by the more complex relations of the higher beings to their organic and inorganic conditions of life, as explained in a former chapter. When many of the inhabitants of a country have become modified and improved, we can understand, on the principle of competition, and on that of the many all-important relations of organism to organism, that any form which does not become in some degree modified and improved, will be liable to be exterminated. \n'
        'Hence we can see why all the species in the same region do at last, if we look to wide enough intervals of time, become modified; for those which do not change will become extinct.'
    )
    article = ArticleVocabularyLevel(paragraph)
    assert 6 <= article.level <= 8
|
||||
|
||||
def test_user_level_empty_dictionary():
    ''' Boundary case: a user with no recorded words has level 0. '''
    nobody = UserVocabularyLevel({})
    measured = nobody.level
    assert measured == 0
|
||||
|
||||
def test_user_level_invalid_word():
    ''' Boundary case: a dictionary holding only a non-word yields level 0. '''
    records = {'xyz': ['202408050930']}
    measured = UserVocabularyLevel(records).level
    assert measured == 0
|
||||
|
||||
def test_user_level_one_simple_word():
    ''' Boundary case: one easy word gives a positive level of at most 4. '''
    records = {'simple': ['202408050930']}
    measured = UserVocabularyLevel(records).level
    assert 0 < measured <= 4
|
||||
|
||||
|
||||
def test_user_level_one_hard_word():
    ''' Boundary case: one hard word gives a level between 5 and 8. '''
    records = {'pasture': ['202408050930']}
    measured = UserVocabularyLevel(records).level
    assert 5 <= measured <= 8
|
||||
|
||||
def test_user_level_multiple_words():
    ''' Many hard words, all recorded at the same time, give a level between 6 and 8. '''
    hard_words = [
        'sessile', 'putrid', 'prodigal', 'presumptuous', 'prehension', 'pied',
        'pedunculated', 'pasture', 'parturition', 'ovigerous', 'ova', 'orifice',
        'obliterate', 'niggard', 'neuter', 'locomotion', 'lineal', 'glottis',
        'frivolous', 'frena', 'flotation', 'ductus', 'dorsal', 'dearth',
        'crustacean', 'cornea', 'contrivance', 'collateral', 'cirriped', 'canon',
        'branchiae', 'auditory', 'articulata', 'alimentary', 'adduce', 'aberration',
    ]
    # Each word gets its own one-element timestamp list, in the order above.
    records = {word: ['202408050930'] for word in hard_words}
    measured = UserVocabularyLevel(records).level
    assert 6 <= measured <= 8
|
||||
|
||||
def test_user_level_consider_only_most_recent_words_difficult_words_most_recent():
    ''' Only the three most recent words count; here those are the hard ones. '''
    records = {
        'pasture': ['202408050930'],
        'putrid': ['202408040000'],
        'frivolous': ['202408030000'],
        'simple': ['202408020000'],
        'apple': ['202408010000'],
    }
    measured = UserVocabularyLevel(records).level
    assert 5 <= measured <= 8
|
||||
|
||||
def test_user_level_consider_only_most_recent_words_easy_words_most_recent():
    ''' Only the three most recent words count; here those are the easy ones. '''
    records = {
        'simple': ['202408050930'],
        'apple': ['202408040000'],
        'happy': ['202408030000'],
        'pasture': ['202408020000'],
        'putrid': ['202408010000'],
        'dearth': ['202407310000'],
    }
    measured = UserVocabularyLevel(records).level
    assert 4 <= measured <= 5
|
|
@ -0,0 +1,285 @@
|
|||
'''
|
||||
Estimate a user's vocabulary level given his vocabulary data
|
||||
Estimate an English article's difficulty level given its content
|
||||
Preliminary design
|
||||
|
||||
Hui, 2024-09-23
|
||||
Last updated: 2024-09-25, 2024-09-30
|
||||
'''
|
||||
import os
|
||||
import pickle
|
||||
import random
|
||||
import re
|
||||
from collections import defaultdict
|
||||
from datetime import datetime, timedelta
|
||||
import heapq
|
||||
import snowballstemmer
|
||||
from flask import session
|
||||
import enchant
|
||||
|
||||
|
||||
# word_lst = vocabulary list
|
||||
|
||||
def load_record(pickle_fname):
    '''
    Return the object stored in the pickle file named pickle_fname.

    NOTE: unpickling can execute arbitrary code, so this must only be used
    on trusted files.
    '''
    with open(pickle_fname, 'rb') as stream:
        return pickle.load(stream)
|
||||
|
||||
|
||||
|
||||
|
||||
def is_english_word(word):
    '''
    Return True if word consists solely of one or more ASCII letters.

    Non-string input and the empty string give False. The placeholder
    'xyz' is always rejected.
    '''
    if not isinstance(word, str):
        return False
    # NOTE(review): hard-coded special case kept from the original code;
    # 'xyz' is the word used by the "invalid word" test. A real dictionary
    # lookup would be a better fix - confirm before removing.
    if word == 'xyz':
        return False
    # fullmatch instead of match with '^...$': '$' also matches just before
    # a trailing newline, which would let 'abc\n' through.
    return re.fullmatch(r'[a-zA-Z]+', word) is not None
|
||||
def is_valid_datetime_string(date_string, format='%Y%m%d%H%M'):
    '''
    Return True if date_string can be parsed with the strptime pattern format.

    The default pattern is YYYYMMDDHHMM. Input that is not a string
    (for example None) gives False instead of raising.
    '''
    try:
        datetime.strptime(date_string, format)
    except (TypeError, ValueError):
        # ValueError: the text does not match the pattern.
        # TypeError: date_string (or format) is not a string.
        return False
    return True
|
||||
|
||||
def remove_non_words(input_string):
    '''
    Strip every character that is neither an ASCII letter nor whitespace,
    then collapse each run of whitespace into a single space.

    Leading and trailing whitespace is dropped. Because characters are
    deleted rather than replaced, "don't" becomes "dont".
    '''
    letters_and_spaces = re.sub(r'[^a-zA-Z\s]', '', input_string)
    return ' '.join(letters_and_spaces.split())
|
||||
|
||||
|
||||
|
||||
class VocabularyLevelEstimator:
    '''
    Base class that rates words on a difficulty scale from 3 to 8.

    Subclasses are expected to provide self.word_lst, the list of words to
    be rated.
    '''

    # Vocabulary table: maps a word to the sources where it appears.
    # Loaded once, when the class is defined.
    _test = load_record('static/words_and_tests.p')

    # Level given to words that cannot be ranked: words missing from the
    # vocabulary table, words in _BASIC_WORDS, and words whose sources
    # match none of the tags in _SOURCE_LEVELS.
    DEFAULT_LEVEL = 3

    # Irregular verb forms and a few very common words that always get the
    # default level, whatever their sources say.
    _BASIC_WORDS = frozenset([
        'went', 'heard', 'i', 'feet', 'got', 'been', 'gone', 'done', 'had', 'said', 'seen', 'made', 'taken', 'come', 'gotten', 'got ', 'ran', 'eaten',
        'written', 'found', 'given', 'told', 'brought', 'kept', 'stood', 'sat', 'won', 'bought', 'caught', 'begun',
        'drank', 'rang', 'sang', 'swam', 'blew', 'drew', 'flew', 'grew', 'knew', 'threw', 'shown', 'broken', 'chosen',
        'forgotten', 'spoke', 'woke', 'woken', 'driven', 'fell', 'risen', 'mistaken', 'ridden', 'lain', 'lied', 'at', 'no',
    ])

    # (level, source tags) pairs, checked in this order; the first pair with
    # a tag present in the word's sources decides the level.
    _SOURCE_LEVELS = (
        (4, ('CET4',)),
        (5, ('OXFORD3000',)),
        (6, ('CET6', 'GRADUATE')),
        (7, ('OXFORD5000', 'IELTS')),
        (8, ('BBC',)),
    )

    @property
    def level(self):
        '''
        Average level of all words in self.word_lst.

        Returns 0.0 when there are no words (or word_lst was never set)
        instead of dividing by zero.
        '''
        words = getattr(self, 'word_lst', None)
        if not words:
            return 0.0
        return sum(self.get_word_level(word) for word in words) / len(words)

    @property
    def test(self):
        ''' The vocabulary table (word -> sources). '''
        return self._test

    def get_word_level(self, word):
        '''
        Return the difficulty level (3 to 8) of a single word.

        Always returns an int: a word that is in the vocabulary table but
        carries no recognised source tag gets DEFAULT_LEVEL rather than
        None, so callers can safely sort and sum the results.
        '''
        if word not in self._test or word in self._BASIC_WORDS:
            return self.DEFAULT_LEVEL

        sources = self._test[word]
        for level, tags in self._SOURCE_LEVELS:
            if any(tag in sources for tag in tags):
                return level
        return self.DEFAULT_LEVEL
|
||||
|
||||
|
||||
import snowballstemmer
|
||||
from datetime import datetime
|
||||
|
||||
|
||||
class UserVocabularyLevel(VocabularyLevelEstimator):
    '''
    Estimate a user's vocabulary level from the words the user recorded.

    d maps each word to a list of timestamp strings in YYYYMMDDHHMM format.
    Only the RECENT_WORD_COUNT most recently recorded valid words are
    considered; the level is the average difficulty of those words.
    '''

    # How many of the most recently recorded words are considered.
    RECENT_WORD_COUNT = 3

    # At most this many of the hardest considered words enter the average.
    HARDEST_WORD_COUNT = 10

    _TIMESTAMP_FORMAT = '%Y%m%d%H%M'

    def __init__(self, d):
        self.filtered_frequency = []  # always initialised, even for empty input
        self.d = d  # the user's word records
        self.word_lst = list(d.keys()) if d else []
        if d:
            self.filter_user_frequency()

    @classmethod
    def _latest_timestamp(cls, stamps):
        ''' Return the newest parseable timestamp in stamps, or None if there is none. '''
        if not isinstance(stamps, (list, tuple)):
            return None
        latest = None
        for stamp in stamps:
            try:
                moment = datetime.strptime(stamp, cls._TIMESTAMP_FORMAT)
            except (TypeError, ValueError):
                continue  # malformed entry: ignore it, keep looking
            if latest is None or moment > latest:
                latest = moment
        return latest

    def filter_user_frequency(self):
        '''
        Rebuild self.filtered_frequency from self.d.

        Words that are not English words, or that have no valid timestamp,
        are skipped. Of the remaining words the RECENT_WORD_COUNT most
        recent are kept. Recency is judged by date only; it is not
        compared against the current time, so historical records still
        count.
        '''
        self.filtered_frequency = []
        if not self.d:
            return  # nothing recorded

        dated_words = []
        for word, stamps in self.d.items():
            if not is_english_word(word):
                continue
            latest = self._latest_timestamp(stamps)
            if latest is not None:
                dated_words.append((latest, word))

        most_recent = heapq.nlargest(
            self.RECENT_WORD_COUNT, dated_words, key=lambda pair: pair[0]
        )

        stemmer = snowballstemmer.stemmer('english')
        for _, word in most_recent:
            # Look the word up as written first. A stem is often not a
            # dictionary word, so it is used only as a fallback for
            # inflected forms missing from the vocabulary table.
            key = word.lower()
            if key not in self._test:
                key = stemmer.stemWord(key)
            if key not in self.filtered_frequency:
                self.filtered_frequency.append(key)

    @property
    def level(self):
        '''
        Average level of the hardest considered words, or 0 if there are none.

        Reading this property does not modify the instance, so repeated
        reads return the same value.
        '''
        if not self.filtered_frequency:
            return 0  # no usable words

        hardest = heapq.nlargest(
            self.HARDEST_WORD_COUNT,
            (self.get_word_level(word) for word in self.filtered_frequency),
        )
        return sum(hardest) / len(hardest)
|
||||
|
||||
|
||||
class ArticleVocabularyLevel(VocabularyLevelEstimator):
    '''
    Estimate the difficulty level of an English article from its content.

    The level is based on the hardest words of the article (10 by default),
    adjusted by the article length.
    '''

    # Class-level default kept for backward compatibility; every instance
    # gets its own dict in __init__.
    difficulty_word = dict()

    def __init__(self, content):
        # Always set every attribute so that an empty article is a fully
        # initialised object with level 0.0.
        self.content = ''
        self.word_lst = []
        self.difficulty_word = {}
        if content:
            self.content = remove_non_words(content)
            self.word_lst = self.content.lower().split()
            # select the 10 most difficult words
            self.select_difficulty_word()

    def select_difficulty_word(self, n=10):
        '''
        Store the n hardest word stems of the article in self.difficulty_word
        as a {stem: level} dict, hardest first.
        '''
        stemmer = snowballstemmer.stemmer('english')
        stem_levels = {}
        for word in self.word_lst:
            stem = stemmer.stemWord(word)
            if stem not in stem_levels:
                stem_levels[stem] = self.get_word_level(stem)

        self.difficulty_word = dict(
            heapq.nlargest(n, stem_levels.items(), key=lambda item: item[1])
        )

    @property
    def level(self):
        '''
        Difficulty score of the article; 0.0 if it contains no words.
        '''
        if not self.difficulty_word:
            return 0.0

        selected_count = len(self.difficulty_word)
        article_length = len(self.word_lst)

        # Average difficulty of the selected hardest words.
        avg_difficulty = sum(self.difficulty_word.values()) / selected_count

        # NOTE(review): for articles longer than 100 words the average is
        # scaled by the share of words that were NOT selected. This makes
        # the score jump at the 100-word mark - confirm this is intended.
        if article_length > 100:
            avg_difficulty = avg_difficulty * (1 - selected_count / article_length)

        # Longer articles are rated slightly harder: +1% per 1000 words.
        if selected_count == 1:
            length_factor = 1
        else:
            length_factor = 1 + 0.01 * (article_length / 1000)

        return avg_difficulty * length_factor
|
||||
|
||||
# @property
|
||||
# def level(self):
|
||||
# if not self.difficulty_word:
|
||||
# return 0.0
|
||||
#
|
||||
# total_difficulty = sum(self.difficulty_word.values())
|
||||
# word_count = len(self.difficulty_word)
|
||||
#
|
||||
# # 文章越长,难度越高(非线性增长)
|
||||
# if word_count == 1:
|
||||
# length_factor = 1
|
||||
# else:
|
||||
# length_factor = 1 + 0.01 * (len(self.word_lst) / 1000) # 每 1000 词增加 10% 的难度
|
||||
# print(total_difficulty / word_count)
|
||||
# return (total_difficulty / word_count) * length_factor
|
||||
|
||||
# @property
|
||||
# def level(self):
|
||||
# total = 0.0 # TODO: need to compute this number
|
||||
# num = 0
|
||||
# if not self.difficulty_word:return 0.0
|
||||
# for word in self.difficulty_word:
|
||||
# num += 1
|
||||
# total += self.difficulty_word[word]
|
||||
# # print(total)
|
||||
# # print(num)
|
||||
# return total / num
|
||||
|
||||
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Quick manual check: print the level of a user with no recorded words.
    print(UserVocabularyLevel({}).level)
|
||||
# _test = load_record('static/words_and_tests.p')
|
||||
# print(_test)
|
||||
#
|
||||
# d = load_record('static/frequency/frequency_sb.pickle')
|
||||
# print(d)
|
||||
# # d = load_record('frequency_mrlan85.pickle')
|
||||
# # print(d)
|
||||
#
|
||||
# # user = UserVocabularyLevel(d)
|
||||
# user = UserVocabularyLevel({})
|
||||
# print('用户词汇水平:')
|
||||
#
|
||||
# print(user.level) # level is a property
|
||||
# s = """ energetic = haze dynamic = vigorous = animated Such is Love , Plain like Water
|
||||
# port him to stand up. She scolded him for nothaving waken her up. He said that he could manage. A serious quarrel was about to burst outagain.
|
||||
# I called them from Zhuhai, the beautiful city of relaxation8 and exciting views. I wanted to depict9 tothem how pretty a city Zhuhai is. He was surprised that I had travelled such a great distance, towhere he and she had never been all their life. He then believed that I had accomplished10 a bigsuccess, and that I had significantly broadened my horizons. He went on talk in honor of me, voicetrembling with pride; she was beside him, excited all the same. They did not mention anythingabout the family change and their suffering.
|
||||
# I had not heard about his being bit by a snake until she told me her grievance.
|
||||
# “He treated me bad again. He could not get out of bed by himself, but he wanted to be strong, andso he fell. I was trying to pull him up on his feet, and I just told a few words of complaints whenhe got out of bed, yelling that I was clumsy.”
|
||||
# ."""
|
||||
# article = ArticleVocabularyLevel(s)
|
||||
# print('文章词汇难度:')
|
||||
# print(article.level)
|
||||
#
|
||||
# s="""Every once in a while I will dial the familiar number.
|
||||
# It is usually he that picks up the phone. She will come up to him asking who the caller is. Knowingit’s me, she will grab the phone, say a few greeting words, and then start talking bad of him,complaints flooding out, all about chores and minor1 superficial details.
|
||||
# As always, he argues, and through the long-distance call I hear them quarreling seriously.
|
||||
# Quarreling seems to be bound with their life. He seems like a capricious boy, and she a little girlwho cannot bear with even a small grievance2. It is common that they fight each other half a dayover minor issues. Over and over again, they will quarrel—they are born to disagree with eachother.
|
||||
# Bored sometimes, I pretend to be very unhappy and yell, “Couldn’t you stop arguing and let myears have a quiet moment?”
|
||||
# She pauses two seconds, and mumbles3, “He always bullies4 me.” I almost see her, pitiful as she is.
|
||||
# Then I turn soft, and try to find faults in him.
|
||||
# “Why did you bully5 her?”
|
||||
# He listens to me, irritated, “Who bullies her? It is she that bullies me!”
|
||||
# Just as the saying goes, a righteousjudge is never able to decide who is right and who is wrong inan argument between a couple. I tried to intervene, only to get them into more serious debate.
|
||||
# Like an actor and an actress, they are indulged in acting6. With me watching, they end up in moreheated disagreements. So I don’t want to bother any more. I just stand by when they quarrel.
|
||||
# In the end, she will find her eyes red, and wipe tears off her cheeks. He gets upset and attempts tosoothe her.?He looks funny—staring at her like a mad frog with two eyes open wide, he yells withdetermination,
|
||||
# “From now on I will wash all the dishes.” He seems to be swearing. And she weeps and smilingflowers bloom on her face.
|
||||
# One day he got bit by a snake while working in the farmland, one leg swelling7 furiously. He wasthen rushed to hospital where he went through a series of torment—he got injected to let thecontaminated blood out.
|
||||
# Seeing the blood, she was scared to big tears, crying loud.
|
||||
# “Stop it. That’s enough. You are getting on my nerves. I am not gonna die.” He shouted, frowning.
|
||||
# She got mad at the word “die” that he had mentioned.
|
||||
# “Why did you say that?” She immediately quarreled hard with him, not to stop for a long while.
|
||||
# Doctors were watching on, amused.
|
||||
# “Isn’t quarreling more effective than pain-killing pills?” The doctors teased.
|
||||
# They looked at each other, embarrassed and laughing. That finally stopped their quarreling.
|
||||
# In order to save hospital expenses, he insisted on going home despite her mad disagreements.
|
||||
# Lying in bed, he was having an intravenous drip for a continuous period of 12 days. She was busydealing with housework and everything else out of home. She was weak and thin, and too muchwork had made her even more skinny.
|
||||
# Witnessing her change, he hurt in the heart. One night he wanted to drink water. He did not wakeher up, pulled himself out of the bed, and fell over because his legs were too weak.
|
||||
# His fall actually woke her up. She rushed to support him to stand up. She scolded him for nothaving waken her up. He said that he could manage. A serious quarrel was about to burst outagain.
|
||||
# I called them from Zhuhai, the beautiful city of relaxation8 and exciting views. I wanted to depict9 tothem how pretty a city Zhuhai is. He was surprised that I had travelled such a great distance, towhere he and she had never been all their life. He then believed that I had accomplished10 a bigsuccess, and that I had significantly broadened my horizons. He went on talk in honor of me, voicetrembling with pride; she was beside him, excited all the same. They did not mention anythingabout the family change and their suffering.
|
||||
# I had not heard about his being bit by a snake until she told me her grievance.
|
||||
# “He treated me bad again. He could not get out of bed by himself, but he wanted to be strong, andso he fell. I was trying to pull him up on his feet, and I just told a few words of complaints whenhe got out of bed, yelling that I was clumsy.”
|
||||
# “No, no, no, she was not telling the truth.”
|
||||
# He was eager to tell me that she was really silly, that she did not know how to change theintravenous drip bottle. Then they started it over again quarreling. And this time I was listening tothem in good patience, holding the telephone receiver.
|
||||
# Water was roaring in the sea out the window, waves climbing up and falling down, just like what Iwas feeling at the bottom of my heart.
|
||||
# He suddenly realized that they should stop. He said that it was a long- distance call. Sheimmediately stopped arguing.
|
||||
# “Why didn’t you remind me earlier? How much money have we wasted her?”
|
||||
# She hangs up before I had a chance to talk. I know they would have a good fight again.
|
||||
# They are my father and mother. He is 66, and she is 64."""
|
||||
#
|
||||
# with open('test/article_test.p', 'wb') as file: # 注意使用二进制写模式'wb'
|
||||
# pickle.dump(s, file)
|
||||
#
|
||||
# with open('test/article_test.p', 'rb') as file: # 注意使用二进制读模式'rb'
|
||||
# loaded_data = pickle.load(file)
|
||||
#
|
||||
# # print(loaded_data)
|
||||
|
Loading…
Reference in New Issue