286 lines
14 KiB
Python
286 lines
14 KiB
Python
'''
Estimate a user's vocabulary level given his vocabulary data
Estimate an English article's difficulty level given its content
Preliminary design

Hui, 2024-09-23
Last updated: 2024-09-25, 2024-09-30
'''
|
||
import os
|
||
import pickle
|
||
import random
|
||
import re
|
||
from collections import defaultdict
|
||
from datetime import datetime, timedelta
|
||
import heapq
|
||
import snowballstemmer
|
||
from flask import session
|
||
import enchant
|
||
|
||
|
||
# word_lst = vocabulary list
|
||
|
||
def load_record(pickle_fname):
    """Return the object stored in the pickle file *pickle_fname*.

    The file is opened in binary mode and closed again before returning.

    NOTE(review): ``pickle.load`` can run arbitrary code while
    deserialising, so only point this at files the application itself
    produced.
    """
    with open(pickle_fname, 'rb') as pickle_file:
        return pickle.load(pickle_file)
|
||
|
||
|
||
|
||
|
||
def is_english_word(word):
    """Return True when *word* is made up solely of ASCII letters.

    Args:
        word: Candidate token. Anything that is not a ``str`` is rejected
            instead of raising.

    Returns:
        bool: True for a non-empty run of ``a-z`` / ``A-Z`` characters,
        False otherwise. The literal token ``'xyz'`` is always rejected
        (special case kept from the original implementation; presumably a
        placeholder entry -- confirm).
    """
    if not isinstance(word, str) or word == 'xyz':
        return False
    # fullmatch, unlike match() with a trailing ``$``, does not tolerate a
    # newline at the end of the string ("abc\n" must not count as a word).
    return re.fullmatch(r'[a-zA-Z]+', word) is not None
|
||
|
||
def is_valid_datetime_string(date_string, format='%Y%m%d%H%M'):
    """Return True when *date_string* can be parsed with *format*.

    Args:
        date_string: Value to check. Non-string values (``None``, numbers,
            ...) are reported as invalid rather than raising.
        format: ``datetime.strptime`` format string; defaults to
            ``'%Y%m%d%H%M'``.

    Returns:
        bool: True if ``datetime.strptime(date_string, format)`` succeeds.
    """
    try:
        datetime.strptime(date_string, format)
    except (ValueError, TypeError):
        # ValueError: text does not match the format.
        # TypeError: the value is not a string at all.
        return False
    return True
|
||
|
||
def remove_non_words(input_string):
    """Strip everything except ASCII letters and normalise whitespace.

    Characters that are neither ``a-z`` / ``A-Z`` nor whitespace are deleted
    (not replaced by a space), then runs of whitespace are collapsed to a
    single space and leading/trailing whitespace is dropped.
    """
    letters_and_spaces = re.sub(r'[^a-zA-Z\s]', '', input_string)
    return ' '.join(letters_and_spaces.split())
|
||
|
||
|
||
|
||
class VocabularyLevelEstimator:
    """Base class for estimating vocabulary difficulty levels.

    Subclasses are expected to set ``self.word_lst`` (an iterable of words)
    and normally override :attr:`level`.
    """

    # Vocabulary table: maps a word to the sources where it appears,
    # e.g. word -> ["IELTS", "Gaokao", ...].  Loaded once, when the class
    # is created.
    _test = load_record('static/words_and_tests.p')

    # Level given to words that are unknown, explicitly excluded, or whose
    # sources match none of the tags below.
    _BASELINE_LEVEL = 3

    # Word forms that are always rated at the baseline level even when they
    # appear in the vocabulary table.  Built once instead of on every call.
    _BASELINE_WORDS = frozenset([
        'went', 'heard', 'i', 'feet', 'got', 'been', 'gone', 'done', 'had', 'said', 'seen', 'made', 'taken', 'come', 'gotten', 'got ', 'ran', 'eaten',
        'written', 'found', 'given', 'told', 'brought', 'kept', 'stood', 'sat', 'won', 'bought', 'caught', 'begun',
        'drank', 'rang', 'sang', 'swam', 'blew', 'drew', 'flew', 'grew', 'knew', 'threw', 'shown', 'broken', 'chosen',
        'forgotten', 'spoke', 'woke', 'woken', 'driven', 'fell', 'given', 'risen', 'mistaken', 'ridden', 'lain', 'lied', 'at', 'no',
    ])

    # (level, source tags) pairs, checked in order; the first match wins.
    _LEVEL_BY_SOURCE = (
        (4, ('CET4',)),
        (5, ('OXFORD3000',)),
        (6, ('CET6', 'GRADUATE')),
        (7, ('OXFORD5000', 'IELTS')),
        (8, ('BBC',)),
    )

    @property
    def level(self):
        """Print each word with its sources and return the average level.

        The running total is still a placeholder (see TODO), so the result
        is currently always ``0.0``.  An empty ``word_lst`` yields ``0.0``
        instead of raising ``ZeroDivisionError``.
        """
        total = 0.0  # TODO: need to compute this number
        num = 0
        for word in self.word_lst:
            num += 1
            if word in self._test:
                print(f'{word} : {self._test[word]}')
            else:
                print(f'{word}')
        if num == 0:
            return 0.0
        return total / num

    @property
    def test(self):
        """The shared word -> sources vocabulary table."""
        return self._test

    def get_word_level(self, word):
        """Return the difficulty level of *word* as an integer from 3 to 8.

        Words missing from the vocabulary table and words in the built-in
        exclusion set are rated 3.  Otherwise the level follows from the
        first matching source tag: CET4 -> 4, OXFORD3000 -> 5,
        CET6/GRADUATE -> 6, OXFORD5000/IELTS -> 7, BBC -> 8.  A word whose
        sources match none of these tags is also rated 3, so the method
        never returns ``None`` (which would break sorting and summing of
        levels).
        """
        if word not in self._test or word in self._BASELINE_WORDS:
            return self._BASELINE_LEVEL
        sources = self._test[word]
        for level, tags in self._LEVEL_BY_SOURCE:
            if any(tag in sources for tag in tags):
                return level
        return self._BASELINE_LEVEL
|
||
|
||
|
||
import snowballstemmer
|
||
from datetime import datetime
|
||
|
||
|
||
class UserVocabularyLevel(VocabularyLevelEstimator):
    """Estimate a user's vocabulary level from the user's word record.

    Args:
        d: Mapping of word -> sequence whose first element is a
            ``'%Y%m%d%H%M'`` timestamp string (presumably when the word was
            recorded -- confirm against the caller).  May be empty/None.
    """

    # Only words stamped strictly after this instant are considered.
    _CUTOFF = datetime(2024, 8, 5)
    _TIMESTAMP_FORMAT = '%Y%m%d%H%M'
    # Number of hardest words averaged by ``level``.
    _TOP_N = 10

    def __init__(self, d):
        self.filtered_frequency = []  # always initialised, even for empty input
        self.d = d  # the user's new-word record
        self.word_lst = list(d.keys()) if d else []
        if d:
            self.filter_user_frequency()

    def filter_user_frequency(self):
        """Collect the unique stems of recent English words in ``self.d``.

        A word is kept when it passes ``is_english_word`` and its first
        timestamp parses and lies after the cutoff.  Each stem is appended
        to ``self.filtered_frequency`` at most once; entries with a missing
        or malformed timestamp are skipped.
        """
        if not self.d:
            return  # nothing to filter

        stemmer = snowballstemmer.stemmer('english')
        # De-duplicate on the *stem*: checking the raw word against a list
        # of stems let the same stem in several times.
        seen = set(self.filtered_frequency)

        for word in self.d:
            if not is_english_word(word):
                continue
            try:
                recorded_at = datetime.strptime(self.d[word][0], self._TIMESTAMP_FORMAT)
            except (IndexError, TypeError, ValueError):
                continue  # missing or malformed timestamp
            if recorded_at <= self._CUTOFF:
                continue

            stem = stemmer.stemWord(word)
            if isinstance(stem, list):  # defensive: kept from the original
                stem = stem[0]
            if stem not in seen:
                seen.add(stem)
                self.filtered_frequency.append(stem)

    @property
    def level(self):
        """Average level of the user's (up to) ten hardest recent words.

        Returns ``0`` when no word survived filtering.  Reading the
        property does not modify the instance, so repeated reads give the
        same value.
        """
        if not self.filtered_frequency:
            return 0  # empty vocabulary

        levels = sorted(
            (self.get_word_level(word) for word in self.filtered_frequency),
            reverse=True,
        )
        hardest = levels[:self._TOP_N]
        return sum(hardest) / len(hardest)
|
||
|
||
|
||
class ArticleVocabularyLevel(VocabularyLevelEstimator):
    """Estimate the difficulty of an English article from its hardest words.

    Args:
        content: Raw article text.  Empty/None content produces an
            instance whose ``level`` is ``0.0``.
    """

    # Class-level default kept for backward compatibility; every instance
    # created through __init__ gets its own dict.
    difficulty_word = dict()

    def __init__(self, content):
        # Always initialise instance state so the attributes exist (and are
        # not shared between instances) even when there is no content.
        self.content = ''
        self.word_lst = []
        self.difficulty_word = {}
        if content:
            self.content = remove_non_words(content)
            self.word_lst = self.content.lower().split()
            # select the 10 most difficult words
            self.select_difficulty_word()

    def select_difficulty_word(self, n=10):
        """Store the *n* hardest word stems in ``self.difficulty_word``.

        Every word of ``self.word_lst`` is stemmed and rated with
        ``get_word_level``; the result maps stem -> level for the *n*
        highest-rated stems (ties keep first-occurrence order).
        """
        stemmer = snowballstemmer.stemmer('english')
        levels = {}
        for word in self.word_lst:
            stem = stemmer.stemWord(word)
            if stem not in levels:  # rate each distinct stem only once
                levels[stem] = self.get_word_level(stem)

        ranked = sorted(levels.items(), key=lambda item: item[1], reverse=True)
        self.difficulty_word = dict(ranked[:n])

    @property
    def level(self):
        """Difficulty score of the article.

        The score is the mean level of the selected hard words.  For
        articles longer than 100 words the mean is scaled by
        ``1 - hard_words / total_words``.  Unless exactly one hard word was
        selected, the result is then multiplied by a length factor of
        ``1 + 0.01 * total_words / 1000``.  Returns ``0.0`` when no hard
        words were selected.
        """
        if not self.difficulty_word:
            return 0.0

        hard_count = len(self.difficulty_word)
        total_words = len(self.word_lst)

        avg_difficulty = sum(self.difficulty_word.values()) / hard_count
        if total_words > 100:
            avg_difficulty = avg_difficulty * (1 - hard_count / total_words)

        # Longer articles score slightly higher: the factor grows linearly,
        # by 1% per 1000 words.
        # NOTE(review): the earlier comment claimed "10% per 1000 words,
        # non-linear"; the coefficient 0.01 gives 1% -- confirm intent.
        if hard_count == 1:
            length_factor = 1
        else:
            length_factor = 1 + 0.01 * (total_words / 1000)

        return avg_difficulty * length_factor
|
||
|
||
# @property
|
||
# def level(self):
|
||
# if not self.difficulty_word:
|
||
# return 0.0
|
||
#
|
||
# total_difficulty = sum(self.difficulty_word.values())
|
||
# word_count = len(self.difficulty_word)
|
||
#
|
||
# # 文章越长,难度越高(非线性增长)
|
||
# if word_count == 1:
|
||
# length_factor = 1
|
||
# else:
|
||
# length_factor = 1 + 0.01 * (len(self.word_lst) / 1000) # 每 1000 词增加 10% 的难度
|
||
# print(total_difficulty / word_count)
|
||
# return (total_difficulty / word_count) * length_factor
|
||
|
||
# @property
|
||
# def level(self):
|
||
# total = 0.0 # TODO: need to compute this number
|
||
# num = 0
|
||
# if not self.difficulty_word:return 0.0
|
||
# for word in self.difficulty_word:
|
||
# num += 1
|
||
# total += self.difficulty_word[word]
|
||
# # print(total)
|
||
# # print(num)
|
||
# return total / num
|
||
|
||
|
||
|
||
|
||
if __name__ == '__main__':
    # Smoke test: build an estimator from an empty word record and print
    # its level.
    print(UserVocabularyLevel({}).level)
|
||
# _test = load_record('static/words_and_tests.p')
|
||
# print(_test)
|
||
#
|
||
# d = load_record('static/frequency/frequency_sb.pickle')
|
||
# print(d)
|
||
# # d = load_record('frequency_mrlan85.pickle')
|
||
# # print(d)
|
||
#
|
||
# # user = UserVocabularyLevel(d)
|
||
# user = UserVocabularyLevel({})
|
||
# print('用户词汇水平:')
|
||
#
|
||
# print(user.level) # level is a property
|
||
# s = """ energetic = haze dynamic = vigorous = animated Such is Love , Plain like Water
|
||
# port him to stand up. She scolded him for nothaving waken her up. He said that he could manage. A serious quarrel was about to burst outagain.
|
||
# I called them from Zhuhai, the beautiful city of relaxation8 and exciting views. I wanted to depict9 tothem how pretty a city Zhuhai is. He was surprised that I had travelled such a great distance, towhere he and she had never been all their life. He then believed that I had accomplished10 a bigsuccess, and that I had significantly broadened my horizons. He went on talk in honor of me, voicetrembling with pride; she was beside him, excited all the same. They did not mention anythingabout the family change and their suffering.
|
||
# I had not heard about his being bit by a snake until she told me her grievance.
|
||
# “He treated me bad again. He could not get out of bed by himself, but he wanted to be strong, andso he fell. I was trying to pull him up on his feet, and I just told a few words of complaints whenhe got out of bed, yelling that I was clumsy.”
|
||
# ."""
|
||
# article = ArticleVocabularyLevel(s)
|
||
# print('文章词汇难度:')
|
||
# print(article.level)
|
||
#
|
||
# s="""Every once in a while I will dial the familiar number.
|
||
# It is usually he that picks up the phone. She will come up to him asking who the caller is. Knowingit’s me, she will grab the phone, say a few greeting words, and then start talking bad of him,complaints flooding out, all about chores and minor1 superficial details.
|
||
# As always, he argues, and through the long-distance call I hear them quarreling seriously.
|
||
# Quarreling seems to be bound with their life. He seems like a capricious boy, and she a little girlwho cannot bear with even a small grievance2. It is common that they fight each other half a dayover minor issues. Over and over again, they will quarrel—they are born to disagree with eachother.
|
||
# Bored sometimes, I pretend to be very unhappy and yell, “Couldn’t you stop arguing and let myears have a quiet moment?”
|
||
# She pauses two seconds, and mumbles3, “He always bullies4 me.” I almost see her, pitiful as she is.
|
||
# Then I turn soft, and try to find faults in him.
|
||
# “Why did you bully5 her?”
|
||
# He listens to me, irritated, “Who bullies her? It is she that bullies me!”
|
||
# Just as the saying goes, a righteousjudge is never able to decide who is right and who is wrong inan argument between a couple. I tried to intervene, only to get them into more serious debate.
|
||
# Like an actor and an actress, they are indulged in acting6. With me watching, they end up in moreheated disagreements. So I don’t want to bother any more. I just stand by when they quarrel.
|
||
# In the end, she will find her eyes red, and wipe tears off her cheeks. He gets upset and attempts tosoothe her.?He looks funny—staring at her like a mad frog with two eyes open wide, he yells withdetermination,
|
||
# “From now on I will wash all the dishes.” He seems to be swearing. And she weeps and smilingflowers bloom on her face.
|
||
# One day he got bit by a snake while working in the farmland, one leg swelling7 furiously. He wasthen rushed to hospital where he went through a series of torment—he got injected to let thecontaminated blood out.
|
||
# Seeing the blood, she was scared to big tears, crying loud.
|
||
# “Stop it. That’s enough. You are getting on my nerves. I am not gonna die.” He shouted, frowning.
|
||
# She got mad at the word “die” that he had mentioned.
|
||
# “Why did you say that?” She immediately quarreled hard with him, not to stop for a long while.
|
||
# Doctors were watching on, amused.
|
||
# “Isn’t quarreling more effective than pain-killing pills?” The doctors teased.
|
||
# They looked at each other, embarrassed and laughing. That finally stopped their quarreling.
|
||
# In order to save hospital expenses, he insisted on going home despite her mad disagreements.
|
||
# Lying in bed, he was having an intravenous drip for a continuous period of 12 days. She was busydealing with housework and everything else out of home. She was weak and thin, and too muchwork had made her even more skinny.
|
||
# Witnessing her change, he hurt in the heart. One night he wanted to drink water. He did not wakeher up, pulled himself out of the bed, and fell over because his legs were too weak.
|
||
# His fall actually woke her up. She rushed to support him to stand up. She scolded him for nothaving waken her up. He said that he could manage. A serious quarrel was about to burst outagain.
|
||
# I called them from Zhuhai, the beautiful city of relaxation8 and exciting views. I wanted to depict9 tothem how pretty a city Zhuhai is. He was surprised that I had travelled such a great distance, towhere he and she had never been all their life. He then believed that I had accomplished10 a bigsuccess, and that I had significantly broadened my horizons. He went on talk in honor of me, voicetrembling with pride; she was beside him, excited all the same. They did not mention anythingabout the family change and their suffering.
|
||
# I had not heard about his being bit by a snake until she told me her grievance.
|
||
# “He treated me bad again. He could not get out of bed by himself, but he wanted to be strong, andso he fell. I was trying to pull him up on his feet, and I just told a few words of complaints whenhe got out of bed, yelling that I was clumsy.”
|
||
# “No, no, no, she was not telling the truth.”
|
||
# He was eager to tell me that she was really silly, that she did not know how to change theintravenous drip bottle. Then they started it over again quarreling. And this time I was listening tothem in good patience, holding the telephone receiver.
|
||
# Water was roaring in the sea out the window, waves climbing up and falling down, just like what Iwas feeling at the bottom of my heart.
|
||
# He suddenly realized that they should stop. He said that it was a long- distance call. Sheimmediately stopped arguing.
|
||
# “Why didn’t you remind me earlier? How much money have we wasted her?”
|
||
# She hangs up before I had a chance to talk. I know they would have a good fight again.
|
||
# They are my father and mother. He is 66, and she is 64."""
|
||
#
|
||
# with open('test/article_test.p', 'wb') as file: # 注意使用二进制写模式'wb'
|
||
# pickle.dump(s, file)
|
||
#
|
||
# with open('test/article_test.p', 'rb') as file: # 注意使用二进制读模式'rb'
|
||
# loaded_data = pickle.load(file)
|
||
#
|
||
# # print(loaded_data)
|
||
|