Add vocabulary-level-related files and user service fixes
parent
c3b109528a
commit
250d2c37fd
|
@ -0,0 +1,166 @@
|
|||
from WordFreq import WordFreq
|
||||
from wordfreqCMD import youdao_link, sort_in_descending_order
|
||||
import pickle_idea, pickle_idea2
|
||||
import os
|
||||
import random, glob
|
||||
import hashlib
|
||||
from datetime import datetime
|
||||
from flask import Flask, request, redirect, render_template, url_for, session, abort, flash, get_flashed_messages
|
||||
from difficulty import get_difficulty_level_for_user, text_difficulty_level, user_difficulty_level
|
||||
from model.article import get_all_articles, get_article_by_id, get_number_of_articles
|
||||
import logging
|
||||
import re
|
||||
# Root of the runtime assets (templates, static files, word lists).
path_prefix = './'
db_path_prefix = './db/' # comment this line in deployment
# Absolute path to the Oxford word list used for the difficulty ratio.
# NOTE(review): hard-coded Windows path -- breaks on any other machine;
# consider deriving it from db_path_prefix instead. TODO confirm with owner.
oxford_words_path='C:\\Users\\ANNA\\Desktop\\ooad\\app\\db\\oxford_words.txt'
|
||||
|
||||
def count_oxford_words(text, oxford_words):
    '''
    Count how many tokens of *text* appear in the Oxford word mapping.

    :param text: article text (any case)
    :param oxford_words: mapping whose keys are lower-case Oxford words
    :return: tuple (number of Oxford words found, total number of words)
    '''
    tokens = re.findall(r'\b\w+\b', text.lower())
    hits = sum(1 for token in tokens if token in oxford_words)
    return hits, len(tokens)
|
||||
|
||||
def calculate_ratio(oxford_word_count, total_words):
    '''
    Share of Oxford words in the text.

    :param oxford_word_count: number of Oxford words found
    :param total_words: total token count
    :return: oxford_word_count / total_words, or 0 for an empty text
    '''
    return oxford_word_count / total_words if total_words else 0
|
||||
|
||||
def load_oxford_words(file_path):
    '''
    Parse the Oxford word-list file into a dictionary.

    Each line is expected to hold "word pos level" separated by whitespace.

    BUG FIX: the original indexed parts[0..2] unconditionally and crashed
    with IndexError on blank or malformed lines; such lines are now skipped.

    :param file_path: path to the oxford_words.txt file (utf-8)
    :return: dict mapping word -> {'pos': part-of-speech, 'level': CEFR level}
    '''
    oxford_words = {}
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            parts = line.strip().split()
            if len(parts) < 3:  # guard against blank / malformed lines
                continue
            word = parts[0]
            pos = parts[1]
            level = parts[2]
            oxford_words[word] = {'pos': pos, 'level': level}
    return oxford_words
|
||||
|
||||
def total_number_of_essays():
    # Thin wrapper over the article model; returns the number of stored articles.
    return get_number_of_articles()
|
||||
|
||||
|
||||
def get_article_title(s):
    '''Return the article title, i.e. the first line of the text.'''
    title, _, _ = s.partition('\n')
    return title
|
||||
|
||||
|
||||
def get_article_body(s):
    '''Return everything after the first line (the title) of the text.'''
    _, _, body = s.partition('\n')
    return body
|
||||
|
||||
|
||||
def get_today_article(user_word_list, visited_articles):
    """Pick an article that matches the reader's vocabulary level.

    :param user_word_list: path of the user's pickled word-frequency record
    :param visited_articles: dict with keys 'index' (cursor into the history)
        and 'article_ids' (ids of previously shown articles, newest last);
        may be None for a fresh session
    :return: tuple (visited_articles, today_article dict or None, status string)
    """
    if visited_articles is None:
        visited_articles = {
            "index" : 0,  # cursor into article_ids
            "article_ids": []  # ids of previously shown articles, newest last
        }
    if visited_articles["index"] > len(visited_articles["article_ids"])-1:  # generating a new article: query all candidates
        result = get_all_articles()
    else:  # revisiting a previously read article: query it by id
        if visited_articles["article_ids"][visited_articles["index"]] == 'null':  # a page refresh can land on the 'null' placeholder; behave like a "previous article" step
            visited_articles["index"] -= 1
            visited_articles["article_ids"].pop()
        article_id = visited_articles["article_ids"][visited_articles["index"]]
        result = get_article_by_id(article_id)
    random.shuffle(result)

    # Choose article according to reader's level
    logging.debug('* get_today_article(): start d1 = ... ')
    d1 = load_freq_history(user_word_list)
    d2 = load_freq_history(path_prefix + 'static/words_and_tests.p')
    logging.debug(' ... get_today_article(): get_difficulty_level_for_user() start')
    d3 = get_difficulty_level_for_user(d1, d2)
    logging.debug(' ... get_today_article(): done')

    d = None
    result_of_generate_article = "not found"

    d_user = load_freq_history(user_word_list)
    logging.debug('* get_today_article(): user_difficulty_level() start')
    user_level = user_difficulty_level(d_user, d3) # more consideration as user's behaviour is dynamic. Time factor should be considered.
    logging.debug('* get_today_article(): done')
    text_level = 0
    if visited_articles["index"] > len(visited_articles["article_ids"])-1:  # generating a new article
        amount_of_visited_articles = len(visited_articles["article_ids"])
        amount_of_existing_articles = result.__len__()
        if amount_of_visited_articles == amount_of_existing_articles:  # every existing article has already been read
            result_of_generate_article = "had read all articles"
        else:
            for k in range(3):  # try at most 3 rounds
                for reading in result:
                    text_level = text_difficulty_level(reading['text'], d3)
                    factor = random.gauss(0.8, 0.1)  # a number drawn from a Gaussian distribution with mean 0.8 and standard deviation 0.1
                    if reading['article_id'] not in visited_articles["article_ids"] and within_range(text_level, user_level, (8.0 - user_level) * factor):  # unseen article whose level falls in the acceptable band
                        d = reading
                        visited_articles["article_ids"].append(d['article_id'])  # remember this article's id
                        result_of_generate_article = "found"
                        break
                if result_of_generate_article == "found":  # leave the outer loop as soon as an article is found
                    break
            if result_of_generate_article != "found":  # all read, or 3 rounds found nothing suitable: push a 'null' placeholder
                visited_articles["article_ids"].append('null')
    else:  # showing a previously read article
        d = random.choice(result)
        text_level = text_difficulty_level(d['text'], d3)
        result_of_generate_article = "found"

    today_article = None
    if d:
        oxford_words = load_oxford_words(oxford_words_path)
        oxford_word_count, total_words = count_oxford_words(d['text'],oxford_words)
        ratio = calculate_ratio(oxford_word_count,total_words)
        today_article = {
            "user_level": '%4.1f' % user_level,
            "text_level": '%4.1f' % text_level,
            "date": d['date'],
            "article_title": get_article_title(d['text']),
            "article_body": get_article_body(d['text']),
            "source": d["source"],
            "question": get_question_part(d['question']),
            "answer": get_answer_part(d['question']),
            "ratio" : ratio
        }

    return visited_articles, today_article, result_of_generate_article
|
||||
|
||||
|
||||
def load_freq_history(path):
    '''Load a user's pickled word-frequency record; empty dict when absent.'''
    if not os.path.exists(path):
        return {}
    return pickle_idea.load_record(path)
|
||||
|
||||
|
||||
def within_range(x, y, r):
    '''True when x lies strictly above y but no more than r above it.'''
    return y < x <= y + r
|
||||
|
||||
|
||||
def get_question_part(s):
    '''
    Extract the question block of an article's question text.

    Returns the lines from the 'QUESTION' marker (inclusive) up to the
    'ANSWER' marker (exclusive), each stripped, joined with newlines.
    '''
    collected = []
    in_question = False
    for raw in s.strip().split('\n'):
        stripped = raw.strip()
        if stripped == 'QUESTION':
            collected.append(stripped)
            in_question = True
        elif stripped == 'ANSWER':
            in_question = False
        elif in_question:
            collected.append(stripped)
    return '\n'.join(collected)
|
||||
|
||||
|
||||
def get_answer_part(s):
    '''
    Extract the answer block of an article's question text: every stripped
    line after the 'ANSWER' marker, joined with newlines.
    '''
    collected = []
    seen_marker = False
    for raw in s.strip().split('\n'):
        stripped = raw.strip()
        if stripped == 'ANSWER':
            seen_marker = True
        elif seen_marker:
            collected.append(stripped)
    return '\n'.join(collected)
|
|
@ -0,0 +1,128 @@
|
|||
import hashlib
|
||||
import string
|
||||
from datetime import datetime, timedelta
|
||||
import unicodedata
|
||||
|
||||
|
||||
def md5(s):
    '''
    MD5 digest of a string.

    :param s: input string
    :return: 32-character hexadecimal MD5 digest
    '''
    return hashlib.md5(s.encode(encoding='utf-8')).hexdigest()
|
||||
|
||||
|
||||
# Deployment location of the app; the second assignment overrides it for
# local development.
path_prefix = '/var/www/wordfreq/wordfreq/'
path_prefix = './'  # comment this line in deployment
|
||||
|
||||
|
||||
def verify_user(username, password):
    '''True when *username* exists and *password* matches the stored hash.'''
    from model.user import get_user_by_username
    candidate = get_user_by_username(username)
    if candidate is None:
        return False
    # passwords are stored as md5(username + password) -- see add_user()
    return candidate.password == md5(username + password)
|
||||
|
||||
|
||||
def add_user(username, password):
    '''Create a user with a 30-day validity period, storing a salted hash.'''
    from model.user import insert_user
    now = datetime.now()
    insert_user(username=username,
                password=md5(username + password),
                start_date=now.strftime('%Y%m%d'),
                expiry_date=(now + timedelta(days=30)).strftime('%Y%m%d'))
|
||||
|
||||
|
||||
def check_username_availability(username):
    '''True when no user with this name exists yet.'''
    from model.user import get_user_by_username
    return get_user_by_username(username) is None
|
||||
|
||||
|
||||
def change_password(username, old_password, new_password):
    '''
    Change a user's password.

    :param username: user name
    :param old_password: current (plain-text) password
    :param new_password: new (plain-text) password
    :return: dict with key 'success' on success, or 'error' with a message
    '''
    if not verify_user(username, old_password):  # wrong old password
        return {'error':'Old password is wrong.', 'username':username}
    if new_password == old_password:  # new password must differ from the old one
        return {'error':'New password cannot be the same as the old password.', 'username':username}
    from model.user import update_password_by_username
    # BUG FIX: store the salted hash, not the plain text. verify_user()
    # compares against md5(username + password) (and add_user() stores that
    # hash), so saving the raw password locked the user out after a change.
    # Hashing username+password together also hides identical passwords
    # across different users.
    update_password_by_username(username, md5(username + new_password))
    return {'success':'Password changed', 'username':username}
|
||||
|
||||
|
||||
def get_expiry_date(username):
    '''Expiry date (YYYYMMDD string) of *username*.

    Unknown users get a fixed past date so they are treated as expired.
    '''
    from model.user import get_user_by_username
    user = get_user_by_username(username)
    return '20191024' if user is None else user.expiry_date
|
||||
|
||||
|
||||
class UserName:
    """Validate a proposed user name against the site's naming rules."""

    def __init__(self, username):
        # username: the raw candidate string, unmodified
        self.username = username

    def contains_chinese(self):
        """Return True if any character is a CJK unified ideograph.

        BUG FIX: unicodedata.name() raises ValueError for characters that
        have no name (e.g. control characters); passing a default avoids
        the crash and simply treats such characters as non-CJK.
        """
        for char in self.username:
            # Check if the character is in the CJK (Chinese, Japanese, Korean) Unicode block
            if unicodedata.name(char, '').startswith('CJK UNIFIED IDEOGRAPH'):
                return True
        return False

    def validate(self):
        """Return 'OK', or a message describing the first rule violated."""
        if len(self.username) > 20:
            return f'{self.username} is too long. The user name cannot exceed 20 characters.'
        if self.username.startswith('.'):  # a user name must not start with a dot
            return 'Period (.) is not allowed as the first letter in the user name.'
        if ' ' in self.username:  # a user name must not include a whitespace
            return 'Whitespace is not allowed in the user name.'
        for c in self.username:  # only non-leading periods and underscores are allowed as punctuation
            if c in string.punctuation and c != '.' and c != '_':
                return f'{c} is not allowed in the user name.'
        if self.username in ['signup', 'login', 'logout', 'reset', 'mark', 'back', 'unfamiliar', 'familiar', 'del',
                             'admin']:
            return 'You used a restricted word as your user name. Please come up with a better one.'
        if self.contains_chinese():
            return 'Chinese characters are not allowed in the user name.'
        return 'OK'
|
||||
|
||||
|
||||
class Password:
    """Validate a proposed password against the site's password rules."""

    def __init__(self, password):
        # password: the raw candidate string, unmodified
        self.password = password

    def contains_chinese(self):
        """Return True if any character is a CJK unified ideograph.

        BUG FIX: unicodedata.name() raises ValueError for characters that
        have no name (e.g. control characters); passing a default avoids
        the crash and simply treats such characters as non-CJK.
        """
        for char in self.password:
            # Check if the character is in the CJK (Chinese, Japanese, Korean) Unicode block
            if unicodedata.name(char, '').startswith('CJK UNIFIED IDEOGRAPH'):
                return True
        return False

    def validate(self):
        """Return 'OK', or a message describing the first rule violated."""
        if len(self.password) < 4:
            return 'Password must be at least 4 characters long.'
        if ' ' in self.password:
            return 'Password cannot contain spaces.'
        if self.contains_chinese():
            return 'Chinese characters are not allowed in the password.'
        return 'OK'
|
||||
|
||||
|
||||
class WarningMessage:
    """Validate a username or password and render the warning via str().

    str(WarningMessage(s)) yields 'OK' or a human-readable message.
    """

    def __init__(self, s, type='username'):
        # s: the raw string to validate; type: 'username' or 'password'
        self.s = s
        self.type = type

    def __str__(self):
        if self.type == 'username':
            return UserName(self.s).validate()
        if self.type == 'password':
            return Password(self.s).validate()
        # BUG FIX: the original fell through and implicitly returned None,
        # which makes str() raise an opaque "__str__ returned non-string"
        # TypeError; raise a clear error instead.
        raise ValueError(f'unknown validation type: {self.type!r}')
|
|
@ -0,0 +1,34 @@
|
|||
import pickle
import os

# Sample vocabulary data - simulating a user's word history
# Format: word -> list of dates when the word was studied
test_data = {
    "hello": ["20240101"],
    "world": ["20240101", "20240102"],
    "computer": ["20240101", "20240103"],
    "programming": ["20240102"],
    "python": ["20240102", "20240103"],
    "algorithm": ["20240103"],
    "database": ["20240103"],
    "interface": ["20240104"],
    "vocabulary": ["20240104"],
    "sophisticated": ["20240104"]
}

# Ensure frequency directory exists
# NOTE(review): hard-coded Windows path -- this script only runs on the
# original author's machine; consider a path relative to the project root.
base_path = r'C:\Users\ANNA\Desktop\app\static\frequency'
os.makedirs(base_path, exist_ok=True)

# Save the test data
file_path = os.path.join(base_path, 'mr1an85.pickle')
with open(file_path, 'wb') as f:
    pickle.dump(test_data, f)

print(f"Test file created at: {file_path}")

# Verify the file was created and can be read
with open(file_path, 'rb') as f:
    loaded_data = pickle.load(f)
    print("\nVerifying data:")
    print(loaded_data)
|
|
@ -0,0 +1,101 @@
|
|||
###########################################################################
|
||||
# Copyright 2019 (C) Hui Lan <hui.lan@cantab.net>
|
||||
# Written permission must be obtained from the author for commercial uses.
|
||||
###########################################################################
|
||||
|
||||
# Purpose: dictionary & pickle as a simple means of database.
|
||||
# Task: incorporate the functions into wordfreqCMD.py such that it will also show cumulative frequency.
|
||||
|
||||
import os
|
||||
import pickle
|
||||
from datetime import datetime
|
||||
|
||||
|
||||
def lst2dict(lst, d):
    '''
    Accumulate the (word, value) pairs of lst into dictionary d in place.

    A value may be a frequency count or a list of dates; date lists are
    collapsed to their length before being added.
    '''
    for word, value in lst:
        increment = len(value) if isinstance(value, list) else value
        d[word] = d.get(word, 0) + increment
|
||||
|
||||
|
||||
def dict2lst(d):
    ''' Convert dictionary to list of (word, frequency) pairs.

    Date-list values are collapsed to their length; an empty dict yields [].
    '''
    if not d:
        return []
    first_value = next(iter(d.values()))
    if isinstance(first_value, list):
        return [(word, len(dates)) for word, dates in d.items()]
    return list(d.items())
|
||||
|
||||
|
||||
def merge_frequency(lst1, lst2):
    '''Combine two (word, count) lists into one frequency dictionary.'''
    merged = {}
    for pairs in (lst1, lst2):
        lst2dict(pairs, merged)
    return merged
|
||||
|
||||
|
||||
def load_record(pickle_fname):
    '''
    Load and return the dictionary stored in pickle file *pickle_fname*.

    FIX: use a context manager so the handle is closed even when
    pickle.load() raises (the original leaked it on error).

    NOTE(review): pickle is unsafe on untrusted input; this assumes the
    file was produced by save_frequency_to_pickle().
    '''
    with open(pickle_fname, 'rb') as f:
        return pickle.load(f)
|
||||
|
||||
|
||||
def save_frequency_to_pickle(d, pickle_fname):
    '''
    Filter dictionary *d* and pickle the result to *pickle_fname*.

    Keys that are purely numeric or shorter than two characters are
    dropped; date-list values are collapsed to their length (a frequency).

    FIX: use a context manager so the handle is closed even when dumping
    raises; dropped the long commented-out stop-word list (dead code).
    '''
    exclusion_lst = []  # no stop words are excluded at the moment
    d2 = {}
    for k in d:
        if not k in exclusion_lst and not k.isnumeric() and len(k) > 1:
            # normalize: a list of dates becomes its count
            d2[k] = len(d[k]) if isinstance(d[k], list) else d[k]
    with open(pickle_fname, 'wb') as f:
        pickle.dump(d2, f)
|
||||
|
||||
def unfamiliar(path, word):
    '''
    Record one more "unfamiliar" hit (a timestamp) for *word* in the
    pickled record at *path*.

    FIX: the original did `dic[word] += [...]`, which raised KeyError when
    the word was not yet in the record; setdefault() handles that case.

    :param path: path of the pickled word record
    :param word: the word being marked unfamiliar
    :return: None (also when *path* does not exist)
    '''
    if not os.path.exists(path):
        return None
    with open(path, "rb") as f:
        dic = pickle.load(f)
    dic.setdefault(word, []).append(datetime.now().strftime('%Y%m%d%H%M'))
    with open(path, "wb") as fp:
        pickle.dump(dic, fp)
|
||||
|
||||
def familiar(path, word):
    '''
    Remove one "unfamiliar" hit for *word*: drop its oldest timestamp, or
    remove the word entirely when only one timestamp remains.

    FIX: the original opened both files without ever closing them (leaked
    handles); context managers guarantee they are closed.

    :param path: path of the pickled word record
    :param word: the word being marked familiar (must exist in the record)
    '''
    with open(path, "rb") as f:
        dic = pickle.load(f)
    if len(dic[word]) > 1:
        del dic[word][0]  # drop the oldest date
    else:
        dic.pop(word)  # last date gone: forget the word completely
    with open(path, "wb") as fp:
        pickle.dump(dic, fp)
|
||||
|
||||
if __name__ == '__main__':

    # Build a frequency dict from a list and persist it.
    lst1 = [('apple',2), ('banana',1)]
    d = {}
    lst2dict(lst1, d) # d will change
    save_frequency_to_pickle(d, 'frequency.p') # frequency.p is our database

    # Reload the database, merge in a second list and show the result.
    lst2 = [('banana',2), ('orange', 4)]
    d = load_record('frequency.p')
    lst1 = dict2lst(d)
    d = merge_frequency(lst2, lst1)
    print(d)
|
|
@ -0,0 +1,99 @@
|
|||
###########################################################################
|
||||
# Copyright 2019 (C) Hui Lan <hui.lan@cantab.net>
|
||||
# Written permission must be obtained from the author for commercial uses.
|
||||
###########################################################################
|
||||
|
||||
|
||||
# Purpose: dictionary & pickle as a simple means of database.
|
||||
# Task: incorporate the functions into wordfreqCMD.py such that it will also show cumulative frequency.
|
||||
# Note: unlike pick_idea.py, now the second item is not frequency, but a list of dates.
|
||||
|
||||
import pickle
|
||||
from datetime import datetime
|
||||
|
||||
def lst2dict(lst, d):
    '''
    Accumulate the (word, value) pairs of lst into dictionary d in place.

    Values may be frequency counts or lists of dates; a date list is
    collapsed to its length so d always holds plain counts.
    '''
    for word, value in lst:
        increment = len(value) if isinstance(value, list) else value
        d[word] = d.get(word, 0) + increment
|
||||
|
||||
def deleteRecord(path, word):
    '''Remove *word* from the pickled dict stored at *path*.

    A missing word is reported on stdout and otherwise ignored; the file
    is rewritten either way.
    '''
    with open(path, 'rb') as f:
        db = pickle.load(f)
    try:
        del db[word]
    except KeyError:
        print("sorry")  # word was not in the record
    with open(path, 'wb') as ff:
        pickle.dump(db, ff)
|
||||
|
||||
def dict2lst(d):
    '''Convert the record dict to a list of (word, frequency) pairs.

    Integer values pass through unchanged; date-list values are collapsed
    to their length. An empty dict (or one with unrecognized value types)
    yields [].
    '''
    if len(d) > 0:
        sample = d[next(iter(d))]
        if isinstance(sample, int):
            return list(d.items())
        if isinstance(sample, list):
            return [(word, len(dates)) for word, dates in d.items()]
    return []
|
||||
|
||||
def merge_frequency(lst1, lst2):
    '''Combine two (word, value) lists into one frequency dictionary.'''
    combined = {}
    for pairs in (lst1, lst2):
        lst2dict(pairs, combined)
    return combined
|
||||
|
||||
|
||||
def load_record(pickle_fname):
    '''
    Load and return the dictionary stored in pickle file *pickle_fname*.

    FIX: use a context manager so the handle is closed even when
    pickle.load() raises (the original leaked it on error).

    NOTE(review): pickle is unsafe on untrusted input; this assumes the
    file was produced by save_frequency_to_pickle().
    '''
    with open(pickle_fname, 'rb') as f:
        return pickle.load(f)
|
||||
|
||||
|
||||
def save_frequency_to_pickle(d, pickle_fname):
    '''
    Filter dictionary *d* against the module-level exclusion_lst and pickle
    the result to *pickle_fname*.

    Stop words, purely numeric keys and single-character keys are dropped;
    date-list values are collapsed to their length so only frequencies are
    stored.

    FIX: use a context manager so the handle is closed even when dumping
    raises (the original leaked it on error).
    '''
    d2 = {}
    for k in d:
        if not k in exclusion_lst and not k.isnumeric() and not len(k) < 2:
            # normalize: a list of dates becomes its count
            d2[k] = len(d[k]) if isinstance(d[k], list) else d[k]
    with open(pickle_fname, 'wb') as f:
        pickle.dump(d2, f)
|
||||
|
||||
|
||||
# Common English function words (stop words) that are never worth tracking
# in a user's difficult-word book; used by save_frequency_to_pickle().
exclusion_lst = ['one', 'no', 'has', 'had', 'do', 'that', 'have', 'by', 'not', 'but', 'we', 'this', 'my', 'him', 'so', 'or', 'as', 'are', 'it', 'from', 'with', 'be', 'can', 'for', 'an', 'if', 'who', 'whom', 'whose', 'which', 'the', 'to', 'a', 'of', 'and', 'you', 'i', 'he', 'she', 'they', 'me', 'was', 'were', 'is', 'in', 'at', 'on', 'their', 'his', 'her', 's', 'said', 'all', 'did', 'been', 'w']
|
||||
|
||||
if __name__ == '__main__':
    # Test 1: Convert dates to frequencies
    lst1 = [('apple',['201910251437', '201910251438']), ('banana',['201910251439'])]
    d = {}
    lst2dict(lst1, d)
    print("Test 1 - Convert dates to frequencies:")
    print(d) # Should show: {'apple': 2, 'banana': 1}

    # Test 2: Save and load frequencies
    save_frequency_to_pickle(d, 'frequency.p')
    loaded_d = load_record('frequency.p')
    print("\nTest 2 - Load saved frequencies:")
    print(loaded_d) # Should match the previous output

    # Test 3: Merge frequencies
    lst2 = [('banana',['201910251439']), ('orange', ['201910251440', '201910251439'])]
    lst1 = dict2lst(loaded_d)
    merged_d = merge_frequency(lst2, lst1)
    print("\nTest 3 - Merge frequencies:")
    print(merged_d) # Should show banana with increased frequency
|
|
@ -0,0 +1,108 @@
|
|||
import pytest
|
||||
from difficulty import VocabularyLevelEstimator
|
||||
|
||||
@pytest.fixture
def estimator():
    """Fixture to create a VocabularyLevelEstimator instance.

    NOTE(review): the constructor argument is a placeholder path
    ('path/to/your/actual/word_data.p'); these tests cannot run until it
    points at a real pickled word-data file.
    """
    return VocabularyLevelEstimator('path/to/your/actual/word_data.p')
|
||||
|
||||
class TestVocabularyLevelEstimator:
    """Tests for VocabularyLevelEstimator: normal, boundary, abnormal and
    edge-case inputs.

    NOTE(review): the expected default level of 3 and the 3..8 range are
    assumptions about the estimator's contract -- confirm against the
    implementation in difficulty.py.
    """

    # Normal input tests
    def test_normal_text_estimation(self, estimator):
        """Test text level estimation with normal English text"""
        text = """The quick brown fox jumps over the lazy dog.
        This text contains common English words that
        should be processed without any issues."""
        level = estimator.estimate_text_level(text)
        assert isinstance(level, float)
        assert 3 <= level <= 8  # Difficulty levels should be between 3-8

    def test_normal_user_level(self, estimator):
        """Test user level estimation with normal word history"""
        word_history = {
            'algorithm': ['20240101'],
            'computer': ['20240101', '20240102'],
            'programming': ['20240101']
        }
        level = estimator.estimate_user_level(word_history)
        assert isinstance(level, float)
        assert 3 <= level <= 8

    def test_normal_word_level(self, estimator):
        """Test word level estimation with common words"""
        assert estimator.get_word_level('computer') >= 3
        assert estimator.get_word_level('algorithm') >= 3

    # Boundary input tests
    def test_empty_text(self, estimator):
        """Test behavior with empty text"""
        assert estimator.estimate_text_level('') == 3  # Default level

    def test_single_word_text(self, estimator):
        """Test behavior with single-word text"""
        assert isinstance(estimator.estimate_text_level('Hello'), float)

    def test_empty_user_history(self, estimator):
        """Test behavior with empty user history"""
        assert estimator.estimate_user_level({}) == 3  # Default level

    def test_maximum_word_length(self, estimator):
        """Test behavior with extremely long word"""
        long_word = 'a' * 100
        assert estimator.get_word_level(long_word) == 3  # Default level

    # Abnormal input tests
    def test_non_english_text(self, estimator):
        """Test behavior with non-English text"""
        chinese_text = "这是中文文本"
        assert estimator.estimate_text_level(chinese_text) == 3  # Default level

    def test_special_characters(self, estimator):
        """Test behavior with special characters"""
        special_chars = "@#$%^&*()"
        assert estimator.estimate_text_level(special_chars) == 3  # Default level

    def test_invalid_word_history(self, estimator):
        """Test behavior with invalid word history format"""
        invalid_history = {'word': 'not_a_list'}
        with pytest.raises(ValueError):
            estimator.estimate_user_level(invalid_history)

    def test_none_input(self, estimator):
        """Test behavior with None input"""
        with pytest.raises(TypeError):
            estimator.estimate_text_level(None)

        with pytest.raises(TypeError):
            estimator.estimate_user_level(None)

        with pytest.raises(TypeError):
            estimator.get_word_level(None)

    # Edge cases
    def test_mixed_case_words(self, estimator):
        """Test behavior with mixed case words"""
        assert estimator.get_word_level('Computer') == estimator.get_word_level('computer')

    def test_whitespace_handling(self, estimator):
        """Test behavior with various whitespace patterns"""
        text_with_spaces = "  Multiple   Spaces   Between   Words  "
        level = estimator.estimate_text_level(text_with_spaces)
        assert isinstance(level, float)

    def test_repeated_words(self, estimator):
        """Test behavior with repeated words"""
        text = "word word word word word"
        level = estimator.estimate_text_level(text)
        assert isinstance(level, float)

    def test_numeric_input(self, estimator):
        """Test behavior with numeric input"""
        assert estimator.estimate_text_level("123 456 789") == 3  # Default level

    def test_mixed_content(self, estimator):
        """Test behavior with mixed content (numbers, words, special chars)"""
        mixed_text = "Hello123 @World! 456"
        level = estimator.estimate_text_level(mixed_text)
        assert isinstance(level, float)
|
|
@ -0,0 +1,216 @@
|
|||
from datetime import datetime
|
||||
from admin_service import ADMIN_NAME
|
||||
from flask import *
|
||||
|
||||
# from app import Yaml
|
||||
# from app.Article import get_today_article, load_freq_history
|
||||
# from app.WordFreq import WordFreq
|
||||
# from app.wordfreqCMD import sort_in_descending_order
|
||||
|
||||
import Yaml
|
||||
from Article import get_today_article, load_freq_history
|
||||
from WordFreq import WordFreq
|
||||
from wordfreqCMD import sort_in_descending_order
|
||||
|
||||
import pickle_idea
|
||||
import pickle_idea2
|
||||
|
||||
import logging
|
||||
logging.basicConfig(filename='log.txt', format='%(asctime)s %(message)s', level=logging.DEBUG)
|
||||
|
||||
# 初始化蓝图
|
||||
userService = Blueprint("user_bp", __name__)
|
||||
|
||||
path_prefix = '/var/www/wordfreq/wordfreq/'
|
||||
path_prefix = './' # comment this line in deployment
|
||||
|
||||
@userService.route("/get_next_article/<username>",methods=['GET','POST'])
|
||||
def get_next_article(username):
|
||||
user_freq_record = path_prefix + 'static/frequency/' + 'frequency_%s.pickle' % (username)
|
||||
session['old_articleID'] = session.get('articleID')
|
||||
if request.method == 'GET':
|
||||
visited_articles = session.get("visited_articles")
|
||||
if visited_articles['article_ids'][-1] == "null": # 如果当前还是"null",则将"null"pop出来,无需index+=1
|
||||
visited_articles['article_ids'].pop()
|
||||
else: # 当前不为"null",直接 index+=1
|
||||
visited_articles["index"] += 1
|
||||
session["visited_articles"] = visited_articles
|
||||
logging.debug('/get_next_article: start calling get_today_arcile()')
|
||||
visited_articles, today_article, result_of_generate_article = get_today_article(user_freq_record, session.get('visited_articles'))
|
||||
logging.debug('/get_next_arcile: done.')
|
||||
data = {
|
||||
'visited_articles': visited_articles,
|
||||
'today_article': today_article,
|
||||
'result_of_generate_article': result_of_generate_article
|
||||
}
|
||||
else:
|
||||
return 'Under construction'
|
||||
return json.dumps(data)
|
||||
|
||||
@userService.route("/get_pre_article/<username>",methods=['GET'])
|
||||
def get_pre_article(username):
|
||||
user_freq_record = path_prefix + 'static/frequency/' + 'frequency_%s.pickle' % (username)
|
||||
if request.method == 'GET':
|
||||
visited_articles = session.get("visited_articles")
|
||||
if(visited_articles["index"]==0):
|
||||
data=''
|
||||
else:
|
||||
visited_articles["index"] -= 1 # 上一篇,index-=1
|
||||
if visited_articles['article_ids'][-1] == "null": # 如果当前还是"null",则将"null"pop出来
|
||||
visited_articles['article_ids'].pop()
|
||||
session["visited_articles"] = visited_articles
|
||||
visited_articles, today_article, result_of_generate_article = get_today_article(user_freq_record, session.get('visited_articles'))
|
||||
data = {
|
||||
'visited_articles': visited_articles,
|
||||
'today_article': today_article,
|
||||
'result_of_generate_article':result_of_generate_article
|
||||
}
|
||||
return json.dumps(data)
|
||||
|
||||
@userService.route("/<username>/<word>/unfamiliar", methods=['GET', 'POST'])
|
||||
def unfamiliar(username, word):
|
||||
'''
|
||||
|
||||
:param username:
|
||||
:param word:
|
||||
:return:
|
||||
'''
|
||||
user_freq_record = path_prefix + 'static/frequency/' + 'frequency_%s.pickle' % (username)
|
||||
pickle_idea.unfamiliar(user_freq_record, word)
|
||||
session['thisWord'] = word # 1. put a word into session
|
||||
session['time'] = 1
|
||||
return "success"
|
||||
|
||||
|
||||
@userService.route("/<username>/<word>/familiar", methods=['GET', 'POST'])
|
||||
def familiar(username, word):
|
||||
'''
|
||||
|
||||
:param username:
|
||||
:param word:
|
||||
:return:
|
||||
'''
|
||||
user_freq_record = path_prefix + 'static/frequency/' + 'frequency_%s.pickle' % (username)
|
||||
pickle_idea.familiar(user_freq_record, word)
|
||||
session['thisWord'] = word # 1. put a word into session
|
||||
session['time'] = 1
|
||||
return "success"
|
||||
|
||||
|
||||
@userService.route("/<username>/<word>/del", methods=['GET', 'POST'])
|
||||
def deleteword(username, word):
|
||||
'''
|
||||
删除单词
|
||||
:param username: 用户名
|
||||
:param word: 单词
|
||||
:return: 重定位到用户界面
|
||||
'''
|
||||
user_freq_record = path_prefix + 'static/frequency/' + 'frequency_%s.pickle' % (username)
|
||||
pickle_idea2.deleteRecord(user_freq_record, word)
|
||||
# 模板userpage_get.html中删除单词是异步执行,而flash的信息后续是同步执行的,所以注释这段代码;同时如果这里使用flash但不提取信息,则会影响 signup.html的显示。bug复现:删除单词后,点击退出,点击注册,注册页面就会出现提示信息
|
||||
# flash(f'{word} is no longer in your word list.')
|
||||
return "success"
|
||||
|
||||
|
||||
@userService.route("/<username>/userpage", methods=['GET', 'POST'])
|
||||
def userpage(username):
|
||||
'''
|
||||
用户界面
|
||||
:param username: 用户名
|
||||
:return: 返回用户界面
|
||||
'''
|
||||
# 未登录,跳转到未登录界面
|
||||
if not session.get('logged_in'):
|
||||
return render_template('not_login.html')
|
||||
|
||||
# 用户过期
|
||||
user_expiry_date = session.get('expiry_date')
|
||||
if datetime.now().strftime('%Y%m%d') > user_expiry_date:
|
||||
return render_template('expiry.html', expiry_date=user_expiry_date)
|
||||
|
||||
# 获取session里的用户名
|
||||
username = session.get('username')
|
||||
|
||||
user_freq_record = path_prefix + 'static/frequency/' + 'frequency_%s.pickle' % (username)
|
||||
|
||||
if request.method == 'POST': # when we submit a form
|
||||
content = request.form['content']
|
||||
f = WordFreq(content)
|
||||
lst = f.get_freq()
|
||||
return render_template('userpage_post.html',username=username,lst = lst, yml=Yaml.yml)
|
||||
|
||||
elif request.method == 'GET': # when we load a html page
|
||||
try:
|
||||
d = load_freq_history(user_freq_record)
|
||||
lst = pickle_idea2.dict2lst(d)
|
||||
lst2 = []
|
||||
for t in lst:
|
||||
if isinstance(t[1], (list, tuple)): # Check if t[1] is a list or tuple
|
||||
lst2.append((t[0], len(t[1])))
|
||||
elif isinstance(t[1], int): # Handle case where t[1] is an integer
|
||||
lst2.append((t[0], t[1]))
|
||||
else:
|
||||
lst2.append((t[0], 1)) # Default case
|
||||
|
||||
lst3 = sort_in_descending_order(lst2)
|
||||
words = ''
|
||||
for x in lst3:
|
||||
words += x[0] + ' '
|
||||
visited_articles, today_article, result_of_generate_article = get_today_article(user_freq_record, session.get('visited_articles'))
|
||||
session['visited_articles'] = visited_articles
|
||||
# 通过 today_article,加载前端的显示页面
|
||||
return render_template('userpage_get.html',
|
||||
admin_name=ADMIN_NAME,
|
||||
username=username,
|
||||
session=session,
|
||||
# flashed_messages=get_flashed_messages(), 仅有删除单词的时候使用到flash,而删除单词是异步执行,这里的信息提示是同步执行,所以就没有存在的必要了
|
||||
today_article=today_article,
|
||||
result_of_generate_article=result_of_generate_article,
|
||||
d_len=len(d),
|
||||
lst3=lst3,
|
||||
yml=Yaml.yml,
|
||||
words=words)
|
||||
except Exception as e:
|
||||
print(f"Error in userpage: {str(e)}")
|
||||
return render_template('userpage_get.html',
|
||||
username=username,
|
||||
today_article={"user_level": 4.5}, # Default level
|
||||
lst3=[],
|
||||
d_len=0)
|
||||
|
||||
@userService.route("/<username>/mark", methods=['GET', 'POST'])
|
||||
def user_mark_word(username):
|
||||
'''
|
||||
标记单词
|
||||
:param username: 用户名
|
||||
:return: 重定位到用户界面
|
||||
'''
|
||||
username = session[username]
|
||||
user_freq_record = path_prefix + 'static/frequency/' + 'frequency_%s.pickle' % (username)
|
||||
if request.method == 'POST':
|
||||
# 提交标记的单词
|
||||
d = load_freq_history(user_freq_record)
|
||||
lst_history = pickle_idea2.dict2lst(d)
|
||||
lst = []
|
||||
lst2 = []
|
||||
for word in request.form.getlist('marked'):
|
||||
if not word in pickle_idea2.exclusion_lst and len(word) > 2:
|
||||
lst.append((word, [get_time()]))
|
||||
lst2.append(word)
|
||||
d = pickle_idea2.merge_frequency(lst, lst_history)
|
||||
if len(lst_history) > 999:
|
||||
flash('You have way too many words in your difficult-words book. Delete some first.')
|
||||
else:
|
||||
pickle_idea2.save_frequency_to_pickle(d, user_freq_record)
|
||||
flash('Added %s.' % ', '.join(lst2))
|
||||
return redirect(url_for('user_bp.userpage', username=username))
|
||||
else:
|
||||
return 'Under construction'
|
||||
|
||||
def get_time():
    '''
    Return the current local time as a compact timestamp string.

    :return: current time formatted as YYYYMMDDHHMM (minute precision)
    '''
    now = datetime.now()
    return now.strftime('%Y%m%d%H%M')
|
||||
|
|
@ -0,0 +1,201 @@
|
|||
###########################################################################
|
||||
# Copyright 2019 (C) Hui Lan <hui.lan@cantab.net>
|
||||
# Written permission must be obtained from the author for commercial uses.
|
||||
###########################################################################
|
||||
|
||||
import collections
|
||||
import html
|
||||
import string
|
||||
import operator
|
||||
import os, sys # 引入模块sys,因为我要用里面的sys.argv列表中的信息来读取命令行参数。
|
||||
import pickle_idea
|
||||
import pickle
|
||||
from datetime import datetime
|
||||
from pickle_idea2 import load_record, save_frequency_to_pickle, lst2dict, dict2lst
|
||||
|
||||
|
||||
def map_percentages_to_levels(percentages):
    '''
    Compute a normalized difficulty weight for each level of a word book.

    The raw weight of level k is (10 - k) * percentages[k], so easier (lower)
    levels receive larger weights; the weights are then normalized so they
    sum to 1.

    :param percentages: dict mapping difficulty level (3..8) to the fraction
                        of words at that level
    :return: dict mapping the same levels to their normalized weights
    '''
    sorted_keys = sorted(percentages.keys())

    # Raw weight per level plus the running total.
    # (The original accumulated into a variable named `sum`, shadowing the
    # builtin; renamed to `total`.)
    total = 0
    levels_proportions = {}
    for k in sorted_keys:
        levels_proportions[k] = (10 - k) * percentages[k]
        total += levels_proportions[k]

    # Normalize so the weights sum to 1. If every raw weight is zero
    # (e.g. all percentages are 0) there is nothing to normalize — return
    # the zero weights instead of dividing by zero.
    if total != 0:
        for k in sorted_keys:
            levels_proportions[k] /= total

    return levels_proportions
|
||||
|
||||
|
||||
def freq(fruit):
    '''
    Count word frequencies in a string.

    The string is lower-cased first so that e.g. 'Apple' and 'apple' are
    counted as the same word.

    :param fruit: input string
    :return: list of (word, count) tuples, most frequent first,
             e.g. [('apple', 2), ('banana', 1)]
    '''
    tokens = fruit.lower().split()
    counter = collections.Counter(tokens)
    return counter.most_common()
|
||||
|
||||
|
||||
def youdao_link(s):
    '''
    Return the Youdao online-dictionary URL for looking up the word s.

    :param s: the word to look up
    :return: URL string pointing at the Youdao entry for s
    '''
    prefix = 'http://youdao.com/w/eng/'
    suffix = '/#keyfrom=dict2.index'
    return prefix + s + suffix
|
||||
|
||||
|
||||
def file2str(fname):
    '''
    Read a whole file into a string.

    Uses a context manager so the file handle is closed even if reading
    raises (the original leaked the handle on error).

    :param fname: path of the file to read
    :return: file contents as a single string
    '''
    with open(fname) as f:
        return f.read()
|
||||
|
||||
|
||||
def remove_punctuation(s):
    '''
    Strip punctuation (ASCII and full-width CJK) from s, keeping an
    apostrophe only when it sits between two English letters (e.g. "it's").

    Each punctuation character is replaced by a space — not deleted — so
    'apple,apple' becomes 'apple apple' rather than 'appleapple'.

    :param s: raw text, possibly containing HTML entities
    :return: cleaned text
    '''
    special_characters = '\_©~<=>+/[]*&$%^@.,?!:;#()"“”—‘’{}|,。?!¥……()、《》【】:;·'
    s = html.unescape(s)  # turn HTML entities (e.g. &lt;) back into characters
    # One C-level translate pass instead of a per-character replace loop.
    s = s.translate(str.maketrans(special_characters, ' ' * len(special_characters)))
    s = s.replace('--', ' ')
    s = s.strip()  # drop leading/trailing whitespace

    if '\'' not in s:
        return s

    # Keep an apostrophe only when both neighbours are ASCII letters.
    n = len(s)
    kept = []
    for i, ch in enumerate(s):
        if ch == '\'':
            in_bounds = i - 1 >= 0 and i + 1 < n
            if in_bounds and s[i - 1] in string.ascii_letters and s[i + 1] in string.ascii_letters:
                kept.append(ch)
        else:
            kept.append(ch)
    return ''.join(kept)
|
||||
|
||||
|
||||
def sort_in_descending_order(lst):
    '''
    Sort (word, count) tuples by count, highest first; ties are broken by
    the word itself, also in reverse order.

    :param lst: list of (word, count) tuples
    :return: a new sorted list (the input is not modified)
    '''
    def rank(pair):
        return (pair[1], pair[0])

    return sorted(lst, key=rank, reverse=True)
|
||||
|
||||
|
||||
def sort_in_ascending_order(lst):
    '''
    Sort (word, count) tuples by count, lowest first; ties are broken by
    the word itself.

    :param lst: list of (word, count) tuples
    :return: a new sorted list (the input is not modified)
    '''
    return sorted(lst, key=lambda pair: (pair[1], pair[0]))
|
||||
|
||||
|
||||
def make_html_page(lst, fname):
    '''
    Save the word-frequency list lst to fname as a simple HTML page.

    Each entry becomes a numbered paragraph whose word links to its Youdao
    dictionary page, followed by the frequency in parentheses.

    :param lst: list of (word, count) tuples
    :param fname: output file path
    '''
    parts = []
    for count, (word, frequency) in enumerate(lst, start=1):
        # <a href="">word</a>
        parts.append('<p>%d <a href="%s">%s</a> (%d)</p>' % (count, youdao_link(word), word, frequency))
    # Join once (avoids quadratic string +=) and write with an explicit
    # encoding so non-ASCII words do not break on platforms whose default
    # encoding is not UTF-8. The context manager closes the file on error.
    with open(fname, 'w', encoding='utf-8') as f:
        f.write(''.join(parts))
|
||||
|
||||
|
||||
class WordFreq:
    '''
    Word-frequency processor that accumulates all-time counts in a pickle.

    Current-run frequencies are converted to lists of timestamps (one per
    occurrence) and merged into the cumulative record stored in
    ``self.pickle_file``.
    '''

    def __init__(self):
        # Pickle file that stores the cumulative word -> [timestamps] record.
        self.pickle_file = 'frequency.p'  # Add this to store cumulative data

    def process_file(self, filename):
        '''
        Process one input file and merge its word counts into the
        cumulative pickle record.

        NOTE(review): the word-processing code that populates ``self.freq``
        is elided here ("... existing ... code ..."); this method assumes
        ``self.freq`` maps word -> occurrence count — confirm against the
        full class definition.
        '''
        # ... existing word processing code ...

        # Convert current word frequencies to timestamp format:
        # one identical timestamp string per occurrence of the word.
        current_words = {}
        timestamp = datetime.now().strftime('%Y%m%d%H%M')
        for word, freq in self.freq.items():
            current_words[word] = [timestamp] * freq  # Create list of timestamps for each occurrence

        # Load existing cumulative data; start fresh if the pickle is
        # missing or empty.
        try:
            cumulative_data = load_record(self.pickle_file)
        except (FileNotFoundError, EOFError):
            cumulative_data = {}

        # Merge current words with historical data (append timestamps to
        # known words, insert unknown words).
        for word, timestamps in current_words.items():
            if word in cumulative_data:
                cumulative_data[word].extend(timestamps)
            else:
                cumulative_data[word] = timestamps

        # Save updated data back to the cumulative pickle.
        save_frequency_to_pickle(cumulative_data, self.pickle_file)

    def show_results(self):
        '''
        Print the top cumulative word frequencies to stdout.

        NOTE(review): the per-run result display is elided here
        ("... existing code ..."); only the cumulative display is visible.
        '''
        # ... existing code ...

        # Add cumulative frequency display
        print("\nCumulative Frequencies (all-time):")
        try:
            cumulative_data = load_record(self.pickle_file)
            # Sort by cumulative frequency (length of timestamp list)
            sorted_words = sorted(cumulative_data.items(),
                                  key=lambda x: len(x[1]),
                                  reverse=True)

            for word, timestamps in sorted_words[:20]:  # Show top 20
                print(f"{word}: {len(timestamps)} times")
        except (FileNotFoundError, EOFError):
            print("No cumulative data available yet")
|
||||
|
||||
|
||||
## main (program entry point)
if __name__ == '__main__':
    num = len(sys.argv)

    if num == 1:  # no argument: read the text from standard input
        s = input()
    elif num == 2:  # one argument: read the text from the named file
        fname = sys.argv[1]
        s = file2str(fname)
    else:
        print('I can accept at most 2 arguments.')
        sys.exit()  # stop here; the code below is not executed

    s = remove_punctuation(s)  # s is the cleaned input text
    L = freq(s)
    # Print each word with its frequency and a Youdao dictionary link.
    for x in sort_in_descending_order(L):
        print('%s\t%d\t%s' % (x[0], x[1], youdao_link(x[0])))

    # Save the frequency results to result.html.
    make_html_page(sort_in_descending_order(L), 'result.html')

    print('\nHistory:\n')
    # Load the cumulative history pickle if it exists.
    if os.path.exists('frequency.p'):
        d = pickle_idea.load_record('frequency.p')
    else:
        d = {}

    print(sort_in_descending_order(pickle_idea.dict2lst(d)))

    # Merge this run's frequencies into the history and save it back.
    lst_history = pickle_idea.dict2lst(d)
    d = pickle_idea.merge_frequency(L, lst_history)
    pickle_idea.save_frequency_to_pickle(d, 'frequency.p')
|
||||
|
||||
|
||||
|
Loading…
Reference in New Issue