# NOTE(review): this span was a newline-collapsed `git diff` that added three
# new files. The payload is restored below as plain Python, one section per
# file; each section is meant to live in its own module.

# =====================================================================
# File: Article.py   (diff --git a/Article.py b/Article.py, new file,
#                     index 0000000..8d74a7c)
# =====================================================================
from WordFreq import WordFreq
from wordfreqCMD import youdao_link, sort_in_descending_order
import pickle_idea, pickle_idea2
import os
import random, glob
import hashlib
from datetime import datetime
from flask import Flask, request, redirect, render_template, url_for, session, abort, flash, get_flashed_messages
from difficulty import get_difficulty_level_for_user, text_difficulty_level, user_difficulty_level
from model.article import get_all_articles, get_article_by_id, get_number_of_articles
import logging
import re

path_prefix = './'
db_path_prefix = './db/'  # comment this line in deployment
# Was a hard-coded absolute path on one developer's Windows machine
# (C:\Users\ANNA\Desktop\ooad\app\db\oxford_words.txt); resolve it relative to
# the database directory so the module works on any checkout.
oxford_words_path = os.path.join(db_path_prefix, 'oxford_words.txt')


def count_oxford_words(text, oxford_words):
    '''
    Count how many word tokens of text appear in oxford_words.

    :param text: the article text
    :param oxford_words: container supporting `in` (e.g. the dict returned by load_oxford_words)
    :return: tuple (number of tokens found in oxford_words, total number of tokens)
    '''
    words = re.findall(r'\b\w+\b', text.lower())
    oxford_word_count = sum(1 for word in words if word in oxford_words)
    return oxford_word_count, len(words)


def calculate_ratio(oxford_word_count, total_words):
    ''' Return oxford_word_count / total_words, or 0 when there are no words. '''
    if total_words == 0:
        return 0
    return oxford_word_count / total_words


def load_oxford_words(file_path):
    '''
    Read a whitespace-separated word list with lines of the form "word pos level".

    Blank lines and lines with fewer than three fields are skipped instead of
    raising IndexError.

    :param file_path: path of the UTF-8 encoded word list
    :return: dict mapping word -> {'pos': ..., 'level': ...}
    '''
    oxford_words = {}
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            parts = line.split()
            if len(parts) < 3:
                continue
            word, pos, level = parts[0], parts[1], parts[2]
            oxford_words[word] = {'pos': pos, 'level': level}
    return oxford_words


def total_number_of_essays():
    ''' Return the number of articles reported by the article model. '''
    return get_number_of_articles()


def get_article_title(s):
    ''' Return the first line of s, which holds the article title. '''
    return s.split('\n')[0]


def get_article_body(s):
    ''' Return everything after the first line of s. '''
    return '\n'.join(s.split('\n')[1:])


def get_today_article(user_word_list, visited_articles):
    '''
    Pick the article to show, either a new one matching the reader's level or
    an already visited one.

    :param user_word_list: path of the user's pickled word-frequency record
    :param visited_articles: dict {"index": int, "article_ids": list} or None
    :return: tuple (visited_articles, today_article or None, status), where
             status is "found", "not found" or "had read all articles"
    '''
    if visited_articles is None:
        visited_articles = {
            "index": 0,         # index into article_ids
            "article_ids": []   # ids of the articles shown so far, newest last
        }
    wants_new_article = visited_articles["index"] > len(visited_articles["article_ids"]) - 1
    if wants_new_article:
        # A new article is requested, so consider every article.
        result = get_all_articles()
    else:
        # An already visited article is requested, so look it up by id.
        if visited_articles["article_ids"][visited_articles["index"]] == 'null':
            # A page refresh can land on the 'null' placeholder; step back to
            # the previous article in that case.
            visited_articles["index"] -= 1
            visited_articles["article_ids"].pop()
        article_id = visited_articles["article_ids"][visited_articles["index"]]
        result = get_article_by_id(article_id)
    random.shuffle(result)

    # Choose article according to reader's level
    logging.debug('* get_today_article(): start d1 = ... ')
    d1 = load_freq_history(user_word_list)
    d2 = load_freq_history(path_prefix + 'static/words_and_tests.p')
    logging.debug(' ... get_today_article(): get_difficulty_level_for_user() start')
    d3 = get_difficulty_level_for_user(d1, d2)
    logging.debug(' ... get_today_article(): done')

    d = None
    result_of_generate_article = "not found"

    # NOTE(review): same file as d1; loaded again in case
    # get_difficulty_level_for_user() mutates its argument. Confirm and dedupe.
    d_user = load_freq_history(user_word_list)
    logging.debug('* get_today_article(): user_difficulty_level() start')
    user_level = user_difficulty_level(d_user, d3)  # more consideration as user's behaviour is dynamic. Time factor should be considered.
    logging.debug('* get_today_article(): done')
    text_level = 0
    if wants_new_article:
        amount_of_visited_articles = len(visited_articles["article_ids"])
        amount_of_existing_articles = len(result)
        if amount_of_visited_articles == amount_of_existing_articles:
            # Every existing article has already been read.
            result_of_generate_article = "had read all articles"
        else:
            for k in range(3):  # try at most 3 times; the random factor differs each time
                for reading in result:
                    text_level = text_difficulty_level(reading['text'], d3)
                    # a number drawn from a Gaussian distribution with a mean of 0.8 and a standard deviation of 0.1
                    factor = random.gauss(0.8, 0.1)
                    if reading['article_id'] not in visited_articles["article_ids"] and within_range(text_level, user_level, (8.0 - user_level) * factor):
                        # Unseen article whose level is within the accepted range.
                        d = reading
                        visited_articles["article_ids"].append(d['article_id'])
                        result_of_generate_article = "found"
                        break
                if result_of_generate_article == "found":  # leave the outer loop as well
                    break
        if result_of_generate_article != "found":
            # Everything was read, or no suitable article was found in 3 tries:
            # record the 'null' placeholder.
            visited_articles["article_ids"].append('null')
    elif result:
        # Show an article that was read before.
        d = random.choice(result)
        text_level = text_difficulty_level(d['text'], d3)
        result_of_generate_article = "found"
    # else: the id lookup returned nothing; keep "not found" instead of
    # letting random.choice raise IndexError on an empty sequence.

    today_article = None
    if d:
        oxford_words = load_oxford_words(oxford_words_path)
        oxford_word_count, total_words = count_oxford_words(d['text'], oxford_words)
        ratio = calculate_ratio(oxford_word_count, total_words)
        today_article = {
            "user_level": '%4.1f' % user_level,
            "text_level": '%4.1f' % text_level,
            "date": d['date'],
            "article_title": get_article_title(d['text']),
            "article_body": get_article_body(d['text']),
            "source": d["source"],
            "question": get_question_part(d['question']),
            "answer": get_answer_part(d['question']),
            "ratio": ratio
        }

    return visited_articles, today_article, result_of_generate_article


def load_freq_history(path):
    ''' Return the pickled record stored at path, or an empty dict if the file does not exist. '''
    d = {}
    if os.path.exists(path):
        d = pickle_idea.load_record(path)
    return d


def within_range(x, y, r):
    ''' Return True when x is strictly greater than y and at most r above it. '''
    return x > y and abs(x - y) <= r


def get_question_part(s):
    '''
    Return the lines from the 'QUESTION' marker (inclusive) up to the
    'ANSWER' marker (exclusive), each line stripped.
    '''
    s = s.strip()
    result = []
    flag = 0
    for line in s.split('\n'):
        line = line.strip()
        if line == 'QUESTION':
            result.append(line)
            flag = 1
        elif line == 'ANSWER':
            flag = 0
        elif flag == 1:
            result.append(line)
    return '\n'.join(result)


def get_answer_part(s):
    ''' Return the lines that follow the 'ANSWER' marker, each line stripped. '''
    s = s.strip()
    result = []
    flag = 0
    for line in s.split('\n'):
        line = line.strip()
        if line == 'ANSWER':
            flag = 1
        elif flag == 1:
            result.append(line)
    return '\n'.join(result)


# =====================================================================
# File: Login.py   (diff --git a/Login.py b/Login.py, new file,
#                   index 0000000..b41e604)
# =====================================================================
import hashlib
import string
from datetime import datetime, timedelta
import unicodedata


def md5(s):
    '''
    MD5 digest.
    :param s: the string to hash
    :return: hexadecimal MD5 digest of the UTF-8 encoded string
    '''
    h = hashlib.md5(s.encode(encoding='utf-8'))
    return h.hexdigest()


path_prefix = '/var/www/wordfreq/wordfreq/'
path_prefix = './'  # comment this line in deployment


def verify_user(username, password):
    ''' Return True when the user exists and md5(username + password) matches the stored password. '''
    from model.user import get_user_by_username
    user = get_user_by_username(username)
    encoded_password = md5(username + password)
    return user is not None and user.password == encoded_password


def add_user(username, password):
    ''' Insert a new user whose account expires 30 days from today. '''
    from model.user import insert_user
    start_date = datetime.now().strftime('%Y%m%d')
    expiry_date = (datetime.now() + timedelta(days=30)).strftime('%Y%m%d')
    password = md5(username + password)
    insert_user(username=username, password=password, start_date=start_date, expiry_date=expiry_date)


def check_username_availability(username):
    ''' Return True when no user with this name exists. '''
    from model.user import get_user_by_username
    existed_user = get_user_by_username(username)
    return existed_user is None


def change_password(username, old_password, new_password):
    '''
    Change a user's password.
    :param username: user name
    :param old_password: the current password
    :param new_password: the new password
    :return: dict with 'username' and either an 'error' or a 'success' message
    '''
    if not verify_user(username, old_password):  # the old password is wrong
        return {'error':'Old password is wrong.', 'username':username}
    if new_password == old_password:  # the new password equals the old one
        return {'error':'New password cannot be the same as the old password.', 'username':username}
    from model.user import update_password_by_username
    # NOTE(review): the plain new password is passed on; presumably
    # update_password_by_username() hashes it as md5(username + password) so
    # that verify_user() keeps working. Confirm against model.user.
    update_password_by_username(username, new_password)
    return {'success':'Password changed', 'username':username}


def get_expiry_date(username):
    ''' Return the user's expiry date, or '20191024' when the user does not exist. '''
    from model.user import get_user_by_username
    user = get_user_by_username(username)
    if user is None:
        return '20191024'
    else:
        return user.expiry_date


def _contains_cjk(text):
    '''
    Return True when any character of text is a CJK unified ideograph.

    unicodedata.name() is given a default because unnamed characters (control
    characters such as tab or newline) would otherwise raise ValueError.
    '''
    return any(unicodedata.name(char, '').startswith('CJK UNIFIED IDEOGRAPH') for char in text)


class UserName:
    ''' Validation rules for a user name. '''

    def __init__(self, username):
        self.username = username

    def contains_chinese(self):
        ''' Return True when the user name contains a CJK unified ideograph. '''
        return _contains_cjk(self.username)

    def validate(self):
        ''' Return 'OK' when the user name is acceptable, otherwise a message describing the first problem found. '''
        if len(self.username) > 20:
            return f'{self.username} is too long.  The user name cannot exceed 20 characters.'
        if self.username.startswith('.'):  # a user name must not start with a dot
            return 'Period (.) is not allowed as the first letter in the user name.'
        if ' ' in self.username:  # a user name must not include a whitespace
            return 'Whitespace is not allowed in the user name.'
        for c in self.username:  # a user name must not include special characters, except non-leading periods or underscores
            if c in string.punctuation and c != '.' and c != '_':
                return f'{c} is not allowed in the user name.'
        if self.username in ['signup', 'login', 'logout', 'reset', 'mark', 'back', 'unfamiliar', 'familiar', 'del',
                             'admin']:
            return 'You used a restricted word as your user name.  Please come up with a better one.'
        if self.contains_chinese():
            return 'Chinese characters are not allowed in the user name.'
        return 'OK'


class Password:
    ''' Validation rules for a password. '''

    def __init__(self, password):
        self.password = password

    def contains_chinese(self):
        ''' Return True when the password contains a CJK unified ideograph. '''
        return _contains_cjk(self.password)

    def validate(self):
        ''' Return 'OK' when the password is acceptable, otherwise a message describing the first problem found. '''
        if len(self.password) < 4:
            return 'Password must be at least 4 characters long.'
        if ' ' in self.password:
            return 'Password cannot contain spaces.'
        if self.contains_chinese():
            return 'Chinese characters are not allowed in the password.'
        return 'OK'


class WarningMessage:
    ''' Turns a user name or password into its validation message via str(). '''

    def __init__(self, s, type='username'):
        self.s = s
        self.type = type

    def __str__(self):
        if self.type == 'username':
            return UserName(self.s).validate()
        if self.type == 'password':
            return Password(self.s).validate()
        # Falling through returned None, which makes str() fail with an
        # unhelpful TypeError; report the real problem instead.
        raise ValueError(f'Unsupported WarningMessage type: {self.type!r}')


# =====================================================================
# File: create_pickle.py   (diff --git a/create_pickle.py b/create_pickle.py,
#                           new file, index 0000000..3e1fcda)
# =====================================================================
import pickle
import os

# Sample vocabulary data - simulating a user's word history
# Format: word -> list of dates when the word was studied
test_data = {
    "hello": ["20240101"],
    "world": ["20240101", "20240102"],
    "computer": ["20240101", "20240103"],
    "programming": ["20240102"],
    "python": ["20240102", "20240103"],
    "algorithm": ["20240103"],
    "database": ["20240103"],
    "interface": ["20240104"],
    "vocabulary": ["20240104"],
    "sophisticated": ["20240104"]
}

# Was r'C:\Users\ANNA\Desktop\app\static\frequency', which only exists on one
# machine; default to the project-relative frequency directory instead.
DEFAULT_FREQUENCY_DIR = os.path.join('.', 'static', 'frequency')


def create_test_pickle(base_path=DEFAULT_FREQUENCY_DIR, file_name='mr1an85.pickle'):
    '''
    Write test_data to base_path/file_name and read it back for verification.

    :param base_path: directory for the pickle file; created if missing
    :param file_name: name of the pickle file
    :return: path of the file that was written
    '''
    # Ensure frequency directory exists
    os.makedirs(base_path, exist_ok=True)

    # Save the test data
    file_path = os.path.join(base_path, file_name)
    with open(file_path, 'wb') as f:
        pickle.dump(test_data, f)

    print(f"Test file created at: {file_path}")

    # Verify the file was created and can be read
    with open(file_path, 'rb') as f:
        loaded_data = pickle.load(f)
    print("\nVerifying data:")
    print(loaded_data)
    return file_path


if __name__ == '__main__':
    # Guarded so that importing this module never touches the file system.
    create_test_pickle()
# NOTE(review): this span was a newline-collapsed `git diff` that added five
# new files. The payload is restored below as plain Python, one section per
# file; each section is meant to live in its own module.

# =====================================================================
# File: pickle_idea.py   (diff --git a/pickle_idea.py b/pickle_idea.py,
#                         new file, index 0000000..b0de662)
# =====================================================================
###########################################################################
# Copyright 2019 (C) Hui Lan
# Written permission must be obtained from the author for commercial uses.
###########################################################################

# Purpose: dictionary & pickle as a simple means of database.
# Task: incorporate the functions into wordfreqCMD.py such that it will also show cumulative frequency.
# SECURITY NOTE: pickle.load() must only be used on files this application wrote itself.

import os
import pickle
from datetime import datetime


def lst2dict(lst, d):
    '''
    Store the information in list lst to dictionary d.
    Handles both frequency counts and date lists.

    :param lst: iterable of (word, frequency) or (word, list of dates) pairs
    :param d: dictionary word -> frequency, updated in place
    '''
    for x in lst:
        word = x[0]
        if isinstance(x[1], list):  # if it's a list of dates
            freq = len(x[1])        # convert to frequency
        else:
            freq = x[1]             # already a frequency

        if word not in d:
            d[word] = freq
        else:
            d[word] += freq


def dict2lst(d):
    ''' Convert dictionary to list of (word, frequency) pairs '''
    if len(d) > 0:
        keys = list(d.keys())
        # The first value decides how the whole dictionary is interpreted.
        if isinstance(d[keys[0]], list):
            return [(k, len(v)) for k, v in d.items()]
        return list(d.items())
    return []


def merge_frequency(lst1, lst2):
    ''' Return a new dictionary with the summed frequencies of lst1 and lst2. '''
    d = {}
    lst2dict(lst1, d)
    lst2dict(lst2, d)
    return d


def load_record(pickle_fname):
    ''' Return the object pickled in pickle_fname. '''
    with open(pickle_fname, 'rb') as f:
        return pickle.load(f)


def save_frequency_to_pickle(d, pickle_fname):
    '''
    Save d to pickle_fname as word -> frequency, dropping numeric keys and
    keys shorter than two characters. Date lists are stored as their length.
    '''
    #exclusion_lst = ['one', 'no', 'has', 'had', 'do', 'that', 'have', 'by', 'not', 'but', 'we', 'this', 'my', 'him', 'so', 'or', 'as', 'are', 'it', 'from', 'with', 'be', 'can', 'for', 'an', 'if', 'who', 'whom', 'whose', 'which', 'the', 'to', 'a', 'of', 'and', 'you', 'i', 'he', 'she', 'they', 'me', 'was', 'were', 'is', 'in', 'at', 'on', 'their', 'his', 'her', 's', 'said', 'all', 'did', 'been', 'w']
    exclusion_lst = []
    d2 = {}
    for k in d:
        if k not in exclusion_lst and not k.isnumeric() and len(k) > 1:
            if isinstance(d[k], list):
                d2[k] = len(d[k])  # store frequency count
            else:
                d2[k] = d[k]
    # Open only after d2 is complete, so a failure above cannot leave a
    # truncated file behind.
    with open(pickle_fname, 'wb') as f:
        pickle.dump(d2, f)


def unfamiliar(path, word):
    '''
    Record one more occurrence of word in the pickled record at path.

    Works with both value formats found in this project: lists of timestamps
    and integer counts (save_frequency_to_pickle stores counts). A word that
    is not yet in the record is added instead of raising KeyError.
    Returns None; does nothing when path does not exist.
    '''
    if not os.path.exists(path):
        return None
    with open(path, "rb") as f:
        dic = pickle.load(f)
    stamp = datetime.now().strftime('%Y%m%d%H%M')
    current = dic.get(word)
    if isinstance(current, list):
        current.append(stamp)
    elif isinstance(current, int):
        dic[word] = current + 1
    else:
        # New word: follow the format the record already uses.
        uses_counts = any(isinstance(v, int) for v in dic.values())
        dic[word] = 1 if uses_counts else [stamp]
    with open(path, "wb") as fp:
        pickle.dump(dic, fp)


def familiar(path, word):
    '''
    Remove one occurrence of word from the pickled record at path; the word
    is deleted when only one occurrence is left.

    Returns None; does nothing when path or word does not exist (consistent
    with unfamiliar). File handles are closed via context managers.
    '''
    if not os.path.exists(path):
        return None
    with open(path, "rb") as f:
        dic = pickle.load(f)
    current = dic.get(word)
    if current is None:
        return None
    if isinstance(current, list) and len(current) > 1:
        del current[0]
    elif isinstance(current, int) and current > 1:
        dic[word] = current - 1
    else:
        dic.pop(word)
    with open(path, "wb") as fp:
        pickle.dump(dic, fp)


if __name__ == '__main__':

    lst1 = [('apple',2), ('banana',1)]
    d = {}
    lst2dict(lst1, d)  # d will change
    save_frequency_to_pickle(d, 'frequency.p')  # frequency.p is our database

    lst2 = [('banana',2), ('orange', 4)]
    d = load_record('frequency.p')
    lst1 = dict2lst(d)
    d = merge_frequency(lst2, lst1)
    print(d)


# =====================================================================
# File: pickle_idea2.py   (diff --git a/pickle_idea2.py b/pickle_idea2.py,
#                          new file, index 0000000..dd41a20)
# =====================================================================
###########################################################################
# Copyright 2019 (C) Hui Lan
# Written permission must be obtained from the author for commercial uses.
###########################################################################


# Purpose: dictionary & pickle as a simple means of database.
# Task: incorporate the functions into wordfreqCMD.py such that it will also show cumulative frequency.
# Note: unlike pickle_idea.py, the second item of an input pair may be a list of dates rather than a frequency.

import pickle
from datetime import datetime


def lst2dict(lst, d):
    '''
    Store the information in list lst to dictionary d.
    Now stores frequency count instead of dates list.
    '''
    for x in lst:
        word = x[0]
        if isinstance(x[1], list):  # if it's a list of dates
            count = len(x[1])       # convert to frequency
        else:
            count = x[1]            # already a frequency

        if word not in d:
            d[word] = count
        else:
            d[word] += count


def deleteRecord(path, word):
    ''' Remove word from the pickled record at path; prints "sorry" when the word is absent. '''
    with open(path, 'rb') as f:
        db = pickle.load(f)
    try:
        db.pop(word)
    except KeyError:
        print("sorry")
    with open(path, 'wb') as ff:
        pickle.dump(db, ff)


def dict2lst(d):
    '''
    Convert dictionary to list of (word, frequency) pairs.
    The first value decides the interpretation; returns [] for an empty
    dictionary or when the first value is neither an int nor a list.
    '''
    if len(d) > 0:
        keys = list(d.keys())
        if isinstance(d[keys[0]], int):
            return list(d.items())  # return (word, frequency) pairs directly
        elif isinstance(d[keys[0]], list):
            return [(k, len(v)) for k, v in d.items()]  # convert date lists to counts

    return []


def merge_frequency(lst1, lst2):
    ''' Return a new dictionary with the summed frequencies of lst1 and lst2. '''
    d = {}
    lst2dict(lst1, d)
    lst2dict(lst2, d)
    return d


def load_record(pickle_fname):
    ''' Return the object pickled in pickle_fname. '''
    with open(pickle_fname, 'rb') as f:
        return pickle.load(f)


def save_frequency_to_pickle(d, pickle_fname):
    '''
    Save d to pickle_fname as word -> frequency, dropping words in
    exclusion_lst, numeric keys and keys shorter than two characters.
    '''
    d2 = {}
    for k in d:
        if k not in exclusion_lst and not k.isnumeric() and not len(k) < 2:
            if isinstance(d[k], list):
                d2[k] = len(d[k])  # store frequency count instead of dates list
            else:
                d2[k] = d[k]
    # Open only after d2 is complete, so a failure above cannot leave a
    # truncated file behind.
    with open(pickle_fname, 'wb') as f:
        pickle.dump(d2, f)


exclusion_lst = ['one', 'no', 'has', 'had', 'do', 'that', 'have', 'by', 'not', 'but', 'we', 'this', 'my', 'him', 'so', 'or', 'as', 'are', 'it', 'from', 'with', 'be', 'can', 'for', 'an', 'if', 'who', 'whom', 'whose', 'which', 'the', 'to', 'a', 'of', 'and', 'you', 'i', 'he', 'she', 'they', 'me', 'was', 'were', 'is', 'in', 'at', 'on', 'their', 'his', 'her', 's', 'said', 'all', 'did', 'been', 'w']

if __name__ == '__main__':
    # Test 1: Convert dates to frequencies
    lst1 = [('apple',['201910251437', '201910251438']), ('banana',['201910251439'])]
    d = {}
    lst2dict(lst1, d)
    print("Test 1 - Convert dates to frequencies:")
    print(d)  # Should show: {'apple': 2, 'banana': 1}

    # Test 2: Save and load frequencies
    save_frequency_to_pickle(d, 'frequency.p')
    loaded_d = load_record('frequency.p')
    print("\nTest 2 - Load saved frequencies:")
    print(loaded_d)  # Should match the previous output

    # Test 3: Merge frequencies
    lst2 = [('banana',['201910251439']), ('orange', ['201910251440', '201910251439'])]
    lst1 = dict2lst(loaded_d)
    merged_d = merge_frequency(lst2, lst1)
    print("\nTest 3 - Merge frequencies:")
    print(merged_d)  # Should show banana with increased frequency


# =====================================================================
# File: test_estimator.py   (diff --git a/test_estimator.py b/test_estimator.py,
#                            new file, index 0000000..23fa4ee)
# =====================================================================
import pytest
from difficulty import VocabularyLevelEstimator


@pytest.fixture
def estimator():
    """Fixture to create a VocabularyLevelEstimator instance"""
    # NOTE(review): placeholder path; point this at a real word-data pickle
    # before running the suite.
    return VocabularyLevelEstimator('path/to/your/actual/word_data.p')


class TestVocabularyLevelEstimator:

    # Normal input tests
    def test_normal_text_estimation(self, estimator):
        """Test text level estimation with normal English text"""
        text = """The quick brown fox jumps over the lazy dog.
                 This text contains common English words that
                 should be processed without any issues."""
        level = estimator.estimate_text_level(text)
        assert isinstance(level, float)
        assert 3 <= level <= 8  # Difficulty levels should be between 3-8

    def test_normal_user_level(self, estimator):
        """Test user level estimation with normal word history"""
        word_history = {
            'algorithm': ['20240101'],
            'computer': ['20240101', '20240102'],
            'programming': ['20240101']
        }
        level = estimator.estimate_user_level(word_history)
        assert isinstance(level, float)
        assert 3 <= level <= 8

    def test_normal_word_level(self, estimator):
        """Test word level estimation with common words"""
        assert estimator.get_word_level('computer') >= 3
        assert estimator.get_word_level('algorithm') >= 3

    # Boundary input tests
    def test_empty_text(self, estimator):
        """Test behavior with empty text"""
        assert estimator.estimate_text_level('') == 3  # Default level

    def test_single_word_text(self, estimator):
        """Test behavior with single-word text"""
        assert isinstance(estimator.estimate_text_level('Hello'), float)

    def test_empty_user_history(self, estimator):
        """Test behavior with empty user history"""
        assert estimator.estimate_user_level({}) == 3  # Default level

    def test_maximum_word_length(self, estimator):
        """Test behavior with extremely long word"""
        long_word = 'a' * 100
        assert estimator.get_word_level(long_word) == 3  # Default level

    # Abnormal input tests
    def test_non_english_text(self, estimator):
        """Test behavior with non-English text"""
        chinese_text = "这是中文文本"
        assert estimator.estimate_text_level(chinese_text) == 3  # Default level

    def test_special_characters(self, estimator):
        """Test behavior with special characters"""
        special_chars = "@#$%^&*()"
        assert estimator.estimate_text_level(special_chars) == 3  # Default level

    def test_invalid_word_history(self, estimator):
        """Test behavior with invalid word history format"""
        invalid_history = {'word': 'not_a_list'}
        with pytest.raises(ValueError):
            estimator.estimate_user_level(invalid_history)

    def test_none_input(self, estimator):
        """Test behavior with None input"""
        with pytest.raises(TypeError):
            estimator.estimate_text_level(None)

        with pytest.raises(TypeError):
            estimator.estimate_user_level(None)

        with pytest.raises(TypeError):
            estimator.get_word_level(None)

    # Edge cases
    def test_mixed_case_words(self, estimator):
        """Test behavior with mixed case words"""
        assert estimator.get_word_level('Computer') == estimator.get_word_level('computer')

    def test_whitespace_handling(self, estimator):
        """Test behavior with various whitespace patterns"""
        text_with_spaces = "  Multiple    Spaces    Between    Words  "
        level = estimator.estimate_text_level(text_with_spaces)
        assert isinstance(level, float)

    def test_repeated_words(self, estimator):
        """Test behavior with repeated words"""
        text = "word word word word word"
        level = estimator.estimate_text_level(text)
        assert isinstance(level, float)

    def test_numeric_input(self, estimator):
        """Test behavior with numeric input"""
        assert estimator.estimate_text_level("123 456 789") == 3  # Default level

    def test_mixed_content(self, estimator):
        """Test behavior with mixed content (numbers, words, special chars)"""
        mixed_text = "Hello123 @World! 456"
        level = estimator.estimate_text_level(mixed_text)
        assert isinstance(level, float)


# =====================================================================
# File: user_service.py   (diff --git a/user_service.py b/user_service.py,
#                          new file, index 0000000..cbc7891)
# =====================================================================
from datetime import datetime
from admin_service import ADMIN_NAME
from flask import *

# from app import Yaml
# from app.Article import get_today_article, load_freq_history
# from app.WordFreq import WordFreq
# from app.wordfreqCMD import sort_in_descending_order

import Yaml
from Article import get_today_article, load_freq_history
from WordFreq import WordFreq
from wordfreqCMD import sort_in_descending_order

import pickle_idea
import pickle_idea2

import logging
logging.basicConfig(filename='log.txt', format='%(asctime)s %(message)s', level=logging.DEBUG)

# Initialise the blueprint
userService = Blueprint("user_bp", __name__)

path_prefix = '/var/www/wordfreq/wordfreq/'
path_prefix = './'  # comment this line in deployment

# NOTE(review): the URL variables (<username>, <word>) in the route rules
# below were stripped from the source text (e.g. "///unfamiliar"); they are
# reconstructed from the view-function parameters. Verify against the real file.


def _freq_record_path(username):
    ''' Return the path of the pickled word-frequency record of username. '''
    return path_prefix + 'static/frequency/' + 'frequency_%s.pickle' % (username)


@userService.route("/get_next_article/<username>", methods=['GET', 'POST'])
def get_next_article(username):
    ''' Advance to the next article and return it as JSON; POST is not implemented. '''
    user_freq_record = _freq_record_path(username)
    session['old_articleID'] = session.get('articleID')
    if request.method != 'GET':
        return 'Under construction'

    visited_articles = session.get("visited_articles")
    if visited_articles is not None:  # None is handled by get_today_article()
        article_ids = visited_articles['article_ids']
        if not article_ids:
            pass  # nothing shown yet: index already points at the first slot
        elif article_ids[-1] == "null":
            # Still on the "null" placeholder: drop it, no need to move the index.
            article_ids.pop()
        else:
            visited_articles["index"] += 1
        session["visited_articles"] = visited_articles
    logging.debug('/get_next_article: start calling get_today_arcile()')
    visited_articles, today_article, result_of_generate_article = get_today_article(user_freq_record, session.get('visited_articles'))
    logging.debug('/get_next_arcile: done.')
    data = {
        'visited_articles': visited_articles,
        'today_article': today_article,
        'result_of_generate_article': result_of_generate_article
    }
    return json.dumps(data)


@userService.route("/get_pre_article/<username>", methods=['GET'])
def get_pre_article(username):
    ''' Step back to the previous article and return it as JSON ('' when there is none). '''
    user_freq_record = _freq_record_path(username)
    if request.method == 'GET':
        visited_articles = session.get("visited_articles")
        if visited_articles is None or visited_articles["index"] == 0:
            data = ''
        else:
            visited_articles["index"] -= 1  # previous article
            if visited_articles['article_ids'][-1] == "null":  # drop the "null" placeholder
                visited_articles['article_ids'].pop()
            session["visited_articles"] = visited_articles
            visited_articles, today_article, result_of_generate_article = get_today_article(user_freq_record, session.get('visited_articles'))
            data = {
                'visited_articles': visited_articles,
                'today_article': today_article,
                'result_of_generate_article': result_of_generate_article
            }
        return json.dumps(data)


@userService.route("/<username>/<word>/unfamiliar", methods=['GET', 'POST'])
def unfamiliar(username, word):
    '''
    Mark word as unfamiliar for username.
    :param username: user name
    :param word: the word
    :return: the string "success"
    '''
    user_freq_record = _freq_record_path(username)
    pickle_idea.unfamiliar(user_freq_record, word)
    session['thisWord'] = word  # 1. put a word into session
    session['time'] = 1
    return "success"


@userService.route("/<username>/<word>/familiar", methods=['GET', 'POST'])
def familiar(username, word):
    '''
    Mark word as familiar for username.
    :param username: user name
    :param word: the word
    :return: the string "success"
    '''
    user_freq_record = _freq_record_path(username)
    pickle_idea.familiar(user_freq_record, word)
    session['thisWord'] = word  # 1. put a word into session
    session['time'] = 1
    return "success"


@userService.route("/<username>/<word>/del", methods=['GET', 'POST'])
def deleteword(username, word):
    '''
    Delete a word.
    :param username: user name
    :param word: the word
    :return: the string "success"
    '''
    user_freq_record = _freq_record_path(username)
    pickle_idea2.deleteRecord(user_freq_record, word)
    # The template userpage_get.html deletes words asynchronously while flashed
    # messages are consumed synchronously, so no flash() here. An unconsumed
    # flash would also leak into signup.html (repro: delete a word, log out,
    # open the sign-up page).
    # flash(f'{word} is no longer in your word list.')
    return "success"


@userService.route("/<username>/userpage", methods=['GET', 'POST'])
def userpage(username):
    '''
    User page.
    :param username: user name
    :return: the rendered user page
    '''
    # Not logged in: show the not-logged-in page
    if not session.get('logged_in'):
        return render_template('not_login.html')

    # Account expired
    user_expiry_date = session.get('expiry_date')
    if datetime.now().strftime('%Y%m%d') > user_expiry_date:
        return render_template('expiry.html', expiry_date=user_expiry_date)

    # Use the user name stored in the session
    username = session.get('username')

    user_freq_record = _freq_record_path(username)

    if request.method == 'POST':  # when we submit a form
        content = request.form['content']
        f = WordFreq(content)
        lst = f.get_freq()
        return render_template('userpage_post.html',username=username,lst = lst, yml=Yaml.yml)

    elif request.method == 'GET':  # when we load a html page
        try:
            d = load_freq_history(user_freq_record)
            lst = pickle_idea2.dict2lst(d)
            lst2 = []
            for t in lst:
                if isinstance(t[1], (list, tuple)):  # Check if t[1] is a list or tuple
                    lst2.append((t[0], len(t[1])))
                elif isinstance(t[1], int):  # Handle case where t[1] is an integer
                    lst2.append((t[0], t[1]))
                else:
                    lst2.append((t[0], 1))  # Default case

            lst3 = sort_in_descending_order(lst2)
            words = ''.join(x[0] + ' ' for x in lst3)
            visited_articles, today_article, result_of_generate_article = get_today_article(user_freq_record, session.get('visited_articles'))
            session['visited_articles'] = visited_articles
            # Render the page from today_article
            return render_template('userpage_get.html',
                                   admin_name=ADMIN_NAME,
                                   username=username,
                                   session=session,
                                   # flashed_messages=get_flashed_messages(),  # only word deletion used flash, and that runs asynchronously
                                   today_article=today_article,
                                   result_of_generate_article=result_of_generate_article,
                                   d_len=len(d),
                                   lst3=lst3,
                                   yml=Yaml.yml,
                                   words=words)
        except Exception as e:
            # Deliberate best-effort fallback page; log the traceback instead
            # of only printing the message.
            logging.exception('Error in userpage')
            print(f"Error in userpage: {str(e)}")
            return render_template('userpage_get.html',
                                   username=username,
                                   today_article={"user_level": 4.5},  # Default level
                                   lst3=[],
                                   d_len=0)


@userService.route("/<username>/mark", methods=['GET', 'POST'])
def user_mark_word(username):
    '''
    Mark words.
    :param username: user name
    :return: redirect to the user page
    '''
    # session[username] raised KeyError whenever the session had no key equal
    # to the URL user name; fall back to the 'username' key that userpage()
    # reads, and treat a missing user as not logged in.
    username = session.get(username) or session.get('username')
    if username is None:
        return render_template('not_login.html')
    user_freq_record = _freq_record_path(username)
    if request.method == 'POST':
        # Save the submitted marked words
        d = load_freq_history(user_freq_record)
        lst_history = pickle_idea2.dict2lst(d)
        lst = []
        lst2 = []
        for word in request.form.getlist('marked'):
            if word not in pickle_idea2.exclusion_lst and len(word) > 2:
                lst.append((word, [get_time()]))
                lst2.append(word)
        d = pickle_idea2.merge_frequency(lst, lst_history)
        if len(lst_history) > 999:
            flash('You have way too many words in your difficult-words book. Delete some first.')
        else:
            pickle_idea2.save_frequency_to_pickle(d, user_freq_record)
            flash('Added %s.' % ', '.join(lst2))
        return redirect(url_for('user_bp.userpage', username=username))
    else:
        return 'Under construction'


def get_time():
    '''
    Get the current time.
    :return: the current time as YYYYmmddHHMM
    '''
    return datetime.now().strftime('%Y%m%d%H%M')  # up to minutes


# =====================================================================
# File: wordfreqCMD.py   (diff --git a/wordfreqCMD.py b/wordfreqCMD.py,
#                         new file, index 0000000..5405552)
# =====================================================================
###########################################################################
# Copyright 2019 (C) Hui Lan
# Written permission must be obtained from the author for commercial uses.
###########################################################################

import collections
import html
import string
import operator
import os, sys  # sys.argv is used to read the command-line arguments
import pickle_idea
import pickle
from datetime import datetime
from pickle_idea2 import load_record, save_frequency_to_pickle, lst2dict, dict2lst


def map_percentages_to_levels(percentages):
    '''
    Purpose: compute difficulty weights for a word book. The weight of a level
             is (10 - level) * proportion of words at that level, normalised
             so that the weights sum to 1.
    Input:  dict mapping difficulty level (3~8) to the proportion of words at that level
    Output: dict mapping difficulty level (3~8) to its normalised weight
            (all weights are returned unnormalised, i.e. 0, when their sum is 0)
    '''
    levels_proportions = {k: (10 - k) * percentages[k] for k in sorted(percentages)}
    total = sum(levels_proportions.values())
    if total == 0:  # avoid ZeroDivisionError for empty or all-zero input
        return levels_proportions
    return {k: weight / total for k, weight in levels_proportions.items()}


def freq(fruit):
    '''
    Purpose: turn a string into a list holding the frequency of each word.
    Input:  a string
    Output: list of (word, frequency) tuples, e.g. [('apple', 2), ('banana', 1)]
    Note:   the string is lower-cased first.
    '''
    fruit = fruit.lower()
    flst = fruit.split()
    c = collections.Counter(flst)
    return c.most_common()


def youdao_link(s):
    ''' Return the Youdao dictionary URL for s. '''
    link = 'http://youdao.com/w/eng/' + s + '/#keyfrom=dict2.index'
    return link


def file2str(fname):
    ''' Return the content of the text file fname as one string. '''
    with open(fname) as f:
        return f.read()


def remove_punctuation(s):
    '''
    Replace punctuation in s with spaces. An apostrophe is kept only when it
    has an ASCII letter directly before and after it.
    '''
    # '\\_' spells the same two characters as the former invalid escape '\_'.
    special_characters = '\\_©~<=>+/[]*&$%^@.,?!:;#()"“”—‘’{}|,。?!¥……()、《》【】:;·'
    s = html.unescape(s)  # turn HTML entities into characters, e.g. &lt; into <
    for c in special_characters:
        s = s.replace(c, ' ')  # a space, so that "apple,apple" does not become "appleapple"
    s = s.replace('--', ' ')
    s = s.strip()

    if '\'' not in s:
        return s

    n = len(s)
    kept = []  # the characters to keep
    for i, ch in enumerate(s):
        if ch != '\'':
            kept.append(ch)
        elif 0 < i < n - 1 and s[i-1] in string.ascii_letters and s[i+1] in string.ascii_letters:
            kept.append(ch)
    return ''.join(kept)


def sort_in_descending_order(lst):
    ''' Return lst sorted by frequency, then word, in descending order. '''
    lst2 = sorted(lst, reverse=True, key=lambda x: (x[1], x[0]))
    return lst2


def sort_in_ascending_order(lst):
    ''' Return lst sorted by frequency, then word, in ascending order. '''
    lst2 = sorted(lst, reverse=False, key=lambda x: (x[1], x[0]))
    return lst2


def make_html_page(lst, fname):  # only called from the main block of wordfreqCMD.py
    '''
    Purpose: save the information in lst to fname in HTML format.
    '''
    # NOTE(review): the HTML markup was stripped from the source text, leaving
    # three placeholders for four arguments; the tags are reconstructed from
    # the argument list (number, link, word, frequency). Verify against the real file.
    parts = []
    count = 1
    for x in lst:
        # <a href="">word</a>
        parts.append('<p>%d <a href="%s">%s</a> (%d)</p>' % (count, youdao_link(x[0]), x[0], x[1]))
        count += 1
    with open(fname, 'w') as f:
        f.write(''.join(parts))


def _count_of(value):
    ''' Return the frequency represented by value (a list of timestamps or a count). '''
    return len(value) if isinstance(value, list) else value


class WordFreq:
    ''' Word frequencies of a file plus cumulative frequencies kept in a pickle file. '''

    def __init__(self):
        self.pickle_file = 'frequency.p'  # stores the cumulative data
        self.freq = {}  # word -> frequency of the most recently processed file

    def process_file(self, filename):
        '''
        Count the words in filename and add them to the cumulative data.

        NOTE(review): the counting step was elided in the source ("existing
        word processing code"); it is filled in with this module's own
        pipeline. Cumulative values are merged as counts because
        save_frequency_to_pickle() stores counts, not timestamp lists.
        '''
        self.freq = dict(freq(remove_punctuation(file2str(filename))))

        # Load existing cumulative data
        try:
            cumulative_data = load_record(self.pickle_file)
        except (FileNotFoundError, EOFError):
            cumulative_data = {}

        # Merge current words with historical data
        for word, count in self.freq.items():
            cumulative_data[word] = _count_of(cumulative_data.get(word, 0)) + count

        # Save updated data
        save_frequency_to_pickle(cumulative_data, self.pickle_file)

    def show_results(self):
        ''' Print the 20 words with the highest cumulative frequency. '''
        print("\nCumulative Frequencies (all-time):")
        try:
            cumulative_data = load_record(self.pickle_file)
            # Sort by cumulative frequency
            sorted_words = sorted(cumulative_data.items(),
                                  key=lambda x: _count_of(x[1]),
                                  reverse=True)

            for word, value in sorted_words[:20]:  # Show top 20
                print(f"{word}: {_count_of(value)} times")
        except (FileNotFoundError, EOFError):
            print("No cumulative data available yet")


## main (program entry)
if __name__ == '__main__':
    num = len(sys.argv)

    if num == 1:  # read the string from the keyboard
        s = input()
    elif num == 2:  # read the string from a file
        fname = sys.argv[1]
        s = file2str(fname)
    else:
        print('I can accept at most 2 arguments.')
        sys.exit()  # stop here; the code below is not executed

    s = remove_punctuation(s)
    L = freq(s)
    for x in sort_in_descending_order(L):
        print('%s\t%d\t%s' % (x[0], x[1], youdao_link(x[0])))

    # Write the frequencies to result.html
    make_html_page(sort_in_descending_order(L), 'result.html')

    print('\nHistory:\n')
    if os.path.exists('frequency.p'):
        d = pickle_idea.load_record('frequency.p')
    else:
        d = {}

    print(sort_in_descending_order(pickle_idea.dict2lst(d)))

    # Merge the frequencies
    lst_history = pickle_idea.dict2lst(d)
    d = pickle_idea.merge_frequency(L, lst_history)
    pickle_idea.save_frequency_to_pickle(d, 'frequency.p')