Add vocabulary-level-related files and user service fixes
parent
c3b109528a
commit
250d2c37fd
|
@ -0,0 +1,166 @@
|
|||
from WordFreq import WordFreq
|
||||
from wordfreqCMD import youdao_link, sort_in_descending_order
|
||||
import pickle_idea, pickle_idea2
|
||||
import os
|
||||
import random, glob
|
||||
import hashlib
|
||||
from datetime import datetime
|
||||
from flask import Flask, request, redirect, render_template, url_for, session, abort, flash, get_flashed_messages
|
||||
from difficulty import get_difficulty_level_for_user, text_difficulty_level, user_difficulty_level
|
||||
from model.article import get_all_articles, get_article_by_id, get_number_of_articles
|
||||
import logging
|
||||
import re
|
||||
# Root of the runtime assets (templates, static files, word lists).
path_prefix = './'
db_path_prefix = './db/' # comment this line in deployment
# Absolute path to the Oxford word list used for the difficulty ratio.
# NOTE(review): hard-coded Windows path -- breaks on any other machine;
# consider deriving it from db_path_prefix instead. TODO confirm with owner.
oxford_words_path='C:\\Users\\ANNA\\Desktop\\ooad\\app\\db\\oxford_words.txt'
|
||||
|
||||
def count_oxford_words(text, oxford_words):
    '''
    Count how many tokens of *text* appear in the Oxford word mapping.

    :param text: article text (any case)
    :param oxford_words: mapping whose keys are lower-case Oxford words
    :return: tuple (number of Oxford words found, total number of words)
    '''
    tokens = re.findall(r'\b\w+\b', text.lower())
    hits = sum(1 for token in tokens if token in oxford_words)
    return hits, len(tokens)
|
||||
|
||||
def calculate_ratio(oxford_word_count, total_words):
    '''
    Share of Oxford words in the text.

    :param oxford_word_count: number of Oxford words found
    :param total_words: total token count
    :return: oxford_word_count / total_words, or 0 for an empty text
    '''
    return oxford_word_count / total_words if total_words else 0
|
||||
|
||||
def load_oxford_words(file_path):
    '''
    Parse the Oxford word-list file into a dictionary.

    Each line is expected to hold "word pos level" separated by whitespace.

    BUG FIX: the original indexed parts[0..2] unconditionally and crashed
    with IndexError on blank or malformed lines; such lines are now skipped.

    :param file_path: path to the oxford_words.txt file (utf-8)
    :return: dict mapping word -> {'pos': part-of-speech, 'level': CEFR level}
    '''
    oxford_words = {}
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            parts = line.strip().split()
            if len(parts) < 3:  # guard against blank / malformed lines
                continue
            word = parts[0]
            pos = parts[1]
            level = parts[2]
            oxford_words[word] = {'pos': pos, 'level': level}
    return oxford_words
|
||||
|
||||
def total_number_of_essays():
    # Thin wrapper over the article model; returns the number of stored articles.
    return get_number_of_articles()
|
||||
|
||||
|
||||
def get_article_title(s):
    '''Return the article title, i.e. the first line of the text.'''
    title, _, _ = s.partition('\n')
    return title
|
||||
|
||||
|
||||
def get_article_body(s):
    '''Return everything after the first line (the title) of the text.'''
    _, _, body = s.partition('\n')
    return body
|
||||
|
||||
|
||||
def get_today_article(user_word_list, visited_articles):
    """Pick an article that matches the reader's vocabulary level.

    :param user_word_list: path of the user's pickled word-frequency record
    :param visited_articles: dict with keys 'index' (cursor into the history)
        and 'article_ids' (ids of previously shown articles, newest last);
        may be None for a fresh session
    :return: tuple (visited_articles, today_article dict or None, status string)
    """
    if visited_articles is None:
        visited_articles = {
            "index" : 0,  # cursor into article_ids
            "article_ids": []  # ids of previously shown articles, newest last
        }
    if visited_articles["index"] > len(visited_articles["article_ids"])-1:  # generating a new article: query all candidates
        result = get_all_articles()
    else:  # revisiting a previously read article: query it by id
        if visited_articles["article_ids"][visited_articles["index"]] == 'null':  # a page refresh can land on the 'null' placeholder; behave like a "previous article" step
            visited_articles["index"] -= 1
            visited_articles["article_ids"].pop()
        article_id = visited_articles["article_ids"][visited_articles["index"]]
        result = get_article_by_id(article_id)
    random.shuffle(result)

    # Choose article according to reader's level
    logging.debug('* get_today_article(): start d1 = ... ')
    d1 = load_freq_history(user_word_list)
    d2 = load_freq_history(path_prefix + 'static/words_and_tests.p')
    logging.debug(' ... get_today_article(): get_difficulty_level_for_user() start')
    d3 = get_difficulty_level_for_user(d1, d2)
    logging.debug(' ... get_today_article(): done')

    d = None
    result_of_generate_article = "not found"

    d_user = load_freq_history(user_word_list)
    logging.debug('* get_today_article(): user_difficulty_level() start')
    user_level = user_difficulty_level(d_user, d3) # more consideration as user's behaviour is dynamic. Time factor should be considered.
    logging.debug('* get_today_article(): done')
    text_level = 0
    if visited_articles["index"] > len(visited_articles["article_ids"])-1:  # generating a new article
        amount_of_visited_articles = len(visited_articles["article_ids"])
        amount_of_existing_articles = result.__len__()
        if amount_of_visited_articles == amount_of_existing_articles:  # every existing article has already been read
            result_of_generate_article = "had read all articles"
        else:
            for k in range(3):  # try at most 3 rounds
                for reading in result:
                    text_level = text_difficulty_level(reading['text'], d3)
                    factor = random.gauss(0.8, 0.1)  # a number drawn from a Gaussian distribution with mean 0.8 and standard deviation 0.1
                    if reading['article_id'] not in visited_articles["article_ids"] and within_range(text_level, user_level, (8.0 - user_level) * factor):  # unseen article whose level falls in the acceptable band
                        d = reading
                        visited_articles["article_ids"].append(d['article_id'])  # remember this article's id
                        result_of_generate_article = "found"
                        break
                if result_of_generate_article == "found":  # leave the outer loop as soon as an article is found
                    break
            if result_of_generate_article != "found":  # all read, or 3 rounds found nothing suitable: push a 'null' placeholder
                visited_articles["article_ids"].append('null')
    else:  # showing a previously read article
        d = random.choice(result)
        text_level = text_difficulty_level(d['text'], d3)
        result_of_generate_article = "found"

    today_article = None
    if d:
        oxford_words = load_oxford_words(oxford_words_path)
        oxford_word_count, total_words = count_oxford_words(d['text'],oxford_words)
        ratio = calculate_ratio(oxford_word_count,total_words)
        today_article = {
            "user_level": '%4.1f' % user_level,
            "text_level": '%4.1f' % text_level,
            "date": d['date'],
            "article_title": get_article_title(d['text']),
            "article_body": get_article_body(d['text']),
            "source": d["source"],
            "question": get_question_part(d['question']),
            "answer": get_answer_part(d['question']),
            "ratio" : ratio
        }

    return visited_articles, today_article, result_of_generate_article
|
||||
|
||||
|
||||
def load_freq_history(path):
    '''Load a user's pickled word-frequency record; empty dict when absent.'''
    if not os.path.exists(path):
        return {}
    return pickle_idea.load_record(path)
|
||||
|
||||
|
||||
def within_range(x, y, r):
    '''True when x lies strictly above y but no more than r above it.'''
    return y < x <= y + r
|
||||
|
||||
|
||||
def get_question_part(s):
    '''
    Extract the question block of an article's question text.

    Returns the lines from the 'QUESTION' marker (inclusive) up to the
    'ANSWER' marker (exclusive), each stripped, joined with newlines.
    '''
    collected = []
    in_question = False
    for raw in s.strip().split('\n'):
        stripped = raw.strip()
        if stripped == 'QUESTION':
            collected.append(stripped)
            in_question = True
        elif stripped == 'ANSWER':
            in_question = False
        elif in_question:
            collected.append(stripped)
    return '\n'.join(collected)
|
||||
|
||||
|
||||
def get_answer_part(s):
    '''
    Extract the answer block of an article's question text: every stripped
    line after the 'ANSWER' marker, joined with newlines.
    '''
    collected = []
    seen_marker = False
    for raw in s.strip().split('\n'):
        stripped = raw.strip()
        if stripped == 'ANSWER':
            seen_marker = True
        elif seen_marker:
            collected.append(stripped)
    return '\n'.join(collected)
|
|
@ -0,0 +1,128 @@
|
|||
import hashlib
|
||||
import string
|
||||
from datetime import datetime, timedelta
|
||||
import unicodedata
|
||||
|
||||
|
||||
def md5(s):
    '''
    MD5 digest of a string.

    :param s: input string
    :return: 32-character hexadecimal MD5 digest
    '''
    return hashlib.md5(s.encode(encoding='utf-8')).hexdigest()
|
||||
|
||||
|
||||
# Deployment location of the app; the second assignment overrides it for
# local development.
path_prefix = '/var/www/wordfreq/wordfreq/'
path_prefix = './'  # comment this line in deployment
|
||||
|
||||
|
||||
def verify_user(username, password):
    '''True when *username* exists and *password* matches the stored hash.'''
    from model.user import get_user_by_username
    candidate = get_user_by_username(username)
    if candidate is None:
        return False
    # passwords are stored as md5(username + password) -- see add_user()
    return candidate.password == md5(username + password)
|
||||
|
||||
|
||||
def add_user(username, password):
    '''Create a user with a 30-day validity period, storing a salted hash.'''
    from model.user import insert_user
    now = datetime.now()
    insert_user(username=username,
                password=md5(username + password),
                start_date=now.strftime('%Y%m%d'),
                expiry_date=(now + timedelta(days=30)).strftime('%Y%m%d'))
|
||||
|
||||
|
||||
def check_username_availability(username):
    '''True when no user with this name exists yet.'''
    from model.user import get_user_by_username
    return get_user_by_username(username) is None
|
||||
|
||||
|
||||
def change_password(username, old_password, new_password):
    '''
    Change a user's password.

    :param username: user name
    :param old_password: current (plain-text) password
    :param new_password: new (plain-text) password
    :return: dict with key 'success' on success, or 'error' with a message
    '''
    if not verify_user(username, old_password):  # wrong old password
        return {'error':'Old password is wrong.', 'username':username}
    if new_password == old_password:  # new password must differ from the old one
        return {'error':'New password cannot be the same as the old password.', 'username':username}
    from model.user import update_password_by_username
    # BUG FIX: store the salted hash, not the plain text. verify_user()
    # compares against md5(username + password) (and add_user() stores that
    # hash), so saving the raw password locked the user out after a change.
    # Hashing username+password together also hides identical passwords
    # across different users.
    update_password_by_username(username, md5(username + new_password))
    return {'success':'Password changed', 'username':username}
|
||||
|
||||
|
||||
def get_expiry_date(username):
    '''Expiry date (YYYYMMDD string) of *username*.

    Unknown users get a fixed past date so they are treated as expired.
    '''
    from model.user import get_user_by_username
    user = get_user_by_username(username)
    return '20191024' if user is None else user.expiry_date
|
||||
|
||||
|
||||
class UserName:
    """Validate a proposed user name against the site's naming rules."""

    def __init__(self, username):
        # username: the raw candidate string, unmodified
        self.username = username

    def contains_chinese(self):
        """Return True if any character is a CJK unified ideograph.

        BUG FIX: unicodedata.name() raises ValueError for characters that
        have no name (e.g. control characters); passing a default avoids
        the crash and simply treats such characters as non-CJK.
        """
        for char in self.username:
            # Check if the character is in the CJK (Chinese, Japanese, Korean) Unicode block
            if unicodedata.name(char, '').startswith('CJK UNIFIED IDEOGRAPH'):
                return True
        return False

    def validate(self):
        """Return 'OK', or a message describing the first rule violated."""
        if len(self.username) > 20:
            return f'{self.username} is too long. The user name cannot exceed 20 characters.'
        if self.username.startswith('.'):  # a user name must not start with a dot
            return 'Period (.) is not allowed as the first letter in the user name.'
        if ' ' in self.username:  # a user name must not include a whitespace
            return 'Whitespace is not allowed in the user name.'
        for c in self.username:  # only non-leading periods and underscores are allowed as punctuation
            if c in string.punctuation and c != '.' and c != '_':
                return f'{c} is not allowed in the user name.'
        if self.username in ['signup', 'login', 'logout', 'reset', 'mark', 'back', 'unfamiliar', 'familiar', 'del',
                             'admin']:
            return 'You used a restricted word as your user name. Please come up with a better one.'
        if self.contains_chinese():
            return 'Chinese characters are not allowed in the user name.'
        return 'OK'
|
||||
|
||||
|
||||
class Password:
    """Validate a proposed password against the site's password rules."""

    def __init__(self, password):
        # password: the raw candidate string, unmodified
        self.password = password

    def contains_chinese(self):
        """Return True if any character is a CJK unified ideograph.

        BUG FIX: unicodedata.name() raises ValueError for characters that
        have no name (e.g. control characters); passing a default avoids
        the crash and simply treats such characters as non-CJK.
        """
        for char in self.password:
            # Check if the character is in the CJK (Chinese, Japanese, Korean) Unicode block
            if unicodedata.name(char, '').startswith('CJK UNIFIED IDEOGRAPH'):
                return True
        return False

    def validate(self):
        """Return 'OK', or a message describing the first rule violated."""
        if len(self.password) < 4:
            return 'Password must be at least 4 characters long.'
        if ' ' in self.password:
            return 'Password cannot contain spaces.'
        if self.contains_chinese():
            return 'Chinese characters are not allowed in the password.'
        return 'OK'
|
||||
|
||||
|
||||
class WarningMessage:
    """Validate a username or password and render the warning via str().

    str(WarningMessage(s)) yields 'OK' or a human-readable message.
    """

    def __init__(self, s, type='username'):
        # s: the raw string to validate; type: 'username' or 'password'
        self.s = s
        self.type = type

    def __str__(self):
        if self.type == 'username':
            return UserName(self.s).validate()
        if self.type == 'password':
            return Password(self.s).validate()
        # BUG FIX: the original fell through and implicitly returned None,
        # which makes str() raise an opaque "__str__ returned non-string"
        # TypeError; raise a clear error instead.
        raise ValueError(f'unknown validation type: {self.type!r}')
|
|
@ -0,0 +1,34 @@
|
|||
import pickle
import os

# Sample vocabulary data - simulating a user's word history
# Format: word -> list of dates when the word was studied
test_data = {
    "hello": ["20240101"],
    "world": ["20240101", "20240102"],
    "computer": ["20240101", "20240103"],
    "programming": ["20240102"],
    "python": ["20240102", "20240103"],
    "algorithm": ["20240103"],
    "database": ["20240103"],
    "interface": ["20240104"],
    "vocabulary": ["20240104"],
    "sophisticated": ["20240104"]
}

# Ensure frequency directory exists
# NOTE(review): hard-coded Windows path -- this script only runs on the
# original author's machine; consider a path relative to the project root.
base_path = r'C:\Users\ANNA\Desktop\app\static\frequency'
os.makedirs(base_path, exist_ok=True)

# Save the test data
file_path = os.path.join(base_path, 'mr1an85.pickle')
with open(file_path, 'wb') as f:
    pickle.dump(test_data, f)

print(f"Test file created at: {file_path}")

# Verify the file was created and can be read
with open(file_path, 'rb') as f:
    loaded_data = pickle.load(f)
    print("\nVerifying data:")
    print(loaded_data)
|
|
@ -0,0 +1,101 @@
|
|||
###########################################################################
|
||||
# Copyright 2019 (C) Hui Lan <hui.lan@cantab.net>
|
||||
# Written permission must be obtained from the author for commercial uses.
|
||||
###########################################################################
|
||||
|
||||
# Purpose: dictionary & pickle as a simple means of database.
|
||||
# Task: incorporate the functions into wordfreqCMD.py such that it will also show cumulative frequency.
|
||||
|
||||
import os
|
||||
import pickle
|
||||
from datetime import datetime
|
||||
|
||||
|
||||
def lst2dict(lst, d):
    '''
    Accumulate the (word, value) pairs of lst into dictionary d in place.

    A value may be a frequency count or a list of dates; date lists are
    collapsed to their length before being added.
    '''
    for word, value in lst:
        increment = len(value) if isinstance(value, list) else value
        d[word] = d.get(word, 0) + increment
|
||||
|
||||
|
||||
def dict2lst(d):
    ''' Convert dictionary to list of (word, frequency) pairs.

    Date-list values are collapsed to their length; an empty dict yields [].
    '''
    if not d:
        return []
    first_value = next(iter(d.values()))
    if isinstance(first_value, list):
        return [(word, len(dates)) for word, dates in d.items()]
    return list(d.items())
|
||||
|
||||
|
||||
def merge_frequency(lst1, lst2):
    '''Combine two (word, count) lists into one frequency dictionary.'''
    merged = {}
    for pairs in (lst1, lst2):
        lst2dict(pairs, merged)
    return merged
|
||||
|
||||
|
||||
def load_record(pickle_fname):
    '''
    Load and return the dictionary stored in pickle file *pickle_fname*.

    FIX: use a context manager so the handle is closed even when
    pickle.load() raises (the original leaked it on error).

    NOTE(review): pickle is unsafe on untrusted input; this assumes the
    file was produced by save_frequency_to_pickle().
    '''
    with open(pickle_fname, 'rb') as f:
        return pickle.load(f)
|
||||
|
||||
|
||||
def save_frequency_to_pickle(d, pickle_fname):
    '''
    Filter dictionary *d* and pickle the result to *pickle_fname*.

    Keys that are purely numeric or shorter than two characters are
    dropped; date-list values are collapsed to their length (a frequency).

    FIX: use a context manager so the handle is closed even when dumping
    raises; dropped the long commented-out stop-word list (dead code).
    '''
    exclusion_lst = []  # no stop words are excluded at the moment
    d2 = {}
    for k in d:
        if not k in exclusion_lst and not k.isnumeric() and len(k) > 1:
            # normalize: a list of dates becomes its count
            d2[k] = len(d[k]) if isinstance(d[k], list) else d[k]
    with open(pickle_fname, 'wb') as f:
        pickle.dump(d2, f)
|
||||
|
||||
def unfamiliar(path, word):
    '''
    Record one more "unfamiliar" hit (a timestamp) for *word* in the
    pickled record at *path*.

    FIX: the original did `dic[word] += [...]`, which raised KeyError when
    the word was not yet in the record; setdefault() handles that case.

    :param path: path of the pickled word record
    :param word: the word being marked unfamiliar
    :return: None (also when *path* does not exist)
    '''
    if not os.path.exists(path):
        return None
    with open(path, "rb") as f:
        dic = pickle.load(f)
    dic.setdefault(word, []).append(datetime.now().strftime('%Y%m%d%H%M'))
    with open(path, "wb") as fp:
        pickle.dump(dic, fp)
|
||||
|
||||
def familiar(path, word):
    '''
    Remove one "unfamiliar" hit for *word*: drop its oldest timestamp, or
    remove the word entirely when only one timestamp remains.

    FIX: the original opened both files without ever closing them (leaked
    handles); context managers guarantee they are closed.

    :param path: path of the pickled word record
    :param word: the word being marked familiar (must exist in the record)
    '''
    with open(path, "rb") as f:
        dic = pickle.load(f)
    if len(dic[word]) > 1:
        del dic[word][0]  # drop the oldest date
    else:
        dic.pop(word)  # last date gone: forget the word completely
    with open(path, "wb") as fp:
        pickle.dump(dic, fp)
|
||||
|
||||
if __name__ == '__main__':

    # Build a frequency dict from a list and persist it.
    lst1 = [('apple',2), ('banana',1)]
    d = {}
    lst2dict(lst1, d) # d will change
    save_frequency_to_pickle(d, 'frequency.p') # frequency.p is our database

    # Reload the database, merge in a second list and show the result.
    lst2 = [('banana',2), ('orange', 4)]
    d = load_record('frequency.p')
    lst1 = dict2lst(d)
    d = merge_frequency(lst2, lst1)
    print(d)
|
|
@ -0,0 +1,99 @@
|
|||
###########################################################################
|
||||
# Copyright 2019 (C) Hui Lan <hui.lan@cantab.net>
|
||||
# Written permission must be obtained from the author for commercial uses.
|
||||
###########################################################################
|
||||
|
||||
|
||||
# Purpose: dictionary & pickle as a simple means of database.
|
||||
# Task: incorporate the functions into wordfreqCMD.py such that it will also show cumulative frequency.
|
||||
# Note: unlike pick_idea.py, now the second item is not frequency, but a list of dates.
|
||||
|
||||
import pickle
|
||||
from datetime import datetime
|
||||
|
||||
def lst2dict(lst, d):
    '''
    Accumulate the (word, value) pairs of lst into dictionary d in place.

    Values may be frequency counts or lists of dates; a date list is
    collapsed to its length so d always holds plain counts.
    '''
    for word, value in lst:
        increment = len(value) if isinstance(value, list) else value
        d[word] = d.get(word, 0) + increment
|
||||
|
||||
def deleteRecord(path, word):
    '''Remove *word* from the pickled dict stored at *path*.

    A missing word is reported on stdout and otherwise ignored; the file
    is rewritten either way.
    '''
    with open(path, 'rb') as f:
        db = pickle.load(f)
    try:
        del db[word]
    except KeyError:
        print("sorry")  # word was not in the record
    with open(path, 'wb') as ff:
        pickle.dump(db, ff)
|
||||
|
||||
def dict2lst(d):
    '''Convert the record dict to a list of (word, frequency) pairs.

    Integer values pass through unchanged; date-list values are collapsed
    to their length. An empty dict (or one with unrecognized value types)
    yields [].
    '''
    if len(d) > 0:
        sample = d[next(iter(d))]
        if isinstance(sample, int):
            return list(d.items())
        if isinstance(sample, list):
            return [(word, len(dates)) for word, dates in d.items()]
    return []
|
||||
|
||||
def merge_frequency(lst1, lst2):
    '''Combine two (word, value) lists into one frequency dictionary.'''
    combined = {}
    for pairs in (lst1, lst2):
        lst2dict(pairs, combined)
    return combined
|
||||
|
||||
|
||||
def load_record(pickle_fname):
    '''
    Load and return the dictionary stored in pickle file *pickle_fname*.

    FIX: use a context manager so the handle is closed even when
    pickle.load() raises (the original leaked it on error).

    NOTE(review): pickle is unsafe on untrusted input; this assumes the
    file was produced by save_frequency_to_pickle().
    '''
    with open(pickle_fname, 'rb') as f:
        return pickle.load(f)
|
||||
|
||||
|
||||
def save_frequency_to_pickle(d, pickle_fname):
    '''
    Filter dictionary *d* against the module-level exclusion_lst and pickle
    the result to *pickle_fname*.

    Stop words, purely numeric keys and single-character keys are dropped;
    date-list values are collapsed to their length so only frequencies are
    stored.

    FIX: use a context manager so the handle is closed even when dumping
    raises (the original leaked it on error).
    '''
    d2 = {}
    for k in d:
        if not k in exclusion_lst and not k.isnumeric() and not len(k) < 2:
            # normalize: a list of dates becomes its count
            d2[k] = len(d[k]) if isinstance(d[k], list) else d[k]
    with open(pickle_fname, 'wb') as f:
        pickle.dump(d2, f)
|
||||
|
||||
|
||||
# Common English function words (stop words) that are never worth tracking
# in a user's difficult-word book; used by save_frequency_to_pickle().
exclusion_lst = ['one', 'no', 'has', 'had', 'do', 'that', 'have', 'by', 'not', 'but', 'we', 'this', 'my', 'him', 'so', 'or', 'as', 'are', 'it', 'from', 'with', 'be', 'can', 'for', 'an', 'if', 'who', 'whom', 'whose', 'which', 'the', 'to', 'a', 'of', 'and', 'you', 'i', 'he', 'she', 'they', 'me', 'was', 'were', 'is', 'in', 'at', 'on', 'their', 'his', 'her', 's', 'said', 'all', 'did', 'been', 'w']
|
||||
|
||||
if __name__ == '__main__':
    # Test 1: Convert dates to frequencies
    lst1 = [('apple',['201910251437', '201910251438']), ('banana',['201910251439'])]
    d = {}
    lst2dict(lst1, d)
    print("Test 1 - Convert dates to frequencies:")
    print(d) # Should show: {'apple': 2, 'banana': 1}

    # Test 2: Save and load frequencies
    save_frequency_to_pickle(d, 'frequency.p')
    loaded_d = load_record('frequency.p')
    print("\nTest 2 - Load saved frequencies:")
    print(loaded_d) # Should match the previous output

    # Test 3: Merge frequencies
    lst2 = [('banana',['201910251439']), ('orange', ['201910251440', '201910251439'])]
    lst1 = dict2lst(loaded_d)
    merged_d = merge_frequency(lst2, lst1)
    print("\nTest 3 - Merge frequencies:")
    print(merged_d) # Should show banana with increased frequency
|
|
@ -0,0 +1,108 @@
|
|||
import pytest
|
||||
from difficulty import VocabularyLevelEstimator
|
||||
|
||||
@pytest.fixture
def estimator():
    """Fixture to create a VocabularyLevelEstimator instance.

    NOTE(review): the constructor argument is a placeholder path
    ('path/to/your/actual/word_data.p'); these tests cannot run until it
    points at a real pickled word-data file.
    """
    return VocabularyLevelEstimator('path/to/your/actual/word_data.p')
|
||||
|
||||
class TestVocabularyLevelEstimator:
    """Tests for VocabularyLevelEstimator: normal, boundary, abnormal and
    edge-case inputs.

    NOTE(review): the expected default level of 3 and the 3..8 range are
    assumptions about the estimator's contract -- confirm against the
    implementation in difficulty.py.
    """

    # Normal input tests
    def test_normal_text_estimation(self, estimator):
        """Test text level estimation with normal English text"""
        text = """The quick brown fox jumps over the lazy dog.
        This text contains common English words that
        should be processed without any issues."""
        level = estimator.estimate_text_level(text)
        assert isinstance(level, float)
        assert 3 <= level <= 8  # Difficulty levels should be between 3-8

    def test_normal_user_level(self, estimator):
        """Test user level estimation with normal word history"""
        word_history = {
            'algorithm': ['20240101'],
            'computer': ['20240101', '20240102'],
            'programming': ['20240101']
        }
        level = estimator.estimate_user_level(word_history)
        assert isinstance(level, float)
        assert 3 <= level <= 8

    def test_normal_word_level(self, estimator):
        """Test word level estimation with common words"""
        assert estimator.get_word_level('computer') >= 3
        assert estimator.get_word_level('algorithm') >= 3

    # Boundary input tests
    def test_empty_text(self, estimator):
        """Test behavior with empty text"""
        assert estimator.estimate_text_level('') == 3  # Default level

    def test_single_word_text(self, estimator):
        """Test behavior with single-word text"""
        assert isinstance(estimator.estimate_text_level('Hello'), float)

    def test_empty_user_history(self, estimator):
        """Test behavior with empty user history"""
        assert estimator.estimate_user_level({}) == 3  # Default level

    def test_maximum_word_length(self, estimator):
        """Test behavior with extremely long word"""
        long_word = 'a' * 100
        assert estimator.get_word_level(long_word) == 3  # Default level

    # Abnormal input tests
    def test_non_english_text(self, estimator):
        """Test behavior with non-English text"""
        chinese_text = "这是中文文本"
        assert estimator.estimate_text_level(chinese_text) == 3  # Default level

    def test_special_characters(self, estimator):
        """Test behavior with special characters"""
        special_chars = "@#$%^&*()"
        assert estimator.estimate_text_level(special_chars) == 3  # Default level

    def test_invalid_word_history(self, estimator):
        """Test behavior with invalid word history format"""
        invalid_history = {'word': 'not_a_list'}
        with pytest.raises(ValueError):
            estimator.estimate_user_level(invalid_history)

    def test_none_input(self, estimator):
        """Test behavior with None input"""
        with pytest.raises(TypeError):
            estimator.estimate_text_level(None)

        with pytest.raises(TypeError):
            estimator.estimate_user_level(None)

        with pytest.raises(TypeError):
            estimator.get_word_level(None)

    # Edge cases
    def test_mixed_case_words(self, estimator):
        """Test behavior with mixed case words"""
        assert estimator.get_word_level('Computer') == estimator.get_word_level('computer')

    def test_whitespace_handling(self, estimator):
        """Test behavior with various whitespace patterns"""
        text_with_spaces = "  Multiple   Spaces   Between   Words  "
        level = estimator.estimate_text_level(text_with_spaces)
        assert isinstance(level, float)

    def test_repeated_words(self, estimator):
        """Test behavior with repeated words"""
        text = "word word word word word"
        level = estimator.estimate_text_level(text)
        assert isinstance(level, float)

    def test_numeric_input(self, estimator):
        """Test behavior with numeric input"""
        assert estimator.estimate_text_level("123 456 789") == 3  # Default level

    def test_mixed_content(self, estimator):
        """Test behavior with mixed content (numbers, words, special chars)"""
        mixed_text = "Hello123 @World! 456"
        level = estimator.estimate_text_level(mixed_text)
        assert isinstance(level, float)
|
|
@ -0,0 +1,216 @@
|
|||
from datetime import datetime
|
||||
from admin_service import ADMIN_NAME
|
||||
from flask import *
|
||||
|
||||
# from app import Yaml
|
||||
# from app.Article import get_today_article, load_freq_history
|
||||
# from app.WordFreq import WordFreq
|
||||
# from app.wordfreqCMD import sort_in_descending_order
|
||||
|
||||
import Yaml
|
||||
from Article import get_today_article, load_freq_history
|
||||
from WordFreq import WordFreq
|
||||
from wordfreqCMD import sort_in_descending_order
|
||||
|
||||
import pickle_idea
|
||||
import pickle_idea2
|
||||
|
||||
import logging
|
||||
logging.basicConfig(filename='log.txt', format='%(asctime)s %(message)s', level=logging.DEBUG)
|
||||
|
||||
# 初始化蓝图
|
||||
userService = Blueprint("user_bp", __name__)
|
||||
|
||||
path_prefix = '/var/www/wordfreq/wordfreq/'
|
||||
path_prefix = './' # comment this line in deployment
|
||||
|
||||
@userService.route("/get_next_article/<username>",methods=['GET','POST'])
|
||||
def get_next_article(username):
|
||||
user_freq_record = path_prefix + 'static/frequency/' + 'frequency_%s.pickle' % (username)
|
||||
session['old_articleID'] = session.get('articleID')
|
||||
if request.method == 'GET':
|
||||
visited_articles = session.get("visited_articles")
|
||||
if visited_articles['article_ids'][-1] == "null": # 如果当前还是"null",则将"null"pop出来,无需index+=1
|
||||
visited_articles['article_ids'].pop()
|
||||
else: # 当前不为"null",直接 index+=1
|
||||
visited_articles["index"] += 1
|
||||
session["visited_articles"] = visited_articles
|
||||
logging.debug('/get_next_article: start calling get_today_arcile()')
|
||||
visited_articles, today_article, result_of_generate_article = get_today_article(user_freq_record, session.get('visited_articles'))
|
||||
logging.debug('/get_next_arcile: done.')
|
||||
data = {
|
||||
'visited_articles': visited_articles,
|
||||
'today_article': today_article,
|
||||
'result_of_generate_article': result_of_generate_article
|
||||
}
|
||||
else:
|
||||
return 'Under construction'
|
||||
return json.dumps(data)
|
||||
|
||||
@userService.route("/get_pre_article/<username>",methods=['GET'])
|
||||
def get_pre_article(username):
|
||||
user_freq_record = path_prefix + 'static/frequency/' + 'frequency_%s.pickle' % (username)
|
||||
if request.method == 'GET':
|
||||
visited_articles = session.get("visited_articles")
|
||||
if(visited_articles["index"]==0):
|
||||
data=''
|
||||
else:
|
||||
visited_articles["index"] -= 1 # 上一篇,index-=1
|
||||
if visited_articles['article_ids'][-1] == "null": # 如果当前还是"null",则将"null"pop出来
|
||||
visited_articles['article_ids'].pop()
|
||||
session["visited_articles"] = visited_articles
|
||||
visited_articles, today_article, result_of_generate_article = get_today_article(user_freq_record, session.get('visited_articles'))
|
||||
data = {
|
||||
'visited_articles': visited_articles,
|
||||
'today_article': today_article,
|
||||
'result_of_generate_article':result_of_generate_article
|
||||
}
|
||||
return json.dumps(data)
|
||||
|
||||
@userService.route("/<username>/<word>/unfamiliar", methods=['GET', 'POST'])
|
||||
def unfamiliar(username, word):
|
||||
'''
|
||||
|
||||
:param username:
|
||||
:param word:
|
||||
:return:
|
||||
'''
|
||||
user_freq_record = path_prefix + 'static/frequency/' + 'frequency_%s.pickle' % (username)
|
||||
pickle_idea.unfamiliar(user_freq_record, word)
|
||||
session['thisWord'] = word # 1. put a word into session
|
||||
session['time'] = 1
|
||||
return "success"
|
||||
|
||||
|
||||
@userService.route("/<username>/<word>/familiar", methods=['GET', 'POST'])
|
||||
def familiar(username, word):
|
||||
'''
|
||||
|
||||
:param username:
|
||||
:param word:
|
||||
:return:
|
||||
'''
|
||||
user_freq_record = path_prefix + 'static/frequency/' + 'frequency_%s.pickle' % (username)
|
||||
pickle_idea.familiar(user_freq_record, word)
|
||||
session['thisWord'] = word # 1. put a word into session
|
||||
session['time'] = 1
|
||||
return "success"
|
||||
|
||||
|
||||
@userService.route("/<username>/<word>/del", methods=['GET', 'POST'])
|
||||
def deleteword(username, word):
|
||||
'''
|
||||
删除单词
|
||||
:param username: 用户名
|
||||
:param word: 单词
|
||||
:return: 重定位到用户界面
|
||||
'''
|
||||
user_freq_record = path_prefix + 'static/frequency/' + 'frequency_%s.pickle' % (username)
|
||||
pickle_idea2.deleteRecord(user_freq_record, word)
|
||||
# 模板userpage_get.html中删除单词是异步执行,而flash的信息后续是同步执行的,所以注释这段代码;同时如果这里使用flash但不提取信息,则会影响 signup.html的显示。bug复现:删除单词后,点击退出,点击注册,注册页面就会出现提示信息
|
||||
# flash(f'{word} is no longer in your word list.')
|
||||
return "success"
|
||||
|
||||
|
||||
@userService.route("/<username>/userpage", methods=['GET', 'POST'])
|
||||
def userpage(username):
|
||||
'''
|
||||
用户界面
|
||||
:param username: 用户名
|
||||
:return: 返回用户界面
|
||||
'''
|
||||
# 未登录,跳转到未登录界面
|
||||
if not session.get('logged_in'):
|
||||
return render_template('not_login.html')
|
||||
|
||||
# 用户过期
|
||||
user_expiry_date = session.get('expiry_date')
|
||||
if datetime.now().strftime('%Y%m%d') > user_expiry_date:
|
||||
return render_template('expiry.html', expiry_date=user_expiry_date)
|
||||
|
||||
# 获取session里的用户名
|
||||
username = session.get('username')
|
||||
|
||||
user_freq_record = path_prefix + 'static/frequency/' + 'frequency_%s.pickle' % (username)
|
||||
|
||||
if request.method == 'POST': # when we submit a form
|
||||
content = request.form['content']
|
||||
f = WordFreq(content)
|
||||
lst = f.get_freq()
|
||||
return render_template('userpage_post.html',username=username,lst = lst, yml=Yaml.yml)
|
||||
|
||||
elif request.method == 'GET': # when we load a html page
|
||||
try:
|
||||
d = load_freq_history(user_freq_record)
|
||||
lst = pickle_idea2.dict2lst(d)
|
||||
lst2 = []
|
||||
for t in lst:
|
||||
if isinstance(t[1], (list, tuple)): # Check if t[1] is a list or tuple
|
||||
lst2.append((t[0], len(t[1])))
|
||||
elif isinstance(t[1], int): # Handle case where t[1] is an integer
|
||||
lst2.append((t[0], t[1]))
|
||||
else:
|
||||
lst2.append((t[0], 1)) # Default case
|
||||
|
||||
lst3 = sort_in_descending_order(lst2)
|
||||
words = ''
|
||||
for x in lst3:
|
||||
words += x[0] + ' '
|
||||
visited_articles, today_article, result_of_generate_article = get_today_article(user_freq_record, session.get('visited_articles'))
|
||||
session['visited_articles'] = visited_articles
|
||||
# 通过 today_article,加载前端的显示页面
|
||||
return render_template('userpage_get.html',
|
||||
admin_name=ADMIN_NAME,
|
||||
username=username,
|
||||
session=session,
|
||||
# flashed_messages=get_flashed_messages(), 仅有删除单词的时候使用到flash,而删除单词是异步执行,这里的信息提示是同步执行,所以就没有存在的必要了
|
||||
today_article=today_article,
|
||||
result_of_generate_article=result_of_generate_article,
|
||||
d_len=len(d),
|
||||
lst3=lst3,
|
||||
yml=Yaml.yml,
|
||||
words=words)
|
||||
except Exception as e:
|
||||
print(f"Error in userpage: {str(e)}")
|
||||
return render_template('userpage_get.html',
|
||||
username=username,
|
||||
today_article={"user_level": 4.5}, # Default level
|
||||
lst3=[],
|
||||
d_len=0)
|
||||
|
||||
@userService.route("/<username>/mark", methods=['GET', 'POST'])
|
||||
def user_mark_word(username):
|
||||
'''
|
||||
标记单词
|
||||
:param username: 用户名
|
||||
:return: 重定位到用户界面
|
||||
'''
|
||||
username = session[username]
|
||||
user_freq_record = path_prefix + 'static/frequency/' + 'frequency_%s.pickle' % (username)
|
||||
if request.method == 'POST':
|
||||
# 提交标记的单词
|
||||
d = load_freq_history(user_freq_record)
|
||||
lst_history = pickle_idea2.dict2lst(d)
|
||||
lst = []
|
||||
lst2 = []
|
||||
for word in request.form.getlist('marked'):
|
||||
if not word in pickle_idea2.exclusion_lst and len(word) > 2:
|
||||
lst.append((word, [get_time()]))
|
||||
lst2.append(word)
|
||||
d = pickle_idea2.merge_frequency(lst, lst_history)
|
||||
if len(lst_history) > 999:
|
||||
flash('You have way too many words in your difficult-words book. Delete some first.')
|
||||
else:
|
||||
pickle_idea2.save_frequency_to_pickle(d, user_freq_record)
|
||||
flash('Added %s.' % ', '.join(lst2))
|
||||
return redirect(url_for('user_bp.userpage', username=username))
|
||||
else:
|
||||
return 'Under construction'
|
||||
|
||||
def get_time():
    '''
    Return the current local time as a compact timestamp string.

    :return: current time formatted as YYYYMMDDHHMM (minute precision)
    '''
    now = datetime.now()
    return now.strftime('%Y%m%d%H%M')
|
||||
|
|
@ -0,0 +1,201 @@
|
|||
###########################################################################
|
||||
# Copyright 2019 (C) Hui Lan <hui.lan@cantab.net>
|
||||
# Written permission must be obtained from the author for commercial uses.
|
||||
###########################################################################
|
||||
|
||||
import collections
|
||||
import html
|
||||
import string
|
||||
import operator
|
||||
import os, sys # 引入模块sys,因为我要用里面的sys.argv列表中的信息来读取命令行参数。
|
||||
import pickle_idea
|
||||
import pickle
|
||||
from datetime import datetime
|
||||
from pickle_idea2 import load_record, save_frequency_to_pickle, lst2dict, dict2lst
|
||||
|
||||
|
||||
def map_percentages_to_levels(percentages):
    '''
    Compute a normalized difficulty weight for each level of a word book.

    The raw weight of level k is (10 - k) * percentages[k], so easier (lower)
    levels receive larger weights; the weights are then normalized so they
    sum to 1.

    :param percentages: dict mapping difficulty level (3..8) to the fraction
                        of words at that level
    :return: dict mapping the same levels to their normalized weights
    '''
    sorted_keys = sorted(percentages.keys())

    # Raw weight per level plus the running total.
    # (The original accumulated into a variable named `sum`, shadowing the
    # builtin; renamed to `total`.)
    total = 0
    levels_proportions = {}
    for k in sorted_keys:
        levels_proportions[k] = (10 - k) * percentages[k]
        total += levels_proportions[k]

    # Normalize so the weights sum to 1. If every raw weight is zero
    # (e.g. all percentages are 0) there is nothing to normalize — return
    # the zero weights instead of dividing by zero.
    if total != 0:
        for k in sorted_keys:
            levels_proportions[k] /= total

    return levels_proportions
|
||||
|
||||
|
||||
def freq(fruit):
    '''
    Count word frequencies in a string.

    The string is lower-cased first so that e.g. 'Apple' and 'apple' are
    counted as the same word.

    :param fruit: input string
    :return: list of (word, count) tuples, most frequent first,
             e.g. [('apple', 2), ('banana', 1)]
    '''
    tokens = fruit.lower().split()
    counter = collections.Counter(tokens)
    return counter.most_common()
|
||||
|
||||
|
||||
def youdao_link(s):
    '''
    Return the Youdao online-dictionary URL for looking up the word s.

    :param s: the word to look up
    :return: URL string pointing at the Youdao entry for s
    '''
    prefix = 'http://youdao.com/w/eng/'
    suffix = '/#keyfrom=dict2.index'
    return prefix + s + suffix
|
||||
|
||||
|
||||
def file2str(fname):
    '''
    Read a whole file into a string.

    Uses a context manager so the file handle is closed even if reading
    raises (the original leaked the handle on error).

    :param fname: path of the file to read
    :return: file contents as a single string
    '''
    with open(fname) as f:
        return f.read()
|
||||
|
||||
|
||||
def remove_punctuation(s):
    '''
    Strip punctuation (ASCII and full-width CJK) from s, keeping an
    apostrophe only when it sits between two English letters (e.g. "it's").

    Each punctuation character is replaced by a space — not deleted — so
    'apple,apple' becomes 'apple apple' rather than 'appleapple'.

    :param s: raw text, possibly containing HTML entities
    :return: cleaned text
    '''
    special_characters = '\_©~<=>+/[]*&$%^@.,?!:;#()"“”—‘’{}|,。?!¥……()、《》【】:;·'
    s = html.unescape(s)  # turn HTML entities (e.g. &lt;) back into characters
    # One C-level translate pass instead of a per-character replace loop.
    s = s.translate(str.maketrans(special_characters, ' ' * len(special_characters)))
    s = s.replace('--', ' ')
    s = s.strip()  # drop leading/trailing whitespace

    if '\'' not in s:
        return s

    # Keep an apostrophe only when both neighbours are ASCII letters.
    n = len(s)
    kept = []
    for i, ch in enumerate(s):
        if ch == '\'':
            in_bounds = i - 1 >= 0 and i + 1 < n
            if in_bounds and s[i - 1] in string.ascii_letters and s[i + 1] in string.ascii_letters:
                kept.append(ch)
        else:
            kept.append(ch)
    return ''.join(kept)
|
||||
|
||||
|
||||
def sort_in_descending_order(lst):
    '''
    Sort (word, count) tuples by count, highest first; ties are broken by
    the word itself, also in reverse order.

    :param lst: list of (word, count) tuples
    :return: a new sorted list (the input is not modified)
    '''
    def rank(pair):
        return (pair[1], pair[0])

    return sorted(lst, key=rank, reverse=True)
|
||||
|
||||
|
||||
def sort_in_ascending_order(lst):
    '''
    Sort (word, count) tuples by count, lowest first; ties are broken by
    the word itself.

    :param lst: list of (word, count) tuples
    :return: a new sorted list (the input is not modified)
    '''
    return sorted(lst, key=lambda pair: (pair[1], pair[0]))
|
||||
|
||||
|
||||
def make_html_page(lst, fname):
    '''
    Save the word-frequency list lst to fname as a simple HTML page.

    Each entry becomes a numbered paragraph whose word links to its Youdao
    dictionary page, followed by the frequency in parentheses.

    :param lst: list of (word, count) tuples
    :param fname: output file path
    '''
    parts = []
    for count, (word, frequency) in enumerate(lst, start=1):
        # <a href="">word</a>
        parts.append('<p>%d <a href="%s">%s</a> (%d)</p>' % (count, youdao_link(word), word, frequency))
    # Join once (avoids quadratic string +=) and write with an explicit
    # encoding so non-ASCII words do not break on platforms whose default
    # encoding is not UTF-8. The context manager closes the file on error.
    with open(fname, 'w', encoding='utf-8') as f:
        f.write(''.join(parts))
|
||||
|
||||
|
||||
class WordFreq:
    '''
    Word-frequency processor that accumulates all-time counts in a pickle.

    Current-run frequencies are converted to lists of timestamps (one per
    occurrence) and merged into the cumulative record stored in
    ``self.pickle_file``.
    '''

    def __init__(self):
        # Pickle file that stores the cumulative word -> [timestamps] record.
        self.pickle_file = 'frequency.p'  # Add this to store cumulative data

    def process_file(self, filename):
        '''
        Process one input file and merge its word counts into the
        cumulative pickle record.

        NOTE(review): the word-processing code that populates ``self.freq``
        is elided here ("... existing ... code ..."); this method assumes
        ``self.freq`` maps word -> occurrence count — confirm against the
        full class definition.
        '''
        # ... existing word processing code ...

        # Convert current word frequencies to timestamp format:
        # one identical timestamp string per occurrence of the word.
        current_words = {}
        timestamp = datetime.now().strftime('%Y%m%d%H%M')
        for word, freq in self.freq.items():
            current_words[word] = [timestamp] * freq  # Create list of timestamps for each occurrence

        # Load existing cumulative data; start fresh if the pickle is
        # missing or empty.
        try:
            cumulative_data = load_record(self.pickle_file)
        except (FileNotFoundError, EOFError):
            cumulative_data = {}

        # Merge current words with historical data (append timestamps to
        # known words, insert unknown words).
        for word, timestamps in current_words.items():
            if word in cumulative_data:
                cumulative_data[word].extend(timestamps)
            else:
                cumulative_data[word] = timestamps

        # Save updated data back to the cumulative pickle.
        save_frequency_to_pickle(cumulative_data, self.pickle_file)

    def show_results(self):
        '''
        Print the top cumulative word frequencies to stdout.

        NOTE(review): the per-run result display is elided here
        ("... existing code ..."); only the cumulative display is visible.
        '''
        # ... existing code ...

        # Add cumulative frequency display
        print("\nCumulative Frequencies (all-time):")
        try:
            cumulative_data = load_record(self.pickle_file)
            # Sort by cumulative frequency (length of timestamp list)
            sorted_words = sorted(cumulative_data.items(),
                                  key=lambda x: len(x[1]),
                                  reverse=True)

            for word, timestamps in sorted_words[:20]:  # Show top 20
                print(f"{word}: {len(timestamps)} times")
        except (FileNotFoundError, EOFError):
            print("No cumulative data available yet")
|
||||
|
||||
|
||||
## main (program entry point)
if __name__ == '__main__':
    num = len(sys.argv)

    if num == 1:  # no argument: read the text from standard input
        s = input()
    elif num == 2:  # one argument: read the text from the named file
        fname = sys.argv[1]
        s = file2str(fname)
    else:
        print('I can accept at most 2 arguments.')
        sys.exit()  # stop here; the code below is not executed

    s = remove_punctuation(s)  # s is the cleaned input text
    L = freq(s)
    # Print each word with its frequency and a Youdao dictionary link.
    for x in sort_in_descending_order(L):
        print('%s\t%d\t%s' % (x[0], x[1], youdao_link(x[0])))

    # Save the frequency results to result.html.
    make_html_page(sort_in_descending_order(L), 'result.html')

    print('\nHistory:\n')
    # Load the cumulative history pickle if it exists.
    if os.path.exists('frequency.p'):
        d = pickle_idea.load_record('frequency.p')
    else:
        d = {}

    print(sort_in_descending_order(pickle_idea.dict2lst(d)))

    # Merge this run's frequencies into the history and save it back.
    lst_history = pickle_idea.dict2lst(d)
    d = pickle_idea.merge_frequency(L, lst_history)
    pickle_idea.save_frequency_to_pickle(d, 'frequency.p')
|
||||
|
||||
|
||||
|
Loading…
Reference in New Issue