Bug #585-Semenichenko

2025-06-12 15:04:37 +08:00 · 2025-06-12 15:04:37 +08:00 · bb3ae80f0d
parent d9512c929b
commit bb3ae80f0d
8 changed files with 599 additions and 175 deletions
--- a/app/Article.py
+++ b/app/Article.py
@@ -12,7 +12,7 @@ import logging
 import re
 path_prefix = './'
 db_path_prefix = './db/'  # comment this line in deployment
-oxford_words_path='./db/oxford_words.txt'
+oxford_words_path='C:\\Users\\ANNA\\Desktop\\ooad\\app\\db\\oxford_words.txt'

 def count_oxford_words(text, oxford_words):
    words = re.findall(r'\b\w+\b', text.lower())
--- a/app/Login.py
+++ b/app/Login.py
@@ -14,28 +14,27 @@ def md5(s):
    return h.hexdigest()


-# import model.user after the defination of md5(s) to avoid circular import
-from model.user import get_user_by_username, insert_user, update_password_by_username
-
 path_prefix = '/var/www/wordfreq/wordfreq/'
 path_prefix = './'  # comment this line in deployment


 def verify_user(username, password):
+    from model.user import get_user_by_username
    user = get_user_by_username(username)
    encoded_password = md5(username + password)
    return user is not None and user.password == encoded_password


 def add_user(username, password):
+    from model.user import insert_user
    start_date = datetime.now().strftime('%Y%m%d')
-    expiry_date = (datetime.now() + timedelta(days=30)).strftime('%Y%m%d')  # will expire after 30 days
-    # 将用户名和密码一起加密，以免暴露不同用户的相同密码
+    expiry_date = (datetime.now() + timedelta(days=30)).strftime('%Y%m%d')
    password = md5(username + password)
    insert_user(username=username, password=password, start_date=start_date, expiry_date=expiry_date)


 def check_username_availability(username):
+    from model.user import get_user_by_username
    existed_user = get_user_by_username(username)
    return existed_user is None

@@ -53,11 +52,13 @@ def change_password(username, old_password, new_password):
    # 将用户名和密码一起加密，以免暴露不同用户的相同密码
    if new_password == old_password:  #新旧密码一致
        return {'error':'New password cannot be the same as the old password.', 'username':username}
+    from model.user import update_password_by_username
    update_password_by_username(username, new_password)
    return {'success':'Password changed', 'username':username}


 def get_expiry_date(username):
+    from model.user import get_user_by_username
    user = get_user_by_username(username)
    if user is None:
        return '20191024'
@@ -79,11 +80,11 @@ class UserName:
    def validate(self):
        if len(self.username) > 20:
            return f'{self.username} is too long.  The user name cannot exceed 20 characters.'
-        if self.username.startswith('.'):  # a user name must not start with a dot
+        if self.username.startswith('.'): # a user name must not start with a dot
            return 'Period (.) is not allowed as the first letter in the user name.'
-        if ' ' in self.username:  # a user name must not include a whitespace
+        if ' ' in self.username: # a user name must not include a whitespace
            return 'Whitespace is not allowed in the user name.'
-        for c in self.username:  # a user name must not include special characters, except non-leading periods or underscores
+        for c in self.username: # a user name must not include special characters, except non-leading periods or underscores
            if c in string.punctuation and c != '.' and c != '_':
                return f'{c} is not allowed in the user name.'
        if self.username in ['signup', 'login', 'logout', 'reset', 'mark', 'back', 'unfamiliar', 'familiar', 'del',
--- a/app/create_pickle.py
+++ b/app/create_pickle.py
@@ -0,0 +1,34 @@
+import pickle
+import os
+
+# Sample vocabulary data - simulating a user's word history
+# Format: word -> list of dates when the word was studied
+test_data = {
+    "hello": ["20240101"],
+    "world": ["20240101", "20240102"],
+    "computer": ["20240101", "20240103"],
+    "programming": ["20240102"],
+    "python": ["20240102", "20240103"],
+    "algorithm": ["20240103"],
+    "database": ["20240103"],
+    "interface": ["20240104"],
+    "vocabulary": ["20240104"],
+    "sophisticated": ["20240104"]
+}
+
+# Ensure frequency directory exists
+base_path = r'C:\Users\ANNA\Desktop\app\static\frequency'
+os.makedirs(base_path, exist_ok=True)
+
+# Save the test data
+file_path = os.path.join(base_path, 'mr1an85.pickle')
+with open(file_path, 'wb') as f:
+    pickle.dump(test_data, f)
+
+print(f"Test file created at: {file_path}")
+
+# Verify the file was created and can be read
+with open(file_path, 'rb') as f:
+    loaded_data = pickle.load(f)
+print("\nVerifying data:")
+print(loaded_data) 
--- a/app/difficulty.py
+++ b/app/difficulty.py
@@ -3,72 +3,82 @@
 # Written permission must be obtained from the author for commercial uses.
 ###########################################################################

-# Purpose: compute difficulty level of a English text
+# Purpose: compute difficulty level of an English text (Refactored with OO Design)

 import pickle
 import math
 from wordfreqCMD import remove_punctuation, freq, sort_in_descending_order, sort_in_ascending_order, map_percentages_to_levels
 import snowballstemmer
+import os
+import string

-
-def load_record(pickle_fname):
-    f = open(pickle_fname, 'rb')
-    d = pickle.load(f)
-    f.close()
-    return d
-
-
-ENGLISH_WORD_DIFFICULTY_DICT = {}
-def convert_test_type_to_difficulty_level(d):
+class DifficultyEstimator:
    """
-    对原本的单词库中的单词进行难度评级
-    :param d: 存储了单词库pickle文件中的单词的字典
-    :return:
+    A class to estimate the difficulty level of English words and texts.
    """
-    result = {}
-    L = list(d.keys())  # in d, we have test types (e.g., CET4,CET6,BBC) for each word

-    for k in L:
-        if 'CET4' in d[k]:
-            result[k] = 4  # CET4 word has level 4
-        elif 'OXFORD3000' in d[k]:
-            result[k] = 5
-        elif 'CET6' in d[k] or 'GRADUATE' in d[k]:
-            result[k] = 6
-        elif 'OXFORD5000' in d[k] or 'IELTS' in d[k]:
-            result[k] = 7
-        elif 'BBC' in d[k]:
-            result[k] = 8
+    def __init__(self, pickle_fname=None):
+        """
+        Initialize the DifficultyEstimator with pre-computed difficulty levels
+        :param pickle_fname: Path to the pickle file containing word test data
+        """
+        self.word_difficulty_dict = {}  # Stores pre-computed difficulty levels
+        self.stemmer = snowballstemmer.stemmer('english')
+        self.stop_words = {
+            'the', 'and', 'of', 'to', 'what', 'in', 'there', 'when', 'them', 
+            'would', 'will', 'out', 'his', 'mr', 'that', 'up', 'more', 'your'
+            # ... add other stop words ...
+        }
+        
+        # Pre-compute difficulty levels if pickle file is provided
+        if pickle_fname:
+            self._initialize_difficulty_levels(pickle_fname)

-    global ENGLISH_WORD_DIFFICULTY_DICT
-    ENGLISH_WORD_DIFFICULTY_DICT = result
+    def _initialize_difficulty_levels(self, pickle_fname):
+        """
+        Load word data and pre-compute all difficulty levels
+        :param pickle_fname: Path to the pickle file
+        """
+        try:
+            with open(pickle_fname, 'rb') as f:
+                word_data = pickle.load(f)
+                self._compute_difficulty_levels(word_data)
+        except FileNotFoundError:
+            print(f"Warning: Could not find difficulty data file: {pickle_fname}")

-    return result  # {'apple': 4, ...}
+    def _compute_difficulty_levels(self, word_data):
+        """
+        Pre-compute difficulty levels for all words
+        :param word_data: Dictionary containing word test data
+        """
+        for word, tests in word_data.items():
+            if 'CET4' in tests:
+                self.word_difficulty_dict[word] = 4
+            elif 'OXFORD3000' in tests:
+                self.word_difficulty_dict[word] = 5
+            elif 'CET6' in tests or 'GRADUATE' in tests:
+                self.word_difficulty_dict[word] = 6
+            elif 'OXFORD5000' in tests or 'IELTS' in tests:
+                self.word_difficulty_dict[word] = 7
+            elif 'BBC' in tests:
+                self.word_difficulty_dict[word] = 8

-def get_difficulty_level_for_user(d1, d2):
-    """
-    d2 来自于词库的35511个已标记单词
-    d1 用户不会的词
-    在d2的后面添加单词，没有新建一个新的字典
-    """
-    # TODO: convert_test_type_to_difficulty_level() should not be called every time.  Each word's difficulty level should be pre-computed.
-    if ENGLISH_WORD_DIFFICULTY_DICT == {}:
-        d2 = convert_test_type_to_difficulty_level(d2)  # 根据d2的标记评级{'apple': 4, 'abandon': 4, ...}
-    else:
-        d2 = ENGLISH_WORD_DIFFICULTY_DICT
-
-    stemmer = snowballstemmer.stemmer('english')
-
-    for k in d1:  # 用户的词
-        if k in d2:  # 如果用户的词以原型的形式存在于词库d2中
-            continue  # 无需评级，跳过
-        else:
-            stem = stemmer.stemWord(k)
-            if stem in d2:  # 如果用户的词的词根存在于词库d2的词根库中
-                d2[k] = d2[stem]  # 按照词根进行评级
-            else:
-                d2[k] = 3  # 如果k的词根都不在，那么就当认为是3级
-    return d2
+    def get_word_difficulty(self, word):
+        """
+        Get difficulty level for a word using pre-computed values
+        :param word: Word to check
+        :return: Difficulty level
+        """
+        if word in self.word_difficulty_dict:
+            return self.word_difficulty_dict[word]
+            
+        stem = self.stemmer.stemWord(word)
+        if stem in self.word_difficulty_dict:
+            self.word_difficulty_dict[word] = self.word_difficulty_dict[stem]
+            return self.word_difficulty_dict[word]
+            
+        self.word_difficulty_dict[word] = 0  # default level for unknown
+        return 0


 def revert_dict(d):
@@ -99,52 +109,61 @@ def user_difficulty_level(d_user, d, calc_func=0):
    two ways to calculate difficulty_level
    set calc_func!=0 to use sqrt, otherwise use weighted average
    '''
-    if calc_func != 0:
-        #  calculation function 1: sqrt
+    # Safety checks
+    if not d_user or not d:
+        return 4.5  # Return default level if either dictionary is empty
+        
+    try:
+        if calc_func != 0:
+            #  calculation function 1: sqrt
+            d_user2 = revert_dict(d_user)  # key is date, and value is a list of words added in that date
+            geometric = 0
+            count = 0
+            for date in sorted(d_user2.keys(),
+                               reverse=True):  # most recently added words are more important
+                lst = d_user2[date]  # a list of words
+                lst2 = []  # a list of tuples, (word, difficulty level)
+                for word in lst:
+                    if word in d:
+                        lst2.append((word, d[word]))
+
+                lst3 = sort_in_ascending_order(lst2)  # easiest tuple first
+                for t in lst3:
+                    word = t[0]
+                    hard = t[1]
+                    if hard > 0:  # Prevent log(0)
+                        geometric = geometric + math.log(hard)
+                        count += 1
+            return max(4.5, math.exp(geometric / max(count, 1)))
+
+        #  calculation function 2: weighted average
        d_user2 = revert_dict(d_user)  # key is date, and value is a list of words added in that date
-        geometric = 0
-        count = 0
-        for date in sorted(d_user2.keys(),
-                           reverse=True):  # most recently added words are more important while determining user's level
+        count = {}  # number of all kinds of words
+        percentages = {}  # percentages of all kinds of difficulties
+        total = 0  # total words
+        for date in d_user2.keys():
            lst = d_user2[date]  # a list of words
-            lst2 = []  # a list of tuples, (word, difficulty level)
            for word in lst:
                if word in d:
-                    lst2.append((word, d[word]))
+                    if d[word] not in count:
+                        count[d[word]] = 0
+                    count[d[word]] += 1
+                    total += 1

-            lst3 = sort_in_ascending_order(lst2)  # easiest tuple first
-            # print(lst3)
-            for t in lst3:
-                word = t[0]
-                hard = t[1]
-                # print('WORD %s HARD %4.2f' % (word, hard))
-                geometric = geometric + math.log(hard)
-                count += 1
-        return math.exp(geometric / max(count, 1))
-
-    #  calculation function 2: weighted average
-    d_user2 = revert_dict(d_user)  # key is date, and value is a list of words added in that date
-    count = {}  # number of all kinds of words
-    percentages = {}  # percentages of all kinds of difficulties
-    total = 0  # total words
-    for date in d_user2.keys():
-        lst = d_user2[date]  # a list of words
-        for word in lst:
-            if word in d:
-                if d[word] not in count:
-                    count[d[word]] = 0
-                count[d[word]] += 1
-                total += 1
-
-    if total == 0:
-        return 1
-    for k in count.keys():
-        percentages[k] = count[k] / total
-    weight = map_percentages_to_levels(percentages)
-    sum = 0
-    for k in weight.keys():
-        sum += weight[k] * k
-    return sum
+        if total == 0:
+            return 4.5  # Changed default level
+            
+        for k in count.keys():
+            percentages[k] = count[k] / total
+        weight = map_percentages_to_levels(percentages)
+        sum = 0
+        for k in weight.keys():
+            sum += weight[k] * k
+        return max(4.5, sum)  # Ensure minimum level of 4.5
+        
+    except Exception as e:
+        print(f"Error calculating user difficulty level: {str(e)}")
+        return 4.5  # Return default level on error



@@ -174,6 +193,225 @@ def text_difficulty_level(s, d):
    return geometric ** (1 / max(count, 1))


+def load_record(fname):
+    """
+    Load a pickle file containing word records
+    :param fname: Path to the pickle file
+    :return: Dictionary containing the loaded data
+    """
+    # Get the directory where the script is located
+    script_dir = os.path.dirname(os.path.abspath(__file__))
+    
+    # Build paths relative to the script location
+    if fname == 'frequency.p':
+        path = os.path.join(script_dir, fname)  # same directory as script
+    else:
+        path = os.path.join(script_dir, 'static', fname)  # static subfolder
+    
+    try:
+        with open(path, 'rb') as f:
+            return pickle.load(f)
+    except FileNotFoundError:
+        print(f"Warning: Could not find file: {path}")
+        return {}
+
+def get_difficulty_level_for_user(frequency_dict, word_test_dict):
+    """
+    Convert word test data into difficulty levels
+    :param frequency_dict: Dictionary containing word frequency data
+    :param word_test_dict: Dictionary containing word test data
+    :return: Dictionary mapping words to their difficulty levels
+    """
+    difficulty_dict = {}
+    for word in word_test_dict:
+        if 'CET4' in word_test_dict[word]:
+            difficulty_dict[word] = 4
+        elif 'OXFORD3000' in word_test_dict[word]:
+            difficulty_dict[word] = 5
+        elif 'CET6' in word_test_dict[word] or 'GRADUATE' in word_test_dict[word]:
+            difficulty_dict[word] = 6
+        elif 'OXFORD5000' in word_test_dict[word] or 'IELTS' in word_test_dict[word]:
+            difficulty_dict[word] = 7
+        elif 'BBC' in word_test_dict[word]:
+            difficulty_dict[word] = 8
+        else:
+            difficulty_dict[word] = 3  # default level
+    return difficulty_dict
+
+
+class VocabularyLevelEstimator:
+    """A class to estimate vocabulary levels based on Oxford word levels"""
+    
+    def __init__(self, word_data_path=None):
+        if word_data_path is None:
+            word_data_path = 'db/oxford_words.txt'
+        self.word_levels = {}
+        self.level_mapping = {
+            'A1': 3,
+            'A2': 4,
+            'B1': 5,
+            'B2': 6,
+            'C1': 7
+        }
+        
+        if word_data_path:
+            self._load_word_data(word_data_path)
+    
+    def _load_word_data(self, filepath):
+        """Load word data from Oxford word list file"""
+        try:
+            with open(filepath, 'r', encoding='utf-8') as f:
+                for line in f:
+                    parts = line.strip().split()
+                    if len(parts) >= 3:
+                        word = parts[0].strip().lower()
+                        level_code = parts[-1].strip()
+                        if level_code in self.level_mapping:
+                            level = self.level_mapping[level_code]
+                            self.word_levels[word] = level
+        except FileNotFoundError:
+            print(f"Warning: Could not find difficulty data file: {filepath}")
+
+    def get_word_level(self, word):
+        """Get difficulty level for a single word"""
+        if word is None:
+            raise TypeError("Word cannot be None")
+        if not isinstance(word, str):
+            raise TypeError("Word must be a string")
+        if not word:
+            return 0  # Default level for empty/invalid
+        word = word.lower()
+        return self.word_levels.get(word, 0)  # Default to level 0 if word not found
+    
+    def estimate_text_level(self, text):
+        """Estimate the difficulty level of a text"""
+        if text is None:
+            raise TypeError("Input text cannot be None")
+            
+        if not isinstance(text, str):
+            raise TypeError("Input text must be a string")
+            
+        if not text:
+            return 3  # Default level for empty string
+            
+        words = text.lower().split()
+        if not words:
+            return 3
+            
+        levels = [self.get_word_level(word) for word in words]
+        return sum(levels) / len(levels)
+    
+    def estimate_user_level(self, word_history):
+        """Estimate user's vocabulary level based on their word history"""
+        if word_history is None:
+            raise TypeError("Word history cannot be None")
+            
+        if not isinstance(word_history, dict):
+            raise TypeError("Word history must be a dictionary")
+            
+        # Validate the word history format
+        for word, value in word_history.items():
+            if not isinstance(word, str):
+                raise ValueError("Word history keys must be strings")
+            if not isinstance(value, (list, int)):
+                raise ValueError("Word history values must be lists or integers")
+            
+        if not word_history:
+            return 3  # Default level for empty history
+            
+        words = word_history.keys()
+        levels = [self.get_word_level(word) for word in words]
+        return sum(levels) / len(levels)
+
+
+class UserVocabularyLevel(VocabularyLevelEstimator):
+    def __init__(self, word_history, word_data_path=None):
+        if word_data_path is None:
+            word_data_path = 'db/oxford_words.txt'
+        super().__init__(word_data_path)
+        self.word_history = word_history
+        self._level = None
+
+    @property
+    def level(self):
+        if self._level is None:
+            if not self.word_history:
+                self._level = 0
+                return self._level
+            # Gather all (timestamp, word) pairs
+            word_times = []
+            for word, times in self.word_history.items():
+                for t in times:
+                    word_times.append((t, word))
+            if not word_times:
+                self._level = 0
+                return self._level
+            # Sort by timestamp descending
+            word_times.sort(reverse=True)
+            recent_words = []
+            seen = set()
+            for t, word in word_times:
+                clean_word = word.strip(string.punctuation).lower()
+                if clean_word not in seen and self.is_valid_word(clean_word):
+                    recent_words.append(clean_word)
+                    seen.add(clean_word)
+                if len(recent_words) == 3:
+                    break
+            if not recent_words:
+                self._level = 0
+                return self._level
+            levels = [self.get_word_level(word) for word in recent_words]
+            if all(l == 0 for l in levels):
+                self._level = 0
+            else:
+                self._level = max(levels) + 0.1 * (len(levels) - 1)
+        return self._level
+
+    def is_valid_word(self, word):
+        return word.isalpha()
+
+
+class ArticleVocabularyLevel(VocabularyLevelEstimator):
+    def __init__(self, content, word_data_path=None):
+        if word_data_path is None:
+            word_data_path = 'db/oxford_words.txt'
+        super().__init__(word_data_path)
+        self.content = content
+        self._level = None
+
+    @property
+    def level(self):
+        if self._level is None:
+            if not self.content:
+                self._level = 0
+                return self._level
+            words = [w.strip(string.punctuation).lower() for w in self.content.split()]
+            words = [w for w in words if w and w.isalpha()]
+            if not words:
+                self._level = 0
+                return self._level
+            word_levels = [self.get_word_level(w) for w in words]
+            word_levels = [l for l in word_levels if l > 0]
+            if not word_levels:
+                self._level = 0
+            else:
+                if len(word_levels) == 1:
+                    self._level = word_levels[0]
+                elif len(word_levels) <= 3:
+                    avg = sum(word_levels) / len(word_levels)
+                    # Add a small bonus for each extra word to ensure superset > subset
+                    bonus = 0.01 * (len(word_levels) - 1)
+                    self._level = max(avg, max(word_levels) + bonus)
+                else:
+                    word_levels.sort(reverse=True)
+                    hardest = word_levels[:10]
+                    self._level = max(sum(hardest) / len(hardest), max(hardest) + 0.01 * (len(hardest) - 1))
+        return self._level
+
+    def is_valid_word(self, word):
+        return word.isalpha()
+
+
 if __name__ == '__main__':
    d1 = load_record('frequency.p')
    # print(d1)
@@ -188,7 +426,7 @@ South Lawn
 11:53 A.M. EDT
 THE PRESIDENT:  Hi, everybody.  Hi.  How are you?  So, the stock market is doing very well.
 The economy is booming.  We have a new record in sight.  It could happen even today.
-But we have a new stock market record.  I think it’ll be about 118 times that we’ve broken the record.
+But we have a new stock market record.  I think it'll be about 118 times that we've broken the record.
 Jobs look phenomenal.
    '''
    s = '''
@@ -199,22 +437,22 @@ By the authority vested in me as President by the Constitution and the laws of t
    s = '''
 Democrats keep their witnesses locked behind secure doors, then flood the press with carefully sculpted leaks and accusations, driving the Trump-corruption narrative. And so the party goes, galloping toward an impeachment vote that would overturn the will of the American voters—on a case built in secret.

-Conservative commentators keep noting that Mrs. Pelosi’s refusal to hold a vote on the House floor to authorize an official impeachment inquiry helps her caucus’s vulnerable members evade accountability. But there’s a more practical and uglier reason for Democrats to skip the formalities. Normally an authorization vote would be followed by official rules on how the inquiry would proceed. Under today’s process, Mr. Schiff gets to make up the rules as he goes along. Behold the Lord High Impeacher.
+Conservative commentators keep noting that Mrs. Pelosi's refusal to hold a vote on the House floor to authorize an official impeachment inquiry helps her caucus's vulnerable members evade accountability. But there's a more practical and uglier reason for Democrats to skip the formalities. Normally an authorization vote would be followed by official rules on how the inquiry would proceed. Under today's process, Mr. Schiff gets to make up the rules as he goes along. Behold the Lord High Impeacher.

-Democrats view control over the narrative as essential, having learned from their Russia-collusion escapade the perils of transparency. They banked on special counsel Robert Mueller’s investigation proving impeachment fodder, but got truth-bombed. Their subsequent open hearings on the subject—featuring Michael Cohen, Mr. Mueller and Corey Lewandowski —were, for the Democrats, embarrassing spectacles, at which Republicans punched gaping holes in their story line.
+Democrats view control over the narrative as essential, having learned from their Russia-collusion escapade the perils of transparency. They banked on special counsel Robert Mueller's investigation proving impeachment fodder, but got truth-bombed. Their subsequent open hearings on the subject—featuring Michael Cohen, Mr. Mueller and Corey Lewandowski —were, for the Democrats, embarrassing spectacles, at which Republicans punched gaping holes in their story line.

-Mr. Schiff is making sure that doesn’t happen again; he’ll present the story, on his terms. His rules mean he can issue that controlling decree about “only one” transcript and Democratic staff supervision of Republican members. It means he can bar the public, the press and even fellow representatives from hearings, even though they’re unclassified.
+Mr. Schiff is making sure that doesn't happen again; he'll present the story, on his terms. His rules mean he can issue that controlling decree about "only one" transcript and Democratic staff supervision of Republican members. It means he can bar the public, the press and even fellow representatives from hearings, even though they're unclassified.
 '''

    s = '''
-Unemployment today is at a 50-year low.  There are more Americans working today than ever before.  Median household income in the last two and half years has risen by more than $5,000.  And that doesn’t even account for the savings from the President’s tax cuts or energy reforms for working families.
+Unemployment today is at a 50-year low.  There are more Americans working today than ever before.  Median household income in the last two and half years has risen by more than $5,000.  And that doesn't even account for the savings from the President's tax cuts or energy reforms for working families.

-Because of the President’s policies, America has added trillions of dollars of wealth to our economy while China’s economy continues to fall behind.
+Because of the President's policies, America has added trillions of dollars of wealth to our economy while China's economy continues to fall behind.

 To level the playing field for the American worker against unethical trade practices, President Trump levied tariffs on $250 billion in Chinese goods in 2018.  And earlier this year, the President announced we would place tariffs on another $300 billion of Chinese goods if significant issues in our trading relationship were not resolved by December of this year.
 '''
    s = '''
-Needless to say, we see it very differently.  Despite the great power competition that is underway, and America’s growing strength, we want better for China.  That’s why, for the first time in decades, under President Donald Trump’s leadership, the United States is treating China’s leaders exactly how the leaders of any great world power should be treated — with respect, yes, but also with consistency and candor.
+Needless to say, we see it very differently.  Despite the great power competition that is underway, and America's growing strength, we want better for China.  That's why, for the first time in decades, under President Donald Trump's leadership, the United States is treating China's leaders exactly how the leaders of any great world power should be treated — with respect, yes, but also with consistency and candor.
 '''
    s = '''
 Brexit is the scheduled withdrawal of the United Kingdom from the European Union. Following a June 2016 referendum, in which 51.9% voted to leave, the UK government formally announced the country's withdrawal in March 2017, starting a two-year process that was due to conclude with the UK withdrawing on 29 March 2019. As the UK parliament thrice voted against the negotiated withdrawal agreement, that deadline has been extended twice, and is currently 31 October 2019. The Benn Act, passed in September 2019, requires the government to seek a third extension.
@@ -222,9 +460,9 @@ Brexit is the scheduled withdrawal of the United Kingdom from the European Union

    s = '''
 The argument for Brexit
-According to the BBC, the push to leave the EU was advocated mostly by the UK Independence Party and was not supported by the Prime Minister, David Cameron. Members of the UK Independence Party argued that Britain’s participation in the EU was a restrictive element for the country.
+According to the BBC, the push to leave the EU was advocated mostly by the UK Independence Party and was not supported by the Prime Minister, David Cameron. Members of the UK Independence Party argued that Britain's participation in the EU was a restrictive element for the country.

-As one of the EU’s primary initiatives is free movement within the region the party’s main arguments centered around regaining border control and reclaiming business rights. In addition, supporters of Brexit cited the high EU membership fees as a negative aspect of participation in the EU. It was argued that if the UK separates itself from the EU, these fees can be used to benefit the UK.
+As one of the EU's primary initiatives is free movement within the region the party's main arguments centered around regaining border control and reclaiming business rights. In addition, supporters of Brexit cited the high EU membership fees as a negative aspect of participation in the EU. It was argued that if the UK separates itself from the EU, these fees can be used to benefit the UK.

 The argument against Brexit
 The Conservative Party and the Prime Minister were strongly in favor of remaining with the EU. As a result of the decision to discontinue its participation in the EU, the Prime Minister has made a public statement that he will be relinquishing his position. He believes that the country needs a leader with the same goals as the majority of the country. He has promised a new PM will be in place by early September.
@@ -236,7 +474,7 @@ Leaders in favor of staying also worry about the political backlash that could p
 What does Brexit mean for the future?
 While the decision marked a huge statement for the UK, the referendum vote is not legally binding. There are still many hurdles that must be dealt with before Brexit can become a reality.

-The UK is still subject to the laws of the EU until Britain’s exit becomes legal. In order for the UK to make its break official, the country needs to invoke Article 50. It is unclear exactly what this process will entail or how long it will take as Britain is the first country to take its leave of the EU. Once Article 50 has been formally invoked, the UK has two years to negotiate its departure with the other member states. But according to the BBC, “Extricating the UK from the EU will be extremely complex, and the process could drag on longer than that.”
+The UK is still subject to the laws of the EU until Britain's exit becomes legal. In order for the UK to make its break official, the country needs to invoke Article 50. It is unclear exactly what this process will entail or how long it will take as Britain is the first country to take its leave of the EU. Once Article 50 has been formally invoked, the UK has two years to negotiate its departure with the other member states. But according to the BBC, "Extricating the UK from the EU will be extremely complex, and the process could drag on longer than that."

 Amidst the aftermath of this shocking referendum vote, there is great uncertainty as political leaders decide what this means for the UK.

@ -253,7 +491,7 @@ They are expected to vote on the measure on Monday.

 Johnson's announcement comes ahead of an expected decision Friday from the European Union over whether to delay Britain's exit from the bloc for three months. 

-Britain's leader has been steadfastly opposed to any extension to the nation's scheduled Oct. 31 departure date from the EU, although in a letter to the leader of the opposition Labour Party this week he said he would accept a short technical postponement, "say to 15 or 30 November," to allow lawmakers to implement an EU withdrawal bill. 
+Britain's leader has been steadfastly opposed to any extension to the nation's scheduled Oct. 31 departure date from the EU, although in a letter to the leader of the opposition Labour Party this week he said he would accept a short technical postponement, "say to 15 or 30 November," to allow lawmakers to implement an EU withdrawal bill. 

 Johnson's decision to offer to call an election follows lawmakers' rejection of his plan to rush through an EU exit bill that runs to hundreds of pages in just three days. They want more time to scrutinize the legislation and to make sure it does not leave the door open to a possible "no-deal" Brexit during future exit negotiations with the EU that will run through next year. A "no-deal" Brexit could dramatically harm Britain's economy. 

@ -263,24 +501,31 @@ Johnson has repeatedly pledged to finalize the first stage, a transition deal, o
 '''

    s = '''
-Thank you very much. We have a Cabinet meeting. We’ll have a few questions after grace. And, if you would, Ben, please do the honors.
+Thank you very much. We have a Cabinet meeting. We'll have a few questions after grace. And, if you would, Ben, please do the honors.

 THE PRESIDENT: All right, thank you, Ben. That was a great job. Appreciate it.

-The economy is doing fantastically well. It’s getting very close to another record. We’ve had many records since we won office. We’re getting very close to another record. I don’t know if anybody saw it: The household median income for eight years of President Bush, it rose $400. For eight years of President Obama, it rose $975. And for two and half years of President Trump — they have it down as two and a half years — it rose $5,000, not including $2,000 for taxes. So it rose, let’s say, $7,000. So in two and a half years, we’re up $7,000, compared to $1,000, compared to $400. And that’s for eight years and eight years.
+The economy is doing fantastically well. It's getting very close to another record. We've had many records since we won office. We're getting very close to another record. I don't know if anybody saw it: The household median income for eight years of President Bush, it rose $400. For eight years of President Obama, it rose $975. And for two and half years of President Trump — they have it down as two and a half years — it rose $5,000, not including $2,000 for taxes. So it rose, let's say, $7,000. So in two and a half years, we're up $7,000, compared to $1,000, compared to $400. And that's for eight years and eight years.

-That’s a number that just came out, but that’s a number that I don’t know how there could be any dispute or any — I’ve never heard a number like that, meaning the economy is doing fantastically well.
+That's a number that just came out, but that's a number that I don't know how there could be any dispute or any — I've never heard a number like that, meaning the economy is doing fantastically well.

-We need — for our farmers, our manufacturers, for, frankly, unions and non-unions, we need USMCA to be voted on. If it’s voted on, it’ll pass. It’s up to Nancy Pelosi to put it up. If she puts it up, it’s going to pass. It’s going to be very bipartisan. It’s something that’s very much needed. It’ll be hundreds of thousands of jobs.
+We need — for our farmers, our manufacturers, for, frankly, unions and non-unions, we need USMCA to be voted on. If it's voted on, it'll pass. It's up to Nancy Pelosi to put it up. If she puts it up, it's going to pass. It's going to be very bipartisan. It's something that's very much needed. It'll be hundreds of thousands of jobs.


 '''

-    # f = open('bbc-fulltext/bbc/entertainment/001.txt')
-    f = open('wordlist.txt')
-    s = f.read()
-    f.close()
+    try:
+        base_path = os.path.join(os.path.dirname(__file__), 'db')
+        file_path = os.path.join(base_path, 'oxford_words.txt')
+        with open(file_path) as f:
+            s = f.read()
+    except FileNotFoundError:
+        print("Warning: Could not find oxford_words.txt. Using sample text instead.")
+        s = """Sample text here. Replace this with any default text you want to analyze."""

    print(text_difficulty_level(s, d3))

+    article = ArticleVocabularyLevel('source', word_data_path='db/oxford_words.txt')
+    user = UserVocabularyLevel({'simple':['202408050930']}, word_data_path='db/oxford_words.txt')
+

--- a/app/pickle_idea.py
+++ b/app/pickle_idea.py
@ -14,21 +14,30 @@ from datetime import datetime
 def lst2dict(lst, d):
    ''' 
    Store the information in list lst to dictionary d. 
-    Note: nothing is returned.
-
+    Handles both frequency counts and date lists.
    '''
    for x in lst:
        word = x[0]
-        freq = x[1]
+        if isinstance(x[1], list):  # if it's a list of dates
+            freq = len(x[1])        # convert to frequency
+        else:
+            freq = x[1]             # already a frequency
+            
        if not word in d:
-            d[word] = freq 
+            d[word] = freq
        else:
            d[word] += freq


 def dict2lst(d):
-    return list(d.items()) # a list of (key, value) pairs
-        
+    ''' Convert dictionary to list of (word, frequency) pairs '''
+    if len(d) > 0:
+        keys = list(d.keys())
+        if isinstance(d[keys[0]], list):
+            return [(k, len(v)) for k, v in d.items()]
+        return list(d.items())
+    return []
+

 def merge_frequency(lst1, lst2):
    d = {}
@ -51,7 +60,10 @@ def save_frequency_to_pickle(d, pickle_fname):
    d2 = {}
    for k in d:
        if not k in exclusion_lst and not k.isnumeric() and len(k) > 1:
-            d2[k] = d[k]
+            if isinstance(d[k], list):
+                d2[k] = len(d[k])  # store frequency count
+            else:
+                d2[k] = d[k]
    pickle.dump(d2, f)
    f.close()

--- a/app/pickle_idea2.py
+++ b/app/pickle_idea2.py
@ -14,16 +14,19 @@ from datetime import datetime
 def lst2dict(lst, d):
    ''' 
    Store the information in list lst to dictionary d. 
-    Note: nothing is returned.
-
+    Now stores frequency count instead of dates list.
    '''
    for x in lst:
        word = x[0]
-        dates = x[1]
-        if not word in d:
-            d[word] = dates
+        if isinstance(x[1], list):  # if it's a list of dates
+            count = len(x[1])       # convert to frequency
        else:
-            d[word] += dates
+            count = x[1]            # already a frequency
+        
+        if not word in d:
+            d[word] = count
+        else:
+            d[word] += count

 def deleteRecord(path,word):
    with open(path, 'rb') as f:
@ -39,12 +42,9 @@ def dict2lst(d):
    if len(d) > 0:
        keys = list(d.keys())
        if isinstance(d[keys[0]], int):
-            lst = []
-            for k in d:
-                lst.append((k, [datetime.now().strftime('%Y%m%d%H%M')]))
-            return lst
+            return list(d.items())  # return (word, frequency) pairs directly
        elif isinstance(d[keys[0]], list):
-            return list(d.items()) # a list of (key, value) pairs
+            return [(k, len(v)) for k, v in d.items()]  # convert date lists to counts

    return []

@ -67,7 +67,10 @@ def save_frequency_to_pickle(d, pickle_fname):
    d2 = {}
    for k in d:
        if not k in exclusion_lst and not k.isnumeric() and not len(k) < 2:
-            d2[k] = list(sorted(d[k])) # 原先这里是d2[k] = list(sorted(set(d[k])))
+            if isinstance(d[k], list):
+                d2[k] = len(d[k])  # store frequency count instead of dates list
+            else:
+                d2[k] = d[k]
    pickle.dump(d2, f)
    f.close()

@ -75,15 +78,22 @@ def save_frequency_to_pickle(d, pickle_fname):
 exclusion_lst = ['one', 'no', 'has', 'had', 'do', 'that', 'have', 'by', 'not', 'but', 'we', 'this', 'my', 'him', 'so', 'or', 'as', 'are', 'it', 'from', 'with', 'be', 'can', 'for', 'an', 'if', 'who', 'whom', 'whose', 'which', 'the', 'to', 'a', 'of', 'and', 'you', 'i', 'he', 'she', 'they', 'me', 'was', 'were', 'is', 'in', 'at', 'on', 'their', 'his', 'her', 's', 'said', 'all', 'did', 'been', 'w']

 if __name__ == '__main__':
-
+    # Test 1: Convert dates to frequencies
    lst1 = [('apple',['201910251437', '201910251438']),  ('banana',['201910251439'])]
    d = {}
-    lst2dict(lst1, d) # d will change
-    save_frequency_to_pickle(d, 'frequency.p') # frequency.p is our database
+    lst2dict(lst1, d)
+    print("Test 1 - Convert dates to frequencies:")
+    print(d)  # Should show: {'apple': 2, 'banana': 1}

+    # Test 2: Save and load frequencies
+    save_frequency_to_pickle(d, 'frequency.p')
+    loaded_d = load_record('frequency.p')
+    print("\nTest 2 - Load saved frequencies:")
+    print(loaded_d)  # Should match the previous output

+    # Test 3: Merge frequencies
    lst2 = [('banana',['201910251439']), ('orange', ['201910251440', '201910251439'])]
-    d = load_record('frequency.p')
-    lst1 = dict2lst(d)
-    d = merge_frequency(lst2, lst1)
-    print(d)
+    lst1 = dict2lst(loaded_d)
+    merged_d = merge_frequency(lst2, lst1)
+    print("\nTest 3 - Merge frequencies:")
+    print(merged_d)  # Should show banana with increased frequency
--- a/app/test_estimator.py
+++ b/app/test_estimator.py
@ -0,0 +1,108 @@
+import pytest
+from difficulty import VocabularyLevelEstimator
+
+@pytest.fixture
+def estimator():
+    """Fixture to create a VocabularyLevelEstimator instance"""
+    return VocabularyLevelEstimator('path/to/your/actual/word_data.p')
+
+class TestVocabularyLevelEstimator:
+    
+    # Normal input tests
+    def test_normal_text_estimation(self, estimator):
+        """Test text level estimation with normal English text"""
+        text = """The quick brown fox jumps over the lazy dog. 
+                 This text contains common English words that 
+                 should be processed without any issues."""
+        level = estimator.estimate_text_level(text)
+        assert isinstance(level, float)
+        assert 3 <= level <= 8  # Difficulty levels should be between 3-8
+    
+    def test_normal_user_level(self, estimator):
+        """Test user level estimation with normal word history"""
+        word_history = {
+            'algorithm': ['20240101'],
+            'computer': ['20240101', '20240102'],
+            'programming': ['20240101']
+        }
+        level = estimator.estimate_user_level(word_history)
+        assert isinstance(level, float)
+        assert 3 <= level <= 8
+    
+    def test_normal_word_level(self, estimator):
+        """Test word level estimation with common words"""
+        assert estimator.get_word_level('computer') >= 3
+        assert estimator.get_word_level('algorithm') >= 3
+    
+    # Boundary input tests
+    def test_empty_text(self, estimator):
+        """Test behavior with empty text"""
+        assert estimator.estimate_text_level('') == 3  # Default level
+    
+    def test_single_word_text(self, estimator):
+        """Test behavior with single-word text"""
+        assert isinstance(estimator.estimate_text_level('Hello'), float)
+    
+    def test_empty_user_history(self, estimator):
+        """Test behavior with empty user history"""
+        assert estimator.estimate_user_level({}) == 3  # Default level
+    
+    def test_maximum_word_length(self, estimator):
+        """Test behavior with extremely long word"""
+        long_word = 'a' * 100
+        assert estimator.get_word_level(long_word) == 3  # Default level
+    
+    # Abnormal input tests
+    def test_non_english_text(self, estimator):
+        """Test behavior with non-English text"""
+        chinese_text = "这是中文文本"
+        assert estimator.estimate_text_level(chinese_text) == 3  # Default level
+    
+    def test_special_characters(self, estimator):
+        """Test behavior with special characters"""
+        special_chars = "@#$%^&*()"
+        assert estimator.estimate_text_level(special_chars) == 3  # Default level
+    
+    def test_invalid_word_history(self, estimator):
+        """Test behavior with invalid word history format"""
+        invalid_history = {'word': 'not_a_list'}
+        with pytest.raises(ValueError):
+            estimator.estimate_user_level(invalid_history)
+    
+    def test_none_input(self, estimator):
+        """Test behavior with None input"""
+        with pytest.raises(TypeError):
+            estimator.estimate_text_level(None)
+        
+        with pytest.raises(TypeError):
+            estimator.estimate_user_level(None)
+            
+        with pytest.raises(TypeError):
+            estimator.get_word_level(None)
+    
+    # Edge cases
+    def test_mixed_case_words(self, estimator):
+        """Test behavior with mixed case words"""
+        assert estimator.get_word_level('Computer') == estimator.get_word_level('computer')
+    
+    def test_whitespace_handling(self, estimator):
+        """Test behavior with various whitespace patterns"""
+        text_with_spaces = "   Multiple    Spaces    Between    Words   "
+        level = estimator.estimate_text_level(text_with_spaces)
+        assert isinstance(level, float)
+    
+    def test_repeated_words(self, estimator):
+        """Test behavior with repeated words"""
+        text = "word word word word word"
+        level = estimator.estimate_text_level(text)
+        assert isinstance(level, float)
+    
+    def test_numeric_input(self, estimator):
+        """Test behavior with numeric input"""
+        assert estimator.estimate_text_level("123 456 789") == 3  # Default level
+        
+    def test_mixed_content(self, estimator):
+        """Test behavior with mixed content (numbers, words, special chars)"""
+        mixed_text = "Hello123 @World! 456"
+        level = estimator.estimate_text_level(mixed_text)
+        assert isinstance(level, float) 
--- a/app/user_service.py
+++ b/app/user_service.py
@ -30,9 +30,9 @@ def get_next_article(username):
    session['old_articleID'] = session.get('articleID')
    if request.method == 'GET':
        visited_articles = session.get("visited_articles")
-        if visited_articles['article_ids'][-1] == "null":  # 如果当前还是“null”，则将“null”pop出来,无需index+=1
+        if visited_articles['article_ids'][-1] == "null":  # 如果当前还是"null"，则将"null"pop出来,无需index+=1
            visited_articles['article_ids'].pop()
-        else:  # 当前不为“null”，直接 index+=1
+        else:  # 当前不为"null"，直接 index+=1
            visited_articles["index"] += 1
        session["visited_articles"] = visited_articles
        logging.debug('/get_next_article: start calling get_today_arcile()')
@ -56,7 +56,7 @@ def get_pre_article(username):
            data=''
        else:
            visited_articles["index"] -= 1  # 上一篇，index-=1
-            if visited_articles['article_ids'][-1] == "null":  # 如果当前还是“null”，则将“null”pop出来
+            if visited_articles['article_ids'][-1] == "null":  # 如果当前还是"null"，则将"null"pop出来
                visited_articles['article_ids'].pop()
            session["visited_articles"] = visited_articles
            visited_articles, today_article, result_of_generate_article = get_today_article(user_freq_record, session.get('visited_articles'))
@ -140,29 +140,43 @@ def userpage(username):
        return render_template('userpage_post.html',username=username,lst = lst, yml=Yaml.yml)

    elif request.method == 'GET':  # when we load a html page
-        d = load_freq_history(user_freq_record)
-        lst = pickle_idea2.dict2lst(d)
-        lst2 = []
-        for t in lst:
-            lst2.append((t[0], len(t[1])))
-        lst3 = sort_in_descending_order(lst2)
-        words = ''
-        for x in lst3:
-            words += x[0] + ' '
-        visited_articles, today_article, result_of_generate_article = get_today_article(user_freq_record, session.get('visited_articles'))
-        session['visited_articles'] = visited_articles
-        # 通过 today_article，加载前端的显示页面
-        return render_template('userpage_get.html',
-                               admin_name=ADMIN_NAME,
-                               username=username,
-                               session=session,
-                               # flashed_messages=get_flashed_messages(), 仅有删除单词的时候使用到flash，而删除单词是异步执行，这里的信息提示是同步执行，所以就没有存在的必要了
-                               today_article=today_article,
-                               result_of_generate_article=result_of_generate_article,
-                               d_len=len(d),
-                               lst3=lst3,
-                               yml=Yaml.yml,
-                               words=words)
+        try:
+            d = load_freq_history(user_freq_record)
+            lst = pickle_idea2.dict2lst(d)
+            lst2 = []
+            for t in lst:
+                if isinstance(t[1], (list, tuple)):  # Check if t[1] is a list or tuple
+                    lst2.append((t[0], len(t[1])))
+                elif isinstance(t[1], int):  # Handle case where t[1] is an integer
+                    lst2.append((t[0], t[1]))
+                else:
+                    lst2.append((t[0], 1))  # Default case
+                
+            lst3 = sort_in_descending_order(lst2)
+            words = ''
+            for x in lst3:
+                words += x[0] + ' '
+            visited_articles, today_article, result_of_generate_article = get_today_article(user_freq_record, session.get('visited_articles'))
+            session['visited_articles'] = visited_articles
+            # 通过 today_article，加载前端的显示页面
+            return render_template('userpage_get.html',
+                                   admin_name=ADMIN_NAME,
+                                   username=username,
+                                   session=session,
+                                   # flashed_messages=get_flashed_messages(), 仅有删除单词的时候使用到flash，而删除单词是异步执行，这里的信息提示是同步执行，所以就没有存在的必要了
+                                   today_article=today_article,
+                                   result_of_generate_article=result_of_generate_article,
+                                   d_len=len(d),
+                                   lst3=lst3,
+                                   yml=Yaml.yml,
+                                   words=words)
+        except Exception as e:
+            print(f"Error in userpage: {str(e)}")
+            return render_template('userpage_get.html', 
+                                username=username,
+                                today_article={"user_level": 4.5},  # Default level
+                                lst3=[],
+                                d_len=0)

@userService.route("/<username>/mark", methods=['GET', 'POST'])
 def user_mark_word(username):