Added vocabulary.py; fixed bug 585 (likely) -- at least it works

2025-07-02 06:10:09 +08:00
6 changed files with 143 additions and 7 deletions
--- a/README.md
+++ b/README.md
@ -18,7 +18,7 @@ picked from articles selected for him to read according his vocabulary level.  E

 `python3 main.py`

-Make sure you have put the SQLite database file in the path `app/db` (see below).
+Make sure you have put the SQLite database file in the path `app/static` (see below).


 ## Run it as a Docker container
@ -214,5 +214,5 @@ Bug report: http://118.25.96.118/bugzilla/show_bug.cgi?id=215
 Bug report: http://118.25.96.118/bugzilla/show_bug.cgi?id=489


-*Last modified on 2026-03-12*
+*Last modified on 2023-01-30*

--- a/app/Article.py
+++ b/app/Article.py
@ -106,7 +106,7 @@ def get_today_article(user_word_list, visited_articles):
        text_level = text_difficulty_level(d['text'], d3)
        result_of_generate_article = "found"

-    today_article = {}
+    today_article = None
    if d:
        oxford_words = load_oxford_words(oxford_words_path)
        oxford_word_count, total_words = count_oxford_words(d['text'],oxford_words)
--- a/app/main.py
+++ b/app/main.py
@ -144,8 +144,8 @@ if __name__ == '__main__':
    运行程序
    '''
    # app.secret_key = os.urandom(16)
-    app.run(debug=True, port=5000)
-    # app.run(debug=True)
+    # app.run(debug=False, port='6000')
+    app.run(debug=True)
    # app.run(debug=True, port='6000')
    # app.run(host='0.0.0.0', debug=True, port='6000')
    # print(mod5('123'))
--- a/app/templates/mainpage_get.html
+++ b/app/templates/mainpage_get.html
@ -31,7 +31,7 @@
            <p><a href="/login">登录</a>  <a href="/signup">注册</a> <a href="/static/usr/instructions.html">使用说明</a></p >
            <p><b> {{ random_ads }}。 <a href="/signup">试试</a>吧！</b></p>
        {% endif %}
-        <div class="alert alert-success" role="alert">共有文章 <span class="badge bg-success"> {{ number_of_essays }} </span> 篇，Oxford 5000 单词占比 <span class="badge bg-success"> {{ (ratio * 100) | int }}%  </span> </div>
+        <div class="alert alert-success" role="alert">共有文章 <span class="badge bg-success"> {{ number_of_essays }} </span> 篇，覆盖 <span class="badge bg-success"> {{ (ratio * 100) | int }}%  </span> 的 Oxford5000 单词</div>
        <p>粘贴1篇文章 (English only)</p>
        <form method="post" action="/">
            <textarea name="content" id="article" rows="10" cols="120"></textarea><br/>
--- a/app/templates/userpage_get.html
+++ b/app/templates/userpage_get.html
@ -87,7 +87,7 @@

    <div id="text-content">
        <div id="found">
-          <div class="alert alert-success" role="alert">According to your word list, your level is <span class="text-decoration-underline" id="user_level">{{ today_article["user_level"] }}</span>  and we have chosen an article with a difficulty level of <span class="text-decoration-underline" id="text_level">{{ today_article["text_level"] }}</span> for you. <span class="text-decoration-underline" id="ratio">{{ (today_article["ratio"] * 100) | int }}%</span> of the words in this article are in Oxford Word 5000.</div>
+          <div class="alert alert-success" role="alert">According to your word list, your level is <span class="text-decoration-underline" id="user_level">{{ today_article["user_level"] }}</span>  and we have chosen an article with a difficulty level of <span class="text-decoration-underline" id="text_level">{{ today_article["text_level"] }}</span> for you. The Oxford word coverage is <span class="text-decoration-underline" id="ratio">{{ (today_article["ratio"] * 100) | int }}%.</span></div>
            <p class="text-muted" id="date">Article added on: {{ today_article["date"] }}</p><br/>

 	    <button onclick="saveArticle()" >标记文章</button>
--- a/app/vocabulary.py
+++ b/app/vocabulary.py
@ -0,0 +1,136 @@
+###########################################################################
+# Copyright 2019 (C) Hui Lan <hui.lan@cantab.net>
+# Written permission must be obtained from the author for commercial uses.
+###########################################################################
+
+# Purpose: compute the difficulty level of an English text
+
+import pickle
+import math
+from wordfreqCMD import remove_punctuation, freq, sort_in_descending_order, sort_in_ascending_order, map_percentages_to_levels
+import snowballstemmer
+
+
+def load_record(pickle_fname):
+    """
+    Read the pickle file named pickle_fname and return the object stored in it.
+
+    NOTE(review): pickle.load can execute arbitrary code, so this should only
+    be used on files the application itself produced.
+    """
+    with open(pickle_fname, 'rb') as pickle_file:
+        return pickle.load(pickle_file)
+
+
+ENGLISH_WORD_DIFFICULTY_DICT = {}  # module-level cache, filled by convert_test_type_to_difficulty_level()
+def convert_test_type_to_difficulty_level(d):
+    """
+    Assign a difficulty level to the words of the word bank.
+
+    :param d: dict holding the words of the word-bank pickle file; for each
+              word it gives the test types (e.g., CET4, CET6, BBC)
+    :return: dict mapping a word to its level, e.g. {'apple': 4, ...}.  A word
+             with none of the known test types is left out.  The same dict is
+             also stored in ENGLISH_WORD_DIFFICULTY_DICT.
+    """
+    global ENGLISH_WORD_DIFFICULTY_DICT
+
+    # Rules are tried from top to bottom; the first one that matches wins.
+    rules = (
+        (4, ('CET4',)),  # CET4 word has level 4
+        (5, ('OXFORD3000',)),
+        (6, ('CET6', 'GRADUATE')),
+        (7, ('OXFORD5000', 'IELTS')),
+        (8, ('BBC',)),
+    )
+
+    levels = {}
+    for word, test_types in d.items():
+        for level, tags in rules:
+            if any(tag in test_types for tag in tags):
+                levels[word] = level
+                break
+
+    ENGLISH_WORD_DIFFICULTY_DICT = levels
+    return levels
+
+def get_difficulty_level_for_user(d1, d2):
+    """
+    Return a word -> difficulty level dict that also covers the words in d1.
+
+    d1: the words the user does not know.
+    d2: comes from the word bank's 35511 tagged words; it is only converted
+        when the cache ENGLISH_WORD_DIFFICULTY_DICT is still empty.
+
+    The user's words are added to the level dict itself; no new dict is
+    created, so the returned object is the cached dict.
+    """
+    # TODO: convert_test_type_to_difficulty_level() should not be called every time.  Each word's difficulty level should be pre-computed.
+    if ENGLISH_WORD_DIFFICULTY_DICT == {}:
+        levels = convert_test_type_to_difficulty_level(d2)  # rate by the tags in d2: {'apple': 4, 'abandon': 4, ...}
+    else:
+        levels = ENGLISH_WORD_DIFFICULTY_DICT
+
+    stemmer = snowballstemmer.stemmer('english')
+
+    for word in d1:  # the user's words
+        if word in levels:
+            # The word is in the word bank as-is, so it already has a level.
+            continue
+        stem = stemmer.stemWord(word)
+        # Rate by the stem when the stem is known; otherwise treat the word as level 3.
+        levels[word] = levels[stem] if stem in levels else 3
+    return levels
+
+
+def revert_dict(d):
+    '''
+    In d, word is the key, and value is a list of dates.
+    In d2 (the returned value of this function), time is the key, and the value is a list of words picked at that time.
+
+    Only the first 10 characters of each date string (i.e., up to the hour)
+    are used as the key.  For backward compatibility a value may also be an
+    int (d was sth like {'word': 1}): the number of times the word had been
+    added to the new word book.  Such a word is listed that many times under
+    the placeholder date '2021082019'.
+
+    A value that is neither a list nor an int is skipped.  Previously such a
+    value either raised an error (first entry) or silently reused the dates
+    of the previously processed word.
+    '''
+    d2 = {}
+    for word, value in d.items():
+        if isinstance(value, list):  # value is a list of dates.
+            dates = value
+        elif isinstance(value, int):  # legacy format: value is a count, not a list of dates.
+            # why choose this date?  No particular reasons.  I fix the bug in this date.
+            dates = value * ['2021082019']
+        else:
+            continue  # unsupported value type; there are no dates to file this word under
+
+        for time_info in dates:
+            date = time_info[:10]  # until hour
+            d2.setdefault(date, []).append(word)
+    return d2
+
+
+class VocabularyLevelEstimator:
+    """
+    Base class of the vocabulary level estimators.
+
+    Subclasses are expected to set self.word_lst, the words to look at.
+    """
+    # Loaded once, when the class is defined: map a word to the sources where it appears.
+    _test = load_record('words_and_tests.p')
+
+    @property
+    def level(self):
+        """
+        Print every word of self.word_lst (with its sources when it is in
+        _test) and return total / (number of words + 1).
+
+        total is never updated yet, so the result is currently always 0.0.
+        """
+        total = 0.0  # TODO: need to compute this number
+        denominator = 1  # one more than the number of words, so it is never zero
+        for denominator, word in enumerate(self.word_lst, start=2):
+            known = word in self._test
+            print(f'{word} : {self._test[word]}' if known else f'{word}')
+        return total / denominator
+
+class UserVocabularyLevel(VocabularyLevelEstimator):
+    """Level estimator whose words are the keys of a user's word dict."""
+
+    def __init__(self, d):
+        # TODO: just look at the most recently-added words
+        self.d = d
+        self.word_lst = [word for word in d.keys()]
+
+
+
+class ArticleVocabularyLevel(VocabularyLevelEstimator):
+    """Level estimator whose words are the lower-cased, whitespace-separated tokens of an article."""
+
+    def __init__(self, content):
+        # TODO: select the 10 most difficult words
+        self.content = content
+        lowered = content.lower()
+        self.word_lst = lowered.split()
+
+
+if __name__ == '__main__':
+    # Manual check: print one user's record and level, then the level of a sample article.
+    user_record = load_record('frequency_mrlan85.pickle')
+    print(user_record)
+    user_estimator = UserVocabularyLevel(user_record)
+    print(user_estimator.level)  # level is a property
+    article_estimator = ArticleVocabularyLevel('This is an interesting article')
+    print(article_estimator.level)
+    
+
+
+