新增可被分隔的中文符号

更新 'app/wordfreqCMD.py'
2023-05-12 23:22:52 +08:00 · 2023-05-04 17:40:44 +08:00 · 2023-05-04 17:39:48 +08:00 · 2023-05-04 17:18:04 +08:00 · 2023-05-04 17:02:28 +08:00 · 2022-12-08 16:03:38 +08:00
10 changed files with 114 additions and 123 deletions
--- a/app/Article.py
+++ b/app/Article.py
@ -7,7 +7,7 @@ import random, glob
 import hashlib
 from datetime import datetime
 from flask import Flask, request, redirect, render_template, url_for, session, abort, flash, get_flashed_messages
-from difficulty import get_difficulty_level_for_user, text_difficulty_level, user_difficulty_level
+from difficulty import get_difficulty_level, text_difficulty_level, user_difficulty_level
 path_prefix = '/var/www/wordfreq/wordfreq/'
@ -53,7 +53,7 @@ def get_today_article(user_word_list, visited_articles):
    # Choose article according to reader's level
    d1 = load_freq_history(path_prefix + 'static/frequency/frequency.p')
    d2 = load_freq_history(path_prefix + 'static/words_and_tests.p')
-    d3 = get_difficulty_level_for_user(d1, d2)
+    d3 = get_difficulty_level(d1, d2)
    d = None
    result_of_generate_article = "not found"
--- a/app/difficulty.py
+++ b/app/difficulty.py
@ -8,7 +8,6 @@
 import pickle
 import math
 from wordfreqCMD import remove_punctuation, freq, sort_in_descending_order, sort_in_ascending_order
 import snowballstemmer
 def load_record(pickle_fname):
@ -18,51 +17,41 @@ def load_record(pickle_fname):
    return d
-def convert_test_type_to_difficulty_level(d):
+def difficulty_level_from_frequency(word, d):
-    """
+    level = 1
-    对原本的单词库中的单词进行难度评级
+    if not word in d:
-    :param d: 存储了单词库pickle文件中的单词的字典
+        return level
-    :return:
+    
-    """
+    if 'what' in d:
-    result = {}
+        ratio = (d['what']+1)/(d[word]+1) # what is a frequent word
-    L = list(d.keys())  # in d, we have test types (e.g., CET4,CET6,BBC) for each word
+        level = math.log( max(ratio, 1), 2)
-    for k in L:
+    level = min(level, 8) 
-        if 'CET4' in d[k]:
+    return level
            result[k] = 4  # CET4 word has level 4
        elif 'OXFORD3000' in d[k]:
            result[k] = 5
        elif 'CET6' in d[k] or 'GRADUATE' in d[k]:
            result[k] = 6
        elif 'OXFORD5000' in d[k] or 'IELTS' in d[k]:
            result[k] = 7
        elif 'BBC' in d[k]:
            result[k] = 8
    return result  # {'apple': 4, ...}
-def get_difficulty_level_for_user(d1, d2):
+def get_difficulty_level(d1, d2):
-    """
+    d = {}
-    d2 来自于词库的35511个已标记单词
+    L = list(d1.keys())  # in d1, we have freuqence for each word
-    d1 用户不会的词
+    L2 = list(d2.keys()) # in d2, we have test types (e.g., CET4,CET6,BBC) for each word
-    在d2的后面添加单词，没有新建一个新的字典
+    L.extend(L2)
-    """
+    L3 = list(set(L)) # L3 contains all words
-    # TODO: convert_test_type_to_difficulty_level() should not be called every time.  Each word's difficulty level should be pre-computed.
+    for k in L3:
-    d2 = convert_test_type_to_difficulty_level(d2)  # 根据d2的标记评级{'apple': 4, 'abandon': 4, ...}
+        if k in d2:
-    stemmer = snowballstemmer.stemmer('english')
+            if 'CET4' in d2[k]:
                d[k] = 4 # CET4 word has level 4
            elif 'CET6' in d2[k]:
                d[k] = 6
            elif 'BBC' in d2[k]:
                d[k] = 8
                if k in d1: # BBC could contain easy words that are not in CET4 or CET6.  So 4 is not reasonable.  Recompute difficulty level.
                    d[k] = min(difficulty_level_from_frequency(k, d1), d[k])
        elif k in d1:
            d[k] = difficulty_level_from_frequency(k, d1)
-    for k in d1:  # 用户的词
+    return d
        if k in d2:  # 如果用户的词以原型的形式存在于词库d2中
            continue  # 无需评级，跳过
        else:
            stem = stemmer.stemWord(k)
            if stem in d2:  # 如果用户的词的词根存在于词库d2的词根库中
                d2[k] = d2[stem]  # 按照词根进行评级
            else:
                d2[k] = 3  # 如果k的词根都不在，那么就当认为是3级
    return d2
 def revert_dict(d):
    '''
@ -73,13 +62,12 @@ def revert_dict(d):
    for k in d:
        if type(d[k]) is list:  # d[k] is a list of dates.
            lst = d[k]
-        elif type(d[
+        elif type(d[k]) is int: # for backward compatibility.  d was sth like {'word':1}.  The value d[k] is not a list of dates, but a number representing how frequent this word had been added to the new word book. 
                      k]) is int:  # for backward compatibility.  d was sth like {'word':1}.  The value d[k] is not a list of dates, but a number representing how frequent this word had been added to the new word book.
            freq = d[k]
-            lst = freq * ['2021082019']  # why choose this date?  No particular reasons.  I fix the bug in this date.
+            lst = freq*['2021082019'] # why choose this date?  No particular reasons.  I fix the bug in this date.
        for time_info in lst:
-            date = time_info[:10]  # until hour
+            date = time_info[:10] # until hour
            if not date in d2:
                d2[date] = [k]
            else:
@ -88,44 +76,42 @@ def revert_dict(d):
 def user_difficulty_level(d_user, d):
-    d_user2 = revert_dict(d_user)  # key is date, and value is a list of words added in that date
+    d_user2 = revert_dict(d_user) # key is date, and value is a list of words added in that date
    count = 0
    geometric = 1
-    for date in sorted(d_user2.keys(),
+    for date in sorted(d_user2.keys(), reverse=True): # most recently added words are more important while determining user's level
-                       reverse=True):  # most recently added words are more important while determining user's level
+        lst = d_user2[date] # a list of words
-        lst = d_user2[date]  # a list of words
+        lst2 = [] # a list of tuples, (word, difficulty level)
-        lst2 = []  # a list of tuples, (word, difficulty level)
+        for  word in lst:
        for word in lst:
            if word in d:
                lst2.append((word, d[word]))
-        lst3 = sort_in_ascending_order(lst2)  # easiest tuple first
+        lst3 = sort_in_ascending_order(lst2) # easiest tuple first
-        # print(lst3)
+        #print(lst3)
        for t in lst3:
            word = t[0]
            hard = t[1]
-            # print('WORD %s HARD %4.2f' % (word, hard))
+            #print('WORD %s HARD %4.2f' % (word, hard))
            geometric = geometric * (hard)
            count += 1
            if count >= 10:
-                return geometric ** (1 / count)
+                return geometric**(1/count)
-    return geometric ** (1 / max(count, 1))
+    return geometric**(1/max(count,1))
 def text_difficulty_level(s, d):
    s = remove_punctuation(s)
    L = freq(s)
-    lst = []  # a list of tuples, each tuple being (word, difficulty level)
+    lst = [] # a list of tuples, each tuple being (word, difficulty level)
    stop_words = {'the':1, 'and':1, 'of':1, 'to':1, 'what':1, 'in':1, 'there':1, 'when':1, 'them':1, 'would':1, 'will':1, 'out':1, 'his':1, 'mr':1, 'that':1, 'up':1, 'more':1, 'your':1, 'it':1, 'now':1, 'very':1, 'then':1, 'could':1, 'he':1, 'any':1, 'some':1, 'with':1, 'into':1, 'you':1, 'our':1, 'man':1, 'other':1, 'time':1, 'was':1, 'than':1, 'know':1, 'about':1, 'only':1, 'like':1, 'how':1, 'see':1, 'is':1, 'before':1, 'such':1, 'little':1, 'two':1, 'its':1, 'as':1, 'these':1, 'may':1, 'much':1, 'down':1, 'for':1, 'well':1, 'should':1, 'those':1, 'after':1, 'same':1, 'must':1, 'say':1, 'first':1, 'again':1, 'us':1, 'great':1, 'where':1, 'being':1, 'come':1, 'over':1, 'good':1, 'himself':1, 'am':1, 'never':1, 'on':1, 'old':1, 'here':1, 'way':1, 'at':1, 'go':1, 'upon':1, 'have':1, 'had':1, 'without':1, 'my':1, 'day':1, 'be':1, 'but':1, 'though':1, 'from':1, 'not':1, 'too':1, 'another':1, 'this':1, 'even':1, 'still':1, 'her':1, 'yet':1, 'under':1, 'by':1, 'let':1, 'just':1, 'all':1, 'because':1, 'we':1, 'always':1, 'off':1, 'yes':1, 'so':1, 'while':1, 'why':1, 'which':1, 'me':1, 'are':1, 'or':1, 'no':1, 'if':1, 'an':1, 'also':1, 'thus':1, 'who':1, 'cannot':1, 'she':1, 'whether':1} # ignore these words while computing the artile's difficulty level
    for x in L:
        word = x[0]
-        if word not in stop_words and word in d:
+        if word in d:
            lst.append((word, d[word]))
-    lst2 = sort_in_descending_order(lst)  # most difficult words on top
+    lst2 = sort_in_descending_order(lst) # most difficult words on top
-    # print(lst2)
+    #print(lst2)
    count = 0
    geometric = 1
    for t in lst2:
@ -133,20 +119,24 @@ def text_difficulty_level(s, d):
        hard = t[1]
        geometric = geometric * (hard)
        count += 1
-        if count >= 20:  # we look for n most difficult words
+        if count >= 20: # we look for n most difficult words
-            return geometric ** (1 / count)
+            return geometric**(1/count)
    return geometric**(1/max(count,1))
    return geometric ** (1 / max(count, 1))
 if __name__ == '__main__':
    d1 = load_record('frequency.p')
-    # print(d1)
+    #print(d1)
    d2 = load_record('words_and_tests.p')
-    # print(d2)
+    #print(d2)
-    d3 = get_difficulty_level_for_user(d1, d2)
+
    d3 = get_difficulty_level(d1, d2)
    s = '''
 South Lawn
@ -207,6 +197,7 @@ Amidst the aftermath of this shocking referendum vote, there is great uncertaint
 '''
    s = '''
 British Prime Minister Boris Johnson walks towards a voting station during the Brexit referendum in Britain, June 23, 2016. (Photo: EPA-EFE)
@ -227,6 +218,7 @@ The prime minister was forced to ask for an extension to Britain's EU departure
 Johnson has repeatedly pledged to finalize the first stage, a transition deal, of Britain's EU divorce battle by Oct. 31. A second stage will involve negotiating its future relationship with the EU on trade, security and other salient issues.
 '''
    s = '''
 Thank you very much. We have a Cabinet meeting. We’ll have a few questions after grace. And, if you would, Ben, please do the honors.
@ -241,11 +233,17 @@ We need — for our farmers, our manufacturers, for, frankly, unions and non-uni
 '''
-    # f = open('bbc-fulltext/bbc/entertainment/001.txt')
+
    #f = open('bbc-fulltext/bbc/entertainment/001.txt')
    f = open('wordlist.txt')
    s = f.read()
    f.close()
    print(text_difficulty_level(s, d3))
-
+            
--- a/app/static/config.yml
+++ b/app/static/config.yml
@ -7,7 +7,6 @@ css:
 js:
  head: # 在页面加载之前加载
    - ../static/js/jquery.js
    - ../static/js/read.js
    - ../static/js/word_operation.js
  bottom: # 在页面加载完之后加载
    - ../static/js/fillword.js
--- a/app/static/js/fillword.js
+++ b/app/static/js/fillword.js
@ -1,5 +1,9 @@
 let isRead = true;
 let isChoose = true;
 let reader = window.speechSynthesis; // 全局定义朗读者，以便朗读和暂停
 let current_position = 0; // 朗读文本的当前位置
 let original_position = 0; // 朗读文本的初始位置
 let to_speak = ""; // 朗读的初始内容
 function getWord() {
    return window.getSelection ? window.getSelection() : document.selection.createRange().text;
@ -7,7 +11,7 @@ function getWord() {
 function fillInWord() {
    let word = getWord();
-    if (isRead) Reader.read(word, inputSlider.value);
+    if (isRead) read(word);
    if (!isChoose) return;
    const element = document.getElementById("selected-words");
    element.value = element.value + " " + word;
@ -15,17 +19,50 @@ function fillInWord() {
 document.getElementById("text-content").addEventListener("click", fillInWord, false);
-const sliderValue = document.getElementById("rangeValue");
+function makeUtterance(str, rate) {
-const inputSlider = document.getElementById("rangeComponent");
+    let msg = new SpeechSynthesisUtterance(str);
    msg.rate = rate;
    msg.lang = "en-US"; // TODO: add language options menu
    msg.onboundary = ev => {
        if (ev.name == "word") {
            current_position = ev.charIndex;
        }
    }
    return msg;
 }
 const sliderValue = document.getElementById("rangeValue"); // 显示值
 const inputSlider = document.getElementById("rangeComponent"); // 滑块元素
 inputSlider.oninput = () => {
-    let value = inputSlider.value;
+    let value = inputSlider.value; // 获取滑块的值
    sliderValue.textContent = value + '×';
    if (!reader.speaking) return;
    reader.cancel();
    let msg = makeUtterance(to_speak.substring(original_position + current_position), value);
    original_position = original_position + current_position;
    current_position = 0;
    reader.speak(msg);
 };
 /**
  * Start speaking `s` from its beginning at the rate currently selected on the slider.
  *
  * The string form of `s` is kept in `to_speak`, and both position counters are
  * reset to 0, so that the slider's `oninput` handler can later cancel the
  * utterance and resume from `original_position + current_position`.
  *
  * @param s value to speak; converted with `toString()` (fillInWord passes the
  *          result of getWord()).
  */
 function read(s) {
    to_speak = s.toString();
    // A fresh utterance always starts at offset 0 of `to_speak`.
    original_position = 0;
    current_position = 0;
    // The slider value is handed to makeUtterance as the speech rate.
    let msg = makeUtterance(to_speak, inputSlider.value);
    reader.speak(msg);
 }
 /**
  * Toggle the `isRead` flag that fillInWord checks before calling read().
  * When the flag becomes false, speech on `reader` is cancelled.
  */
 function onReadClick() {
    isRead = !isRead;
    if (!isRead) {
        // Reading was just switched off: cancel speech on the synthesizer.
        reader.cancel();
    }
 }
 /**
  * Toggle the `isChoose` flag; fillInWord returns early (without appending the
  * selection to "selected-words") while this flag is false.
  */
 function onChooseClick() {
    isChoose = !isChoose;
 }
 /**
  * Cancel speech on the shared `reader` (window.speechSynthesis).
  * Does not change `isRead`, so later selections are still read aloud.
  */
 function stopRead() {
    reader.cancel();
 }
--- a/app/static/js/read.js
+++ b/app/static/js/read.js
@ -1,35 +0,0 @@
 /**
  * Reader: a small module wrapping window.speechSynthesis.
  *
  * The IIFE keeps the synthesizer handle, the text being spoken and the
  * position counters private, and exposes only `read` and `stopRead`.
  */
 var Reader = (function() {
    let reader = window.speechSynthesis;
    let current_position = 0;   // charIndex of the most recent "word" boundary event
    let original_position = 0;  // only ever reset to 0 inside this module
    let to_speak = "";          // string form of the last value passed to read()
    /**
     * Build an utterance for `str` with the given rate and language "en-US".
     * Its boundary handler records the character index of each "word" event
     * in `current_position`.
     */
    function makeUtterance(str, rate) {
        let msg = new SpeechSynthesisUtterance(str);
        msg.rate = rate;
        msg.lang = "en-US";
        msg.onboundary = ev => {
            if (ev.name == "word") {
                current_position = ev.charIndex;
            }
        }
        return msg;
    }
    /**
     * Speak `s` (converted with toString()) from the beginning at `rate`.
     * Resets both position counters before speaking.
     */
    function read(s, rate) {
        to_speak = s.toString();
        original_position = 0;
        current_position = 0;
        let msg = makeUtterance(to_speak, rate);
        reader.speak(msg);
    }
    /** Cancel speech on the synthesizer. */
    function stopRead() {
        reader.cancel();
    }
    // Public interface of the module.
    return {
        read: read,
        stopRead: stopRead
    };
 })();
--- a/app/static/js/word_operation.js
+++ b/app/static/js/word_operation.js
@ -62,13 +62,6 @@ function delete_word(theWord) {
    });
 }
 /**
  * Read aloud the text of the element whose id is "word_" + theWord.
  *
  * @param theWord suffix of the element id to look up via jQuery.
  */
 function read_word(theWord) {
    // This local `to_speak` holds the element's text for the call below.
    let to_speak = $("#word_" + theWord).text();
    // NOTE(review): these two names are not declared in this function; they
    // presumably refer to globals defined elsewhere — Reader keeps its own
    // private counters and resets them itself in Reader.read. Confirm.
    original_position = 0;
    current_position = 0;
    // The slider's current value is passed as the speech rate.
    Reader.read(to_speak, inputSlider.value);
 }
 /* 
 * interface Word {
 *   word: string,
@ -102,7 +95,6 @@ function wordTemplate(word) {
        <a class="btn btn-success" onclick="familiar('${word.word}')" role="button">熟悉</a>
        <a class="btn btn-warning" onclick="unfamiliar('${word.word}')" role="button">不熟悉</a>
        <a class="btn btn-danger" onclick="delete_word('${word.word}')" role="button">删除</a>
        <a class="btn btn-info" onclick="read_word('${word.word}')" role="button">朗读</a>
    </p>`;
 }
--- a/app/static/words_and_tests.p
+++ b/app/static/words_and_tests.p
--- a/app/templates/userpage_get.html
+++ b/app/templates/userpage_get.html
@ -133,7 +133,6 @@
                    <a class="btn btn-success" onclick="familiar('{{ word }}')" role="button">熟悉</a>
                    <a class="btn btn-warning" onclick="unfamiliar('{{ word }}')" role="button">不熟悉</a>
                    <a class="btn btn-danger" onclick="delete_word('{{ word }}')" role="button">删除</a>
                    <a class="btn btn-info" onclick="read_word('{{ word }}')" role="button">朗读</a>
                </p>
            {% endfor %}
        </div>
--- a/app/wordfreqCMD.py
+++ b/app/wordfreqCMD.py
@ -39,7 +39,8 @@ def file2str(fname):#文件转字符
 def remove_punctuation(s): # 这里是s是形参 (parameter)。函数被调用时才给s赋值。
-    special_characters = '\_©~<=>+/[]*&$%^@.,?!:;#()"“”—‘’{}|' # 把里面的字符都去掉
+    special_characters = '\_©~<=>+-/[]*&$%^@.,?!:;#()"“”—‘’{}|《》【】、！￥（）；：？。，' # 把里面的字符都去掉
    for c in special_characters:
        s = s.replace(c, ' ') # 防止出现把 apple,apple 移掉逗号后变成 appleapple 情况
    s = s.replace('--', ' ')
@ -103,6 +104,7 @@ if __name__ == '__main__':
    for x in sort_in_descending_order(L):
        print('%s\t%d\t%s' % (x[0], x[1], youdao_link(x[0])))#函数导出
    # 把频率的结果放result.html中
    make_html_page(sort_in_descending_order(L), 'result.html') 
@ -117,6 +119,7 @@ if __name__ == '__main__':
    # 合并频率
    lst_history = pickle_idea.dict2lst(d)
    d = pickle_idea.merge_frequency(L, lst_history)
    pickle_idea.save_frequency_to_pickle(d, 'frequency.p')
--- a/requirements.txt
+++ b/requirements.txt
@ -1,5 +1,3 @@
 Flask==1.1.2
 selenium==3.141.0
 PyYAML~=6.0
 pony==0.7.16
 snowballstemmer==2.2.0
Author	SHA1	Message	Date
汪瑜	b486b6b9db	新增可被分隔的中文符号	2023-05-12 23:22:52 +08:00
汪瑜	0fedf590e8	更新 'app/wordfreqCMD.py'	2023-05-04 17:40:44 +08:00
汪瑜	96dfadcde6	更新 'app/wordfreqCMD.py'	2023-05-04 17:39:48 +08:00
杨昱晨	7b55fc1859	first commit	2023-05-04 17:18:04 +08:00
杨昱晨	ac2046ac2e	first commit	2023-05-04 17:02:28 +08:00
whiost	baa1c45782	[Bugfix] 部分中文符号不会被分隔	2022-12-08 16:03:38 +08:00