Compare commits

..

17 Commits

Author SHA1 Message Date
丁晟晔 ff6286cf01 删除 app/test/test_bug551_DingZeYu.py 2024-05-06 11:42:32 +08:00
丁晟晔 1d7e61d751 上传文件至 app/test 2024-05-06 11:36:36 +08:00
顾涵 708a6a2821 Merge pull request 'WIP:Bug529-GuHan' (#88) from Bug529-GuHan into master
Reviewed-on: http://121.4.94.30:3000/mrlan/EnglishPal/pulls/88
2023-06-04 12:39:34 +08:00
顾涵 688a198768 已经与Alpha-snapshot20230525 分支同步,重新提交 2023-05-28 16:31:12 +08:00
寻宇灿 1543b3095d Merge remote-tracking branch 'origin/Alpha-snapshot20230519' into Refactor-XunYucan 2023-05-25 22:30:06 +08:00
寻宇灿 c6bf323c60 修改格式 2023-05-25 21:23:25 +08:00
寻宇灿 03ccb3527a 重构前端阅读js,新增阅读器全局对象,新增生词朗读按钮 2023-05-25 17:35:31 +08:00
Hui Lan b41e1044bc difficulty.py: add some stop words, hoping that getting the next article can be faster. 2023-05-24 10:12:44 +08:00
Hui Lan 67e921ba60 difficulty.py: todo. 2023-05-23 22:25:40 +08:00
Hui Lan a5c3564f15 difficulty.py: do not stem a word twice. 2023-05-23 22:22:57 +08:00
Hui Lan 1295616d5b Merge branch 'Bug476-YuHuangtao' of http://121.4.94.30:3000/mrlan/EnglishPal into Alpha-snapshot20230519 2023-05-23 19:50:30 +08:00
俞黄焘 c151a0efaa 去掉了get_difficulty_level_for_user的多出的break 2023-05-23 19:40:33 +08:00
顾涵 030b89706e special_characters = '\_©~<=>+/[]*&$%^@.,?!:;#()"“”—‘’{}|' 用于过滤字符,我将其中的“-”删去,使连字符没有被过滤,实现录入例如fifty-six等组合词的功能。另外对于删除过滤是否会引发字符bug,答案是肯定的,但是这段代码中的过滤字符虽然多,但是并没有完全过滤掉所有字符,(过滤的只是键盘上能打出的字符,不包括输入法中能打出的特殊字符),所以字符bug本身就一直存在,我认为减少一个“-”字符对程序的过滤过程不会造成问题。 2023-05-20 15:29:12 +08:00
Hui Lan 349488167b requirements.txt: install snowballstemmer for better computing a word's difficulty level. 2023-05-19 09:03:20 +08:00
俞黄焘 39d96014d9 pull最新的snapshot-20230511,后更新了difficulty.py和Article.py的部分代码,提交了新的pickle文件 2023-05-18 23:29:38 +08:00
顾涵 acd8db6e3e special_characters = '\_©~<=>+/[]*&$%^@.,?!:;#()"“”—‘’{}|' 用于过滤字符,我将其中的“-”删去,使连字符没有被过滤,实现录入例如fifty-six等组合词的功能。另外对于删除过滤是否会引发字符bug,答案是肯定的,但是这段代码中的过滤字符虽然多,但是并没有完全过滤掉所有字符,(过滤的只是键盘上能打出的字符,不包括输入法中能打出的特殊字符),所以字符bug本身就一直存在,我认为减少一个对“1-”字符的过滤不会造成问题。 2023-05-15 19:24:43 +08:00
顾涵 9f3f5b43e1 special_characters = '\_©~<=>+/[]*&$%^@.,?!:;#()"“”—‘’{}|' 用于过滤字符,我将其中的“-”删去,使连字符没有被过滤,实现录入例如fifty-six等组合词的功能。另外对于删除过滤是否会引发字符bug,答案是肯定的,但是这段代码中的过滤字符虽然多,但是并没有完全过滤掉所有字符,(过滤的只是键盘上能打出的字符,不包括输入法中能打出的特殊字符),所以字符bug本身就一直存在,我认为减少一个对“-”字符的过滤不会造成问题。 2023-05-15 19:15:30 +08:00
10 changed files with 123 additions and 114 deletions

View File

@ -7,7 +7,7 @@ import random, glob
import hashlib
from datetime import datetime
from flask import Flask, request, redirect, render_template, url_for, session, abort, flash, get_flashed_messages
from difficulty import get_difficulty_level, text_difficulty_level, user_difficulty_level
from difficulty import get_difficulty_level_for_user, text_difficulty_level, user_difficulty_level
path_prefix = '/var/www/wordfreq/wordfreq/'
@ -53,7 +53,7 @@ def get_today_article(user_word_list, visited_articles):
# Choose article according to reader's level
d1 = load_freq_history(path_prefix + 'static/frequency/frequency.p')
d2 = load_freq_history(path_prefix + 'static/words_and_tests.p')
d3 = get_difficulty_level(d1, d2)
d3 = get_difficulty_level_for_user(d1, d2)
d = None
result_of_generate_article = "not found"

View File

@ -8,6 +8,7 @@
import pickle
import math
from wordfreqCMD import remove_punctuation, freq, sort_in_descending_order, sort_in_ascending_order
import snowballstemmer
def load_record(pickle_fname):
@ -17,41 +18,51 @@ def load_record(pickle_fname):
return d
def difficulty_level_from_frequency(word, d):
level = 1
if not word in d:
return level
if 'what' in d:
ratio = (d['what']+1)/(d[word]+1) # what is a frequent word
level = math.log( max(ratio, 1), 2)
def convert_test_type_to_difficulty_level(d):
"""
对原本的单词库中的单词进行难度评级
:param d: 存储了单词库pickle文件中的单词的字典
:return:
"""
result = {}
L = list(d.keys()) # in d, we have test types (e.g., CET4,CET6,BBC) for each word
level = min(level, 8)
return level
for k in L:
if 'CET4' in d[k]:
result[k] = 4 # CET4 word has level 4
elif 'OXFORD3000' in d[k]:
result[k] = 5
elif 'CET6' in d[k] or 'GRADUATE' in d[k]:
result[k] = 6
elif 'OXFORD5000' in d[k] or 'IELTS' in d[k]:
result[k] = 7
elif 'BBC' in d[k]:
result[k] = 8
return result # {'apple': 4, ...}
def get_difficulty_level(d1, d2):
d = {}
L = list(d1.keys()) # in d1, we have freuqence for each word
L2 = list(d2.keys()) # in d2, we have test types (e.g., CET4,CET6,BBC) for each word
L.extend(L2)
L3 = list(set(L)) # L3 contains all words
for k in L3:
if k in d2:
if 'CET4' in d2[k]:
d[k] = 4 # CET4 word has level 4
elif 'CET6' in d2[k]:
d[k] = 6
elif 'BBC' in d2[k]:
d[k] = 8
if k in d1: # BBC could contain easy words that are not in CET4 or CET6. So 4 is not reasonable. Recompute difficulty level.
d[k] = min(difficulty_level_from_frequency(k, d1), d[k])
elif k in d1:
d[k] = difficulty_level_from_frequency(k, d1)
def get_difficulty_level_for_user(d1, d2):
"""
d2 来自于词库的35511个已标记单词
d1 用户不会的词
在d2的后面添加单词没有新建一个新的字典
"""
# TODO: convert_test_type_to_difficulty_level() should not be called every time. Each word's difficulty level should be pre-computed.
d2 = convert_test_type_to_difficulty_level(d2) # 根据d2的标记评级{'apple': 4, 'abandon': 4, ...}
stemmer = snowballstemmer.stemmer('english')
return d
for k in d1: # 用户的词
if k in d2: # 如果用户的词以原型的形式存在于词库d2中
continue # 无需评级,跳过
else:
stem = stemmer.stemWord(k)
if stem in d2: # 如果用户的词的词根存在于词库d2的词根库中
d2[k] = d2[stem] # 按照词根进行评级
else:
d2[k] = 3 # 如果k的词根都不在那么就当认为是3级
return d2
def revert_dict(d):
'''
@ -62,12 +73,13 @@ def revert_dict(d):
for k in d:
if type(d[k]) is list: # d[k] is a list of dates.
lst = d[k]
elif type(d[k]) is int: # for backward compatibility. d was sth like {'word':1}. The value d[k] is not a list of dates, but a number representing how frequent this word had been added to the new word book.
elif type(d[
k]) is int: # for backward compatibility. d was sth like {'word':1}. The value d[k] is not a list of dates, but a number representing how frequent this word had been added to the new word book.
freq = d[k]
lst = freq*['2021082019'] # why choose this date? No particular reasons. I fix the bug in this date.
lst = freq * ['2021082019'] # why choose this date? No particular reasons. I fix the bug in this date.
for time_info in lst:
date = time_info[:10] # until hour
date = time_info[:10] # until hour
if not date in d2:
d2[date] = [k]
else:
@ -76,42 +88,44 @@ def revert_dict(d):
def user_difficulty_level(d_user, d):
d_user2 = revert_dict(d_user) # key is date, and value is a list of words added in that date
d_user2 = revert_dict(d_user) # key is date, and value is a list of words added in that date
count = 0
geometric = 1
for date in sorted(d_user2.keys(), reverse=True): # most recently added words are more important while determining user's level
lst = d_user2[date] # a list of words
lst2 = [] # a list of tuples, (word, difficulty level)
for word in lst:
for date in sorted(d_user2.keys(),
reverse=True): # most recently added words are more important while determining user's level
lst = d_user2[date] # a list of words
lst2 = [] # a list of tuples, (word, difficulty level)
for word in lst:
if word in d:
lst2.append((word, d[word]))
lst3 = sort_in_ascending_order(lst2) # easiest tuple first
#print(lst3)
lst3 = sort_in_ascending_order(lst2) # easiest tuple first
# print(lst3)
for t in lst3:
word = t[0]
hard = t[1]
#print('WORD %s HARD %4.2f' % (word, hard))
# print('WORD %s HARD %4.2f' % (word, hard))
geometric = geometric * (hard)
count += 1
if count >= 10:
return geometric**(1/count)
return geometric ** (1 / count)
return geometric**(1/max(count,1))
return geometric ** (1 / max(count, 1))
def text_difficulty_level(s, d):
s = remove_punctuation(s)
L = freq(s)
lst = [] # a list of tuples, each tuple being (word, difficulty level)
lst = [] # a list of tuples, each tuple being (word, difficulty level)
stop_words = {'the':1, 'and':1, 'of':1, 'to':1, 'what':1, 'in':1, 'there':1, 'when':1, 'them':1, 'would':1, 'will':1, 'out':1, 'his':1, 'mr':1, 'that':1, 'up':1, 'more':1, 'your':1, 'it':1, 'now':1, 'very':1, 'then':1, 'could':1, 'he':1, 'any':1, 'some':1, 'with':1, 'into':1, 'you':1, 'our':1, 'man':1, 'other':1, 'time':1, 'was':1, 'than':1, 'know':1, 'about':1, 'only':1, 'like':1, 'how':1, 'see':1, 'is':1, 'before':1, 'such':1, 'little':1, 'two':1, 'its':1, 'as':1, 'these':1, 'may':1, 'much':1, 'down':1, 'for':1, 'well':1, 'should':1, 'those':1, 'after':1, 'same':1, 'must':1, 'say':1, 'first':1, 'again':1, 'us':1, 'great':1, 'where':1, 'being':1, 'come':1, 'over':1, 'good':1, 'himself':1, 'am':1, 'never':1, 'on':1, 'old':1, 'here':1, 'way':1, 'at':1, 'go':1, 'upon':1, 'have':1, 'had':1, 'without':1, 'my':1, 'day':1, 'be':1, 'but':1, 'though':1, 'from':1, 'not':1, 'too':1, 'another':1, 'this':1, 'even':1, 'still':1, 'her':1, 'yet':1, 'under':1, 'by':1, 'let':1, 'just':1, 'all':1, 'because':1, 'we':1, 'always':1, 'off':1, 'yes':1, 'so':1, 'while':1, 'why':1, 'which':1, 'me':1, 'are':1, 'or':1, 'no':1, 'if':1, 'an':1, 'also':1, 'thus':1, 'who':1, 'cannot':1, 'she':1, 'whether':1} # ignore these words while computing the artile's difficulty level
for x in L:
word = x[0]
if word in d:
if word not in stop_words and word in d:
lst.append((word, d[word]))
lst2 = sort_in_descending_order(lst) # most difficult words on top
#print(lst2)
lst2 = sort_in_descending_order(lst) # most difficult words on top
# print(lst2)
count = 0
geometric = 1
for t in lst2:
@ -119,24 +133,20 @@ def text_difficulty_level(s, d):
hard = t[1]
geometric = geometric * (hard)
count += 1
if count >= 20: # we look for n most difficult words
return geometric**(1/count)
return geometric**(1/max(count,1))
if count >= 20: # we look for n most difficult words
return geometric ** (1 / count)
return geometric ** (1 / max(count, 1))
if __name__ == '__main__':
d1 = load_record('frequency.p')
#print(d1)
# print(d1)
d2 = load_record('words_and_tests.p')
#print(d2)
# print(d2)
d3 = get_difficulty_level(d1, d2)
d3 = get_difficulty_level_for_user(d1, d2)
s = '''
South Lawn
@ -197,7 +207,6 @@ Amidst the aftermath of this shocking referendum vote, there is great uncertaint
'''
s = '''
British Prime Minister Boris Johnson walks towards a voting station during the Brexit referendum in Britain, June 23, 2016. (Photo: EPA-EFE)
@ -218,7 +227,6 @@ The prime minister was forced to ask for an extension to Britain's EU departure
Johnson has repeatedly pledged to finalize the first stage, a transition deal, of Britain's EU divorce battle by Oct. 31. A second stage will involve negotiating its future relationship with the EU on trade, security and other salient issues.
'''
s = '''
Thank you very much. We have a Cabinet meeting. Well have a few questions after grace. And, if you would, Ben, please do the honors.
@ -233,17 +241,11 @@ We need — for our farmers, our manufacturers, for, frankly, unions and non-uni
'''
#f = open('bbc-fulltext/bbc/entertainment/001.txt')
# f = open('bbc-fulltext/bbc/entertainment/001.txt')
f = open('wordlist.txt')
s = f.read()
f.close()
print(text_difficulty_level(s, d3))

View File

@ -7,6 +7,7 @@ css:
js:
head: # 在页面加载之前加载
- ../static/js/jquery.js
- ../static/js/read.js
- ../static/js/word_operation.js
bottom: # 在页面加载完之后加载
- ../static/js/fillword.js

View File

@ -1,9 +1,5 @@
let isRead = true;
let isChoose = true;
let reader = window.speechSynthesis; // 全局定义朗读者,以便朗读和暂停
let current_position = 0; // 朗读文本的当前位置
let original_position = 0; // 朗读文本的初始位置
let to_speak = ""; // 朗读的初始内容
function getWord() {
return window.getSelection ? window.getSelection() : document.selection.createRange().text;
@ -11,7 +7,7 @@ function getWord() {
function fillInWord() {
let word = getWord();
if (isRead) read(word);
if (isRead) Reader.read(word, inputSlider.value);
if (!isChoose) return;
const element = document.getElementById("selected-words");
element.value = element.value + " " + word;
@ -19,50 +15,17 @@ function fillInWord() {
document.getElementById("text-content").addEventListener("click", fillInWord, false);
function makeUtterance(str, rate) {
let msg = new SpeechSynthesisUtterance(str);
msg.rate = rate;
msg.lang = "en-US"; // TODO: add language options menu
msg.onboundary = ev => {
if (ev.name == "word") {
current_position = ev.charIndex;
}
}
return msg;
}
const sliderValue = document.getElementById("rangeValue"); // 显示值
const inputSlider = document.getElementById("rangeComponent"); // 滑块元素
const sliderValue = document.getElementById("rangeValue");
const inputSlider = document.getElementById("rangeComponent");
inputSlider.oninput = () => {
let value = inputSlider.value; // 获取滑块的值
let value = inputSlider.value;
sliderValue.textContent = value + '×';
if (!reader.speaking) return;
reader.cancel();
let msg = makeUtterance(to_speak.substring(original_position + current_position), value);
original_position = original_position + current_position;
current_position = 0;
reader.speak(msg);
};
function read(s) {
to_speak = s.toString();
original_position = 0;
current_position = 0;
let msg = makeUtterance(to_speak, inputSlider.value);
reader.speak(msg);
}
function onReadClick() {
isRead = !isRead;
if (!isRead) {
reader.cancel();
}
}
function onChooseClick() {
isChoose = !isChoose;
}
function stopRead() {
reader.cancel();
}

35
app/static/js/read.js Normal file
View File

@ -0,0 +1,35 @@
// Reader: one shared object wrapping window.speechSynthesis.
// It exposes read(s, rate) and stopRead(); all playback state stays private
// to this closure.
var Reader = (function () {
    const synth = window.speechSynthesis;
    let wordOffset = 0;   // charIndex reported by the latest "word" boundary event
    let startOffset = 0;  // reset by read(); not consulted anywhere in this module
    let spokenText = "";  // text passed to the most recent read() call

    // Build an utterance for `str` at speed `rate` (language fixed to en-US)
    // that records the character index of each word boundary while it plays.
    const makeUtterance = function (str, rate) {
        const utterance = new SpeechSynthesisUtterance(str);
        utterance.rate = rate;
        utterance.lang = "en-US";
        utterance.onboundary = function (ev) {
            if (ev.name == "word") {
                wordOffset = ev.charIndex;
            }
        };
        return utterance;
    };

    // Speak `s` (converted with toString()) at speed `rate`, resetting the
    // position counters first.
    const read = function (s, rate) {
        spokenText = s.toString();
        startOffset = 0;
        wordOffset = 0;
        synth.speak(makeUtterance(spokenText, rate));
    };

    // Cancel whatever the speech engine is currently speaking or has queued.
    const stopRead = function () {
        synth.cancel();
    };

    return { read: read, stopRead: stopRead };
})();

View File

@ -62,6 +62,13 @@ function delete_word(theWord) {
});
}
/**
 * Speak the text of the element whose id is "word_" + theWord, at the speed
 * currently selected on the rate slider.
 *
 * @param {string} theWord - suffix of the element id whose text is spoken.
 */
function read_word(theWord) {
    const to_speak = $("#word_" + theWord).text();
    // No position counters are reset here: Reader.read() resets its own
    // private counters on every call, and assigning to same-named variables
    // in this scope would only create unrelated implicit globals.
    Reader.read(to_speak, inputSlider.value);
}
/*
* interface Word {
* word: string,
@ -95,6 +102,7 @@ function wordTemplate(word) {
<a class="btn btn-success" onclick="familiar('${word.word}')" role="button">熟悉</a>
<a class="btn btn-warning" onclick="unfamiliar('${word.word}')" role="button">不熟悉</a>
<a class="btn btn-danger" onclick="delete_word('${word.word}')" role="button">删除</a>
<a class="btn btn-info" onclick="read_word('${word.word}')" role="button">朗读</a>
</p>`;
}

Binary file not shown.

View File

@ -133,6 +133,7 @@
<a class="btn btn-success" onclick="familiar('{{ word }}')" role="button">熟悉</a>
<a class="btn btn-warning" onclick="unfamiliar('{{ word }}')" role="button">不熟悉</a>
<a class="btn btn-danger" onclick="delete_word('{{ word }}')" role="button">删除</a>
<a class="btn btn-info" onclick="read_word('{{ word }}')" role="button">朗读</a>
</p>
{% endfor %}
</div>

View File

@ -39,8 +39,7 @@ def file2str(fname):#文件转字符
def remove_punctuation(s): # 这里是s是形参 (parameter)。函数被调用时才给s赋值。
special_characters = '\_©~<=>+-/[]*&$%^@.,?!:;#()"“”—‘’{}|《》【】、!¥();:?。,' # 把里面的字符都去掉
special_characters = '\_©~<=>+/[]*&$%^@.,?!:;#()"“”—‘’{}|' # 把里面的字符都去掉
for c in special_characters:
s = s.replace(c, ' ') # 防止出现把 apple,apple 移掉逗号后变成 appleapple 情况
s = s.replace('--', ' ')
@ -104,7 +103,6 @@ if __name__ == '__main__':
for x in sort_in_descending_order(L):
print('%s\t%d\t%s' % (x[0], x[1], youdao_link(x[0])))#函数导出
# 把频率的结果放result.html中
make_html_page(sort_in_descending_order(L), 'result.html')
@ -119,7 +117,6 @@ if __name__ == '__main__':
# 合并频率
lst_history = pickle_idea.dict2lst(d)
d = pickle_idea.merge_frequency(L, lst_history)
pickle_idea.save_frequency_to_pickle(d, 'frequency.p')

View File

@ -1,3 +1,5 @@
Flask==1.1.2
selenium==3.141.0
PyYAML~=6.0
pony==0.7.16
snowballstemmer==2.2.0