From 3654776aec7d2ee69d26731b34b96bdce9eec2e8 Mon Sep 17 00:00:00 2001 From: Stela <1710738188@qq.com> Date: Mon, 6 Jun 2022 19:42:26 +0800 Subject: [PATCH] BugFix358 + Improve + Refactor --- app/templates/analyse_word.html | 27 +++++++++++ app/templates/userpage_get.html | 9 +++- app/templates/userpage_post.html | 3 ++ app/user_service.py | 39 +++++++++++---- app/wordfreqCMD.py | 82 +++++++++++++++++++++++++------- requirements.txt | 4 +- 6 files changed, 136 insertions(+), 28 deletions(-) create mode 100644 app/templates/analyse_word.html diff --git a/app/templates/analyse_word.html b/app/templates/analyse_word.html new file mode 100644 index 0000000..6736f57 --- /dev/null +++ b/app/templates/analyse_word.html @@ -0,0 +1,27 @@ +<!DOCTYPE html> +<html lang="en"> +<head> + <meta charset="utf-8"> + <meta name="viewport" content="width=device-width, initial-scale=1.0, minimum-scale=0.5, maximum-scale=3.0, user-scalable=yes" /> + <title>EnglishPal - Analyse Result</title> +</head> +<body> + <div> + <br/> + <h3>单词 {{ word }} 的统计信息</h3> + <hr/> + <ul> + <li>在总 {{ resultlen }} 篇文章中,总共出现了 {{ totcount }} 次</li> + <li>总占比为 {{ '%.5f' % (100 * totcount / totcnt) }}%</li> + </ul> + <ul> + <li>在一篇文章中:</li> + <li>最多出现了 {{ maxcount }} 次</li> + <li>最高占比为 {{ '%.5f' % maxrate }}%</li> + <li>最少出现了 {{ mincount }} 次</li> + <li>最低占比为 {{ '%.5f' % minrate }}%</li> + </ul> + <input type="button" value="返回" onclick="location.href='{{ username }}'"/> + </div> +</body> +</html> \ No newline at end of file diff --git a/app/templates/userpage_get.html b/app/templates/userpage_get.html index 19542c1..9dee463 100644 --- a/app/templates/userpage_get.html +++ b/app/templates/userpage_get.html @@ -38,10 +38,17 @@ <p><b>收集生词吧</b> (可以在正文中划词,也可以复制黏贴)</p> <form method="post" action="/{{ username }}"> + <input type="hidden" name="methodtype" value="multiple"/> <textarea name="content" id="selected-words" rows="10" cols="120"></textarea><br/> <input type="submit" value="把生词加入我的生词库"/> <input type="reset" value="清除"/> - </form> + </form><br/> + <form method="post" action="/{{ username }}"> + <input type="hidden" name="methodtype" value="single"/> + <input type="text" name="content"/><br/> + <input type="submit" value="分析此单词" style="margin-top:5px"/> + <input type="reset" value="清除"/> + </form><br/> {% if session.get['thisWord'] %} <script type="text/javascript"> //point to the anchor in the page whose id is aaa if it exists diff --git a/app/templates/userpage_post.html b/app/templates/userpage_post.html index ba3b38d..fb4e14a 100644 --- a/app/templates/userpage_post.html +++ b/app/templates/userpage_post.html @@ -31,6 +31,9 @@ <a href='http://youdao.com/w/eng/{{word}}/#keyfrom=dict2.index' title={{word}}>{{word}}</a> ({{x[1]}}) <input type="checkbox" name="marked" value={{word}}> + {% if x[0] in userwordlist %} + <font color="red"><b>已在生词簿内</b></font> + {% endif %} </p> {% endfor %} diff --git a/app/user_service.py b/app/user_service.py index 139c140..30c3027 100644 --- a/app/user_service.py +++ b/app/user_service.py @@ -11,6 +11,7 @@ import Yaml from Article import get_today_article, load_freq_history from WordFreq import WordFreq from wordfreqCMD import sort_in_descending_order +from UseSqlite import RecordQuery import pickle_idea import pickle_idea2 @@ -102,10 +103,36 @@ def userpage(username): user_freq_record = path_prefix + 'static/frequency/' + 'frequency_%s.pickle' % (username) if request.method == 'POST': # when we submit a form - content = request.form['content'] - f = WordFreq(content) - lst = f.get_freq() - return render_template('userpage_post.html',username=username,lst = lst, yml=Yaml.yml) + if request.form['methodtype'] == 'multiple': + content = request.form['content'] + f = WordFreq(content) + lst = f.get_freq() + userwordlist = pickle_idea2.dict2lst(load_freq_history(user_freq_record)) + userwordlist2 = [] + for t in userwordlist: + userwordlist2.append(t[0]) + return render_template('userpage_post.html', username=username, lst = lst, yml=Yaml.yml, userwordlist=userwordlist2) + else: + word = request.form['content'].lower() # 指定单词 + rq = RecordQuery(path_prefix + 'static/wordfreqapp.db') + rq.instructions("SELECT * FROM article") # 获取所有文章 + rq.do() + result = rq.get_results() + mincount, maxcount, totcount, totcnt = 1e9, 0, 0, 0 + maxrate, minrate = 0.0, 100.0 + for d in result: # 查找数据库内的所有文章 + str = d['text'].lower().split() + cnt = str.count(word) + tot = len(str) + totcnt += tot + mincount = min(mincount, cnt) + maxcount = max(maxcount, cnt) + totcount += cnt + maxrate = max(maxrate, 100 * cnt / tot) + minrate = min(minrate, 100 * cnt / tot) + return render_template('analyse_word.html', mincount=mincount, maxcount=maxcount, totcount=totcount, totcnt=totcnt, + maxrate=maxrate, minrate=minrate, resultlen=len(result), + word=word, username=session.get('username')) elif request.method == 'GET': # when we load a html page d = load_freq_history(user_freq_record) @@ -127,10 +154,6 @@ def userpage(username): yml=Yaml.yml, words=words) - - - - @userService.route("/<username>/mark", methods=['GET', 'POST']) def user_mark_word(username): ''' diff --git a/app/wordfreqCMD.py b/app/wordfreqCMD.py index 9ee7e56..b890805 100644 --- a/app/wordfreqCMD.py +++ b/app/wordfreqCMD.py @@ -18,7 +18,6 @@ def freq(fruit): ''' result = [] - fruit = fruit.lower() # 字母转小写 flst = fruit.split() # 字符串转成list c = collections.Counter(flst) @@ -27,24 +26,63 @@ def freq(fruit): def youdao_link(s): # 有道链接 - link = 'http://youdao.com/w/eng/' + s + '/#keyfrom=dict2.index'# 网址 + link = 'http://youdao.com/w/eng/' + s + '/#keyfrom=dict2.index' # 网址 return link -def file2str(fname):#文件转字符 - f = open(fname) #打开 - s = f.read() #读取 - f.close() #关闭 +def file2str(fname): # 文件转字符 + f = open(fname) # 打开 + s = f.read() # 读取 + f.close() # 关闭 return s def remove_punctuation(s): # 这里是s是形参 (parameter)。函数被调用时才给s赋值。 - special_characters = '_©~=+[]*&$%^@.,?!:;#()"“”—‘’' # 把里面的字符都去掉 + special_characters = '_©~=+[]*&$%^@.,?!:;#()"“”—' # 把里面的字符都去掉 + for c in special_characters: - s = s.replace(c, ' ') # 防止出现把 apple,apple 移掉逗号后变成 appleapple 情况 + s = s.replace(c, ' ') # 把所有符号都替换成空格,防止出现把 apple,apple 移掉逗号后变成 appleapple 情况 s = s.replace('--', ' ') s = s.strip() # 去除前后的空格 + + single_quote = '‘’\'' # 各种单引号单独处理 + n, i = len(s), 0 + t = '' # 用来收集我需要保留的字符 + while i < n: # 只有单引号前后都有英文字符,才保留 + if s[i] in single_quote: + if i == 0 or i == n - 1 or s[i - 1] == ' ' or s[i + 1] == ' ': + i = i + 1 + continue # condition 1+2 + if s[i + 1] == 's' and (i + 2 == n or s[i + 2] == ' '): + i = i + 2 + continue # condition 2 + t += '\'' # condition 3, standardize quote + else: + t += s[i] + i = i + 1 + return t + + ''' + 单引号出现在文章中的情况: + 1、某些情况下作为双引号使用,引用段落 + 这种情况一般出现在词首或词尾 + 处理方式:直接去除 + 2、表示名词所有格 + 对于单数名词以's为后缀,对于复数名词以s'或'为后缀 + 处理方式:将'其后的部分去除 + 3、单词元音位置的缩写 + 最常见的有not->n't/is->'s/have->'ve这类 + 处理方式:保留 + 上述处理方式2/3两点可能产生一种冲突: + 某些单词元音缩写后恰好以's结尾 + 但考虑到用于学习英语的文章一般不会出现过于口语化的缩写单词 + 因此要么还是表所有格,要么就是is的缩写 + 故不考虑这种冲突情况 + ''' + + ''' + 以下是原本的代码 if '\'' in s: n = len(s) t = '' # 用来收集我需要保留的字符 @@ -58,6 +96,7 @@ def remove_punctuation(s): # 这里是s是形参 (parameter)。函数被调用 return t else: return s + ''' def sort_in_descending_order(lst):# 单词按频率降序排列 @@ -74,16 +113,25 @@ def make_html_page(lst, fname): ''' 功能:把lst的信息存到fname中,以html格式。 ''' - s = '' - count = 1 - for x in lst: - # <a href="">word</a> - s += '<p>%d <a href="%s">%s</a> (%d)</p>' % (count, youdao_link(x[0]), x[0], x[1]) - count += 1 + result = '' + id = 1 + + for word in lst: + result += '<p>' + result += '%d ' % id + result += getHyperlinkHTML(word[0]) + result += ' (%d)' % word[1] + result += '</p>' + # result += '<p>%d <a href="%s">%s</a> (%d)</p>' % (count, youdao_link(x[0]), x[0], x[1]) + id += 1 + f = open(fname, 'w') - f.write(s) + f.write(result) f.close() +def getHyperlinkHTML(word): + s = '<a href="' + youdao_link(word) + '">' + word + '</a>' + return s ## main(程序入口) if __name__ == '__main__': @@ -96,12 +144,12 @@ if __name__ == '__main__': s = file2str(fname) else: print('I can accept at most 2 arguments.') - sys.exit()# 结束程序运行, 下面的代码不会被执行了。 + sys.exit() # 结束程序运行, 下面的代码不会被执行了。 s = remove_punctuation(s) # 这里是s是实参(argument),里面有值 L = freq(s) for x in sort_in_descending_order(L): - print('%s\t%d\t%s' % (x[0], x[1], youdao_link(x[0])))#函数导出 + print('%s\t%d\t%s' % (x[0], x[1], youdao_link(x[0]))) # 函数导出 # 把频率的结果放result.html中 make_html_page(sort_in_descending_order(L), 'result.html') diff --git a/requirements.txt b/requirements.txt index 2746a3b..ce2da4f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,3 +1,3 @@ -Flask==1.1.2 -selenium==3.141.0 +Flask==2.1.2 +selenium==4.2.0 PyYAML~=6.0