Compare commits
18 Commits
master
...
Bug493-Gon
Author | SHA1 | Date |
---|---|---|
田其鹭 | 4cf201095d | |
田其鹭 | 654fd52c44 | |
田其鹭 | a5c792b782 | |
田其鹭 | 0549bd9035 | |
田其鹭 | 705aa5efcb | |
田其鹭 | 61f46f7d58 | |
dktea | b9a6e4407e | |
dktea | 0962d95ce1 | |
dktea | 071d1c968b | |
dktea | 68b254f96c | |
Lan Hui | 4fb1cad745 | |
Lan Hui | 2a553e0f8f | |
Lan Hui | 4513a80bdd | |
Lan Hui | e73c66edd3 | |
Lan Hui | 69835c7f8d | |
徐幸 | 71cc627aad | |
徐幸 | 22775f9797 | |
徐幸 | 9e36644215 |
|
@ -180,6 +180,10 @@ Demo video link: https://b23.tv/QuB77m
|
|||
|
||||
Bug report: http://118.25.96.118/bugzilla/show_bug.cgi?id=215
|
||||
|
||||
<<<<<<< HEAD
|
||||
龚科丞
|
||||
*Last modified on 2021-10-17*
|
||||
=======
|
||||
|
||||
|
||||
|
||||
|
@ -194,3 +198,4 @@ Bug report: http://118.25.96.118/bugzilla/show_bug.cgi?id=489
|
|||
|
||||
*Last modified on 2023-01-30*
|
||||
|
||||
>>>>>>> d9f6df7fbe585395a19b9a08c411d841b6b89fd4
|
||||
|
|
|
@ -7,7 +7,7 @@ import random, glob
|
|||
import hashlib
|
||||
from datetime import datetime
|
||||
from flask import Flask, request, redirect, render_template, url_for, session, abort, flash, get_flashed_messages
|
||||
from difficulty import get_difficulty_level_for_user, text_difficulty_level, user_difficulty_level
|
||||
from difficulty import get_difficulty_level, text_difficulty_level, user_difficulty_level
|
||||
|
||||
|
||||
path_prefix = '/var/www/wordfreq/wordfreq/'
|
||||
|
@ -53,7 +53,7 @@ def get_today_article(user_word_list, visited_articles):
|
|||
# Choose article according to reader's level
|
||||
d1 = load_freq_history(path_prefix + 'static/frequency/frequency.p')
|
||||
d2 = load_freq_history(path_prefix + 'static/words_and_tests.p')
|
||||
d3 = get_difficulty_level_for_user(d1, d2)
|
||||
d3 = get_difficulty_level(d1, d2)
|
||||
|
||||
d = None
|
||||
result_of_generate_article = "not found"
|
||||
|
|
|
@ -8,7 +8,6 @@
|
|||
import pickle
|
||||
import math
|
||||
from wordfreqCMD import remove_punctuation, freq, sort_in_descending_order, sort_in_ascending_order
|
||||
import snowballstemmer
|
||||
|
||||
|
||||
def load_record(pickle_fname):
|
||||
|
@ -18,51 +17,41 @@ def load_record(pickle_fname):
|
|||
return d
|
||||
|
||||
|
||||
def convert_test_type_to_difficulty_level(d):
|
||||
"""
|
||||
对原本的单词库中的单词进行难度评级
|
||||
:param d: 存储了单词库pickle文件中的单词的字典
|
||||
:return:
|
||||
"""
|
||||
result = {}
|
||||
L = list(d.keys()) # in d, we have test types (e.g., CET4,CET6,BBC) for each word
|
||||
def difficulty_level_from_frequency(word, d):
|
||||
level = 1
|
||||
if not word in d:
|
||||
return level
|
||||
|
||||
if 'what' in d:
|
||||
ratio = (d['what']+1)/(d[word]+1) # what is a frequent word
|
||||
level = math.log( max(ratio, 1), 2)
|
||||
|
||||
for k in L:
|
||||
if 'CET4' in d[k]:
|
||||
result[k] = 4 # CET4 word has level 4
|
||||
elif 'OXFORD3000' in d[k]:
|
||||
result[k] = 5
|
||||
elif 'CET6' in d[k] or 'GRADUATE' in d[k]:
|
||||
result[k] = 6
|
||||
elif 'OXFORD5000' in d[k] or 'IELTS' in d[k]:
|
||||
result[k] = 7
|
||||
elif 'BBC' in d[k]:
|
||||
result[k] = 8
|
||||
|
||||
return result # {'apple': 4, ...}
|
||||
level = min(level, 8)
|
||||
return level
|
||||
|
||||
|
||||
def get_difficulty_level_for_user(d1, d2):
|
||||
"""
|
||||
d2 来自于词库的35511个已标记单词
|
||||
d1 用户不会的词
|
||||
在d2的后面添加单词,没有新建一个新的字典
|
||||
"""
|
||||
# TODO: convert_test_type_to_difficulty_level() should not be called every time. Each word's difficulty level should be pre-computed.
|
||||
d2 = convert_test_type_to_difficulty_level(d2) # 根据d2的标记评级{'apple': 4, 'abandon': 4, ...}
|
||||
stemmer = snowballstemmer.stemmer('english')
|
||||
def get_difficulty_level(d1, d2):
|
||||
d = {}
|
||||
L = list(d1.keys()) # in d1, we have freuqence for each word
|
||||
L2 = list(d2.keys()) # in d2, we have test types (e.g., CET4,CET6,BBC) for each word
|
||||
L.extend(L2)
|
||||
L3 = list(set(L)) # L3 contains all words
|
||||
for k in L3:
|
||||
if k in d2:
|
||||
if 'CET4' in d2[k]:
|
||||
d[k] = 4 # CET4 word has level 4
|
||||
elif 'CET6' in d2[k]:
|
||||
d[k] = 6
|
||||
elif 'BBC' in d2[k]:
|
||||
d[k] = 8
|
||||
if k in d1: # BBC could contain easy words that are not in CET4 or CET6. So 4 is not reasonable. Recompute difficulty level.
|
||||
d[k] = min(difficulty_level_from_frequency(k, d1), d[k])
|
||||
elif k in d1:
|
||||
d[k] = difficulty_level_from_frequency(k, d1)
|
||||
|
||||
for k in d1: # 用户的词
|
||||
if k in d2: # 如果用户的词以原型的形式存在于词库d2中
|
||||
continue # 无需评级,跳过
|
||||
else:
|
||||
stem = stemmer.stemWord(k)
|
||||
if stem in d2: # 如果用户的词的词根存在于词库d2的词根库中
|
||||
d2[k] = d2[stem] # 按照词根进行评级
|
||||
else:
|
||||
d2[k] = 3 # 如果k的词根都不在,那么就当认为是3级
|
||||
return d2
|
||||
return d
|
||||
|
||||
|
||||
|
||||
def revert_dict(d):
|
||||
'''
|
||||
|
@ -73,13 +62,12 @@ def revert_dict(d):
|
|||
for k in d:
|
||||
if type(d[k]) is list: # d[k] is a list of dates.
|
||||
lst = d[k]
|
||||
elif type(d[
|
||||
k]) is int: # for backward compatibility. d was sth like {'word':1}. The value d[k] is not a list of dates, but a number representing how frequent this word had been added to the new word book.
|
||||
elif type(d[k]) is int: # for backward compatibility. d was sth like {'word':1}. The value d[k] is not a list of dates, but a number representing how frequent this word had been added to the new word book.
|
||||
freq = d[k]
|
||||
lst = freq * ['2021082019'] # why choose this date? No particular reasons. I fix the bug in this date.
|
||||
lst = freq*['2021082019'] # why choose this date? No particular reasons. I fix the bug in this date.
|
||||
|
||||
for time_info in lst:
|
||||
date = time_info[:10] # until hour
|
||||
date = time_info[:10] # until hour
|
||||
if not date in d2:
|
||||
d2[date] = [k]
|
||||
else:
|
||||
|
@ -88,44 +76,42 @@ def revert_dict(d):
|
|||
|
||||
|
||||
def user_difficulty_level(d_user, d):
|
||||
d_user2 = revert_dict(d_user) # key is date, and value is a list of words added in that date
|
||||
d_user2 = revert_dict(d_user) # key is date, and value is a list of words added in that date
|
||||
count = 0
|
||||
geometric = 1
|
||||
for date in sorted(d_user2.keys(),
|
||||
reverse=True): # most recently added words are more important while determining user's level
|
||||
lst = d_user2[date] # a list of words
|
||||
lst2 = [] # a list of tuples, (word, difficulty level)
|
||||
for word in lst:
|
||||
for date in sorted(d_user2.keys(), reverse=True): # most recently added words are more important while determining user's level
|
||||
lst = d_user2[date] # a list of words
|
||||
lst2 = [] # a list of tuples, (word, difficulty level)
|
||||
for word in lst:
|
||||
if word in d:
|
||||
lst2.append((word, d[word]))
|
||||
|
||||
lst3 = sort_in_ascending_order(lst2) # easiest tuple first
|
||||
# print(lst3)
|
||||
lst3 = sort_in_ascending_order(lst2) # easiest tuple first
|
||||
#print(lst3)
|
||||
for t in lst3:
|
||||
word = t[0]
|
||||
hard = t[1]
|
||||
# print('WORD %s HARD %4.2f' % (word, hard))
|
||||
#print('WORD %s HARD %4.2f' % (word, hard))
|
||||
geometric = geometric * (hard)
|
||||
count += 1
|
||||
if count >= 10:
|
||||
return geometric ** (1 / count)
|
||||
return geometric**(1/count)
|
||||
|
||||
return geometric ** (1 / max(count, 1))
|
||||
return geometric**(1/max(count,1))
|
||||
|
||||
|
||||
def text_difficulty_level(s, d):
|
||||
s = remove_punctuation(s)
|
||||
L = freq(s)
|
||||
|
||||
lst = [] # a list of tuples, each tuple being (word, difficulty level)
|
||||
stop_words = {'the':1, 'and':1, 'of':1, 'to':1, 'what':1, 'in':1, 'there':1, 'when':1, 'them':1, 'would':1, 'will':1, 'out':1, 'his':1, 'mr':1, 'that':1, 'up':1, 'more':1, 'your':1, 'it':1, 'now':1, 'very':1, 'then':1, 'could':1, 'he':1, 'any':1, 'some':1, 'with':1, 'into':1, 'you':1, 'our':1, 'man':1, 'other':1, 'time':1, 'was':1, 'than':1, 'know':1, 'about':1, 'only':1, 'like':1, 'how':1, 'see':1, 'is':1, 'before':1, 'such':1, 'little':1, 'two':1, 'its':1, 'as':1, 'these':1, 'may':1, 'much':1, 'down':1, 'for':1, 'well':1, 'should':1, 'those':1, 'after':1, 'same':1, 'must':1, 'say':1, 'first':1, 'again':1, 'us':1, 'great':1, 'where':1, 'being':1, 'come':1, 'over':1, 'good':1, 'himself':1, 'am':1, 'never':1, 'on':1, 'old':1, 'here':1, 'way':1, 'at':1, 'go':1, 'upon':1, 'have':1, 'had':1, 'without':1, 'my':1, 'day':1, 'be':1, 'but':1, 'though':1, 'from':1, 'not':1, 'too':1, 'another':1, 'this':1, 'even':1, 'still':1, 'her':1, 'yet':1, 'under':1, 'by':1, 'let':1, 'just':1, 'all':1, 'because':1, 'we':1, 'always':1, 'off':1, 'yes':1, 'so':1, 'while':1, 'why':1, 'which':1, 'me':1, 'are':1, 'or':1, 'no':1, 'if':1, 'an':1, 'also':1, 'thus':1, 'who':1, 'cannot':1, 'she':1, 'whether':1} # ignore these words while computing the artile's difficulty level
|
||||
lst = [] # a list of tuples, each tuple being (word, difficulty level)
|
||||
for x in L:
|
||||
word = x[0]
|
||||
if word not in stop_words and word in d:
|
||||
if word in d:
|
||||
lst.append((word, d[word]))
|
||||
|
||||
lst2 = sort_in_descending_order(lst) # most difficult words on top
|
||||
# print(lst2)
|
||||
lst2 = sort_in_descending_order(lst) # most difficult words on top
|
||||
#print(lst2)
|
||||
count = 0
|
||||
geometric = 1
|
||||
for t in lst2:
|
||||
|
@ -133,20 +119,24 @@ def text_difficulty_level(s, d):
|
|||
hard = t[1]
|
||||
geometric = geometric * (hard)
|
||||
count += 1
|
||||
if count >= 20: # we look for n most difficult words
|
||||
return geometric ** (1 / count)
|
||||
if count >= 20: # we look for n most difficult words
|
||||
return geometric**(1/count)
|
||||
|
||||
return geometric**(1/max(count,1))
|
||||
|
||||
return geometric ** (1 / max(count, 1))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
|
||||
|
||||
d1 = load_record('frequency.p')
|
||||
# print(d1)
|
||||
#print(d1)
|
||||
|
||||
d2 = load_record('words_and_tests.p')
|
||||
# print(d2)
|
||||
#print(d2)
|
||||
|
||||
d3 = get_difficulty_level_for_user(d1, d2)
|
||||
|
||||
d3 = get_difficulty_level(d1, d2)
|
||||
|
||||
s = '''
|
||||
South Lawn
|
||||
|
@ -207,6 +197,7 @@ Amidst the aftermath of this shocking referendum vote, there is great uncertaint
|
|||
|
||||
'''
|
||||
|
||||
|
||||
s = '''
|
||||
British Prime Minister Boris Johnson walks towards a voting station during the Brexit referendum in Britain, June 23, 2016. (Photo: EPA-EFE)
|
||||
|
||||
|
@ -227,6 +218,7 @@ The prime minister was forced to ask for an extension to Britain's EU departure
|
|||
Johnson has repeatedly pledged to finalize the first stage, a transition deal, of Britain's EU divorce battle by Oct. 31. A second stage will involve negotiating its future relationship with the EU on trade, security and other salient issues.
|
||||
'''
|
||||
|
||||
|
||||
s = '''
|
||||
Thank you very much. We have a Cabinet meeting. We’ll have a few questions after grace. And, if you would, Ben, please do the honors.
|
||||
|
||||
|
@ -241,11 +233,17 @@ We need — for our farmers, our manufacturers, for, frankly, unions and non-uni
|
|||
|
||||
'''
|
||||
|
||||
# f = open('bbc-fulltext/bbc/entertainment/001.txt')
|
||||
|
||||
|
||||
|
||||
#f = open('bbc-fulltext/bbc/entertainment/001.txt')
|
||||
f = open('wordlist.txt')
|
||||
s = f.read()
|
||||
f.close()
|
||||
|
||||
|
||||
|
||||
|
||||
print(text_difficulty_level(s, d3))
|
||||
|
||||
|
||||
|
||||
|
|
|
@ -84,6 +84,10 @@ def mainpage():
|
|||
content = escape(request.form['content'])
|
||||
f = WordFreq(content)
|
||||
lst = f.get_freq()
|
||||
for i in lst:
|
||||
if '&' in i[0]:
|
||||
i[0].replace('&','\'')
|
||||
#此处由于找不到输出的所以对其输入的格式进行重新规范
|
||||
# save history
|
||||
d = load_freq_history(path_prefix + 'static/frequency/frequency.p')
|
||||
lst_history = pickle_idea.dict2lst(d)
|
||||
|
|
|
@ -7,7 +7,6 @@ css:
|
|||
js:
|
||||
head: # 在页面加载之前加载
|
||||
- ../static/js/jquery.js
|
||||
- ../static/js/read.js
|
||||
- ../static/js/word_operation.js
|
||||
bottom: # 在页面加载完之后加载
|
||||
- ../static/js/fillword.js
|
||||
|
|
|
@ -1,5 +1,9 @@
|
|||
let isRead = true;
|
||||
let isChoose = true;
|
||||
let reader = window.speechSynthesis; // 全局定义朗读者,以便朗读和暂停
|
||||
let current_position = 0; // 朗读文本的当前位置
|
||||
let original_position = 0; // 朗读文本的初始位置
|
||||
let to_speak = ""; // 朗读的初始内容
|
||||
|
||||
function getWord() {
|
||||
return window.getSelection ? window.getSelection() : document.selection.createRange().text;
|
||||
|
@ -7,7 +11,7 @@ function getWord() {
|
|||
|
||||
function fillInWord() {
|
||||
let word = getWord();
|
||||
if (isRead) Reader.read(word, inputSlider.value);
|
||||
if (isRead) read(word);
|
||||
if (!isChoose) return;
|
||||
const element = document.getElementById("selected-words");
|
||||
element.value = element.value + " " + word;
|
||||
|
@ -15,17 +19,50 @@ function fillInWord() {
|
|||
|
||||
document.getElementById("text-content").addEventListener("click", fillInWord, false);
|
||||
|
||||
const sliderValue = document.getElementById("rangeValue");
|
||||
const inputSlider = document.getElementById("rangeComponent");
|
||||
/**
 * Build a speech-synthesis utterance for a piece of text.
 *
 * While the utterance is spoken, every "word" boundary event stores the
 * event's character index in the file-level `current_position` variable.
 *
 * @param {string} str  Text to be spoken.
 * @param {number|string} rate  Value assigned to the utterance's `rate`.
 * @returns {SpeechSynthesisUtterance} The configured utterance (not yet queued).
 */
function makeUtterance(str, rate) {
    const msg = new SpeechSynthesisUtterance(str);
    msg.rate = rate;
    msg.lang = "en-US"; // TODO: add language options menu
    msg.onboundary = (ev) => {
        // Only word boundaries update the position; other boundary kinds are ignored.
        if (ev.name === "word") {
            current_position = ev.charIndex;
        }
    };
    return msg;
}
|
||||
|
||||
const sliderValue = document.getElementById("rangeValue"); // 显示值
|
||||
const inputSlider = document.getElementById("rangeComponent"); // 滑块元素
|
||||
inputSlider.oninput = () => {
|
||||
let value = inputSlider.value;
|
||||
let value = inputSlider.value; // 获取滑块的值
|
||||
sliderValue.textContent = value + '×';
|
||||
if (!reader.speaking) return;
|
||||
reader.cancel();
|
||||
let msg = makeUtterance(to_speak.substring(original_position + current_position), value);
|
||||
original_position = original_position + current_position;
|
||||
current_position = 0;
|
||||
reader.speak(msg);
|
||||
};
|
||||
|
||||
/**
 * Start speaking the given value from its beginning.
 *
 * The value is stringified into the file-level `to_speak`, both position
 * counters are reset to zero, and an utterance built with the slider's
 * current value as rate is handed to `reader`.
 *
 * @param {*} s  Value to speak; converted with `toString()`.
 */
function read(s) {
    to_speak = s.toString();
    // A fresh text always starts at offset 0.
    original_position = 0;
    current_position = 0;
    reader.speak(makeUtterance(to_speak, inputSlider.value));
}
|
||||
|
||||
/**
 * Toggle the file-level `isRead` flag; when the flag ends up off,
 * cancel whatever `reader` is doing.
 */
function onReadClick() {
    const wasOn = isRead;
    isRead = !wasOn;
    if (wasOn) {
        // The flag has just been switched off.
        reader.cancel();
    }
}
|
||||
|
||||
/**
 * Flip the file-level `isChoose` flag (checked by fillInWord before it
 * appends the selection to the "selected-words" field).
 */
function onChooseClick() {
    isChoose = !isChoose;
}
|
||||
|
||||
/**
 * Cancel speech on the file-level `reader` object.
 */
function stopRead() {
    reader.cancel();
}
|
|
@ -1,35 +0,0 @@
|
|||
/**
 * Reader: small module wrapping window.speechSynthesis.
 *
 * Exposes two functions:
 *   - read(s, rate): speak `s` from its beginning at the given rate;
 *   - stopRead():    cancel speech.
 * The position counters and the text being spoken are private to the closure.
 */
var Reader = (function() {
    const reader = window.speechSynthesis;
    let current_position = 0;   // char index of the last "word" boundary event
    let original_position = 0;  // reset to 0 by read()
    let to_speak = "";          // text handed to the most recent read()

    /**
     * Build an utterance for `str` with the given rate and an "en-US" language tag.
     * Word-boundary events update `current_position`.
     */
    function makeUtterance(str, rate) {
        const msg = new SpeechSynthesisUtterance(str);
        msg.rate = rate;
        msg.lang = "en-US";
        msg.onboundary = (ev) => {
            if (ev.name === "word") {
                current_position = ev.charIndex;
            }
        };
        return msg;
    }

    /**
     * Speak `s` (stringified with toString()) from the start at `rate`.
     */
    function read(s, rate) {
        to_speak = s.toString();
        original_position = 0;
        current_position = 0;
        const msg = makeUtterance(to_speak, rate);
        reader.speak(msg);
    }

    /** Cancel speech on the wrapped synthesizer. */
    function stopRead() {
        reader.cancel();
    }

    return {
        read: read,
        stopRead: stopRead
    };
})();
|
|
@ -62,13 +62,6 @@ function delete_word(theWord) {
|
|||
});
|
||||
}
|
||||
|
||||
/**
 * Read aloud the text of the element whose id is "word_" + theWord.
 *
 * @param {string} theWord  Word used to build the element id.
 */
function read_word(theWord) {
    const text = $("#word_" + theWord).text();
    // NOTE(review): these two names are not declared in this function;
    // confirm they are meant to be shared globals rather than leftovers.
    original_position = 0;
    current_position = 0;
    Reader.read(text, inputSlider.value);
}
|
||||
|
||||
/*
|
||||
* interface Word {
|
||||
* word: string,
|
||||
|
@ -102,7 +95,6 @@ function wordTemplate(word) {
|
|||
<a class="btn btn-success" onclick="familiar('${word.word}')" role="button">熟悉</a>
|
||||
<a class="btn btn-warning" onclick="unfamiliar('${word.word}')" role="button">不熟悉</a>
|
||||
<a class="btn btn-danger" onclick="delete_word('${word.word}')" role="button">删除</a>
|
||||
<a class="btn btn-info" onclick="read_word('${word.word}')" role="button">朗读</a>
|
||||
</p>`;
|
||||
}
|
||||
|
||||
|
|
Binary file not shown.
|
@ -37,6 +37,16 @@
|
|||
<body>
|
||||
<div class="container-fluid">
|
||||
<p><b>English Pal for <font id="username" color="red">{{ username }}</font></b>
|
||||
<a class="btn btn-secondary" href="/logout" role="button">退出</a>
|
||||
<a class="btn btn-secondary" href="/reset" role="button">重设密码</a>
|
||||
</p>
|
||||
{% if flashed_messages != [] %}
|
||||
<div class="alert alert-warning" role="alert" id="warn_info">
|
||||
{{ flashed_messages|safe }}
|
||||
</div>
|
||||
{% endif %}
|
||||
|
||||
|
||||
|
||||
{% if username == admin_name %}
|
||||
<a class="btn btn-secondary" href="/admin" role="button" onclick="stopRead()">管理</a>
|
||||
|
@ -113,7 +123,7 @@
|
|||
|
||||
{% if d_len > 0 %}
|
||||
<p>
|
||||
<b>我的生词簿</b>
|
||||
<b>我的生词簿</b>
|
||||
<label for="move_dynamiclly">
|
||||
<input type="checkbox" name="move_dynamiclly" id="move_dynamiclly" checked>
|
||||
允许动态调整顺序
|
||||
|
@ -133,7 +143,6 @@
|
|||
<a class="btn btn-success" onclick="familiar('{{ word }}')" role="button">熟悉</a>
|
||||
<a class="btn btn-warning" onclick="unfamiliar('{{ word }}')" role="button">不熟悉</a>
|
||||
<a class="btn btn-danger" onclick="delete_word('{{ word }}')" role="button">删除</a>
|
||||
<a class="btn btn-info" onclick="read_word('{{ word }}')" role="button">朗读</a>
|
||||
</p>
|
||||
{% endfor %}
|
||||
</div>
|
||||
|
|
|
@ -102,8 +102,7 @@ def deleteword(username, word):
|
|||
'''
|
||||
user_freq_record = path_prefix + 'static/frequency/' + 'frequency_%s.pickle' % (username)
|
||||
pickle_idea2.deleteRecord(user_freq_record, word)
|
||||
# 模板userpage_get.html中删除单词是异步执行,而flash的信息后续是同步执行的,所以注释这段代码;同时如果这里使用flash但不提取信息,则会影响 signup.html的显示。bug复现:删除单词后,点击退出,点击注册,注册页面就会出现提示信息
|
||||
# flash(f'{word} is no longer in your word list.')
|
||||
flash(f'{word} is no longer in your word list.')
|
||||
return "success"
|
||||
|
||||
|
||||
|
@ -151,7 +150,7 @@ def userpage(username):
|
|||
admin_name=ADMIN_NAME,
|
||||
username=username,
|
||||
session=session,
|
||||
# flashed_messages=get_flashed_messages(), 仅有删除单词的时候使用到flash,而删除单词是异步执行,这里的信息提示是同步执行,所以就没有存在的必要了
|
||||
flashed_messages=get_flashed_messages(),
|
||||
today_article=today_article,
|
||||
result_of_generate_article=result_of_generate_article,
|
||||
d_len=len(d),
|
||||
|
@ -188,3 +187,13 @@ def get_time():
|
|||
'''
|
||||
return datetime.now().strftime('%Y%m%d%H%M') # upper to minutes
|
||||
|
||||
def get_flashed_messages_if_any():
    '''
    Collect the pending flashed messages for the yellow notice shown on the user page.

    :return: all flashed messages concatenated into one string (the messages
             may contain HTML tags); an empty string when there are none
    '''
    # get_flashed_messages must be *called*: iterating over the function
    # object itself raises TypeError.
    return ''.join(get_flashed_messages())
|
||||
|
|
|
@ -38,20 +38,46 @@ def file2str(fname):#文件转字符
|
|||
return s
|
||||
|
||||
|
||||
def str2chararray(str):  # parameter name shadows the builtin; kept so existing callers still work
    """Convert a string into a list of its characters.

    :param str: the text to split
    :return: a new list holding each character of ``str`` in order
             (an empty list for an empty string)
    """
    return list(str)
|
||||
|
||||
|
||||
def remove_punctuation(s): # 这里是s是形参 (parameter)。函数被调用时才给s赋值。
|
||||
special_characters = '\_©~<=>+/[]*&$%^@.,?!:;#()"“”—‘’{}|' # 把里面的字符都去掉
|
||||
|
||||
# carr = str2chararray(s) # 字符串转字符数组
|
||||
# print(carr)
|
||||
special_characters = '&_~=+[]%^@.,?!:;#()"“”—‘’|/\\<>{}' # 把里面的字符都去掉
|
||||
special_words = ('$lt', '$gt', '$') # 特殊词汇
|
||||
|
||||
special_characters = '\_©~<=>+-/[]*&$%^@.,?!:;#()"“”—‘’{}|' # 把里面的字符都去掉
|
||||
|
||||
for c in special_characters:
|
||||
s = s.replace(c, ' ') # 防止出现把 apple,apple 移掉逗号后变成 appleapple 情况
|
||||
s = s.replace(c, ' ') # 防止出现把 apple,apple 移掉逗号后变成 appleapple 情况
|
||||
carr = str2chararray(s) # 字符串转字符数组
|
||||
for i, value in enumerate(carr):
|
||||
if value == '&': # 遍历替换
|
||||
carr[i] = '\''
|
||||
print('sss' + s)
|
||||
for j in range(1,5):
|
||||
if carr[i+j] in [' ','3','9']:
|
||||
carr[i + j]=''
|
||||
s = ''.join(carr) # 字符数组转字符串
|
||||
print('sss'+s)
|
||||
for w in special_words: # 替换字符串中的剩余特殊字符
|
||||
s = s.replace(w, ' ')
|
||||
s = s.replace('--', ' ')
|
||||
s = s.strip() # 去除前后的空格
|
||||
|
||||
s = s.strip() # 去除前后的空格
|
||||
|
||||
if '\'' in s:
|
||||
n = len(s)
|
||||
t = '' # 用来收集我需要保留的字符
|
||||
for i in range(n): # 只有单引号前后都有英文字符,才保留
|
||||
t = '' # 用来收集我需要保留的字符
|
||||
for i in range(n): # 只有单引号前后都有英文字符,才保留
|
||||
if s[i] == '\'':
|
||||
i_is_ok = i - 1 >= 0 and i + 1 < n
|
||||
if i_is_ok and s[i-1] in string.ascii_letters and s[i+1] in string.ascii_letters:
|
||||
if i_is_ok and s[i - 1] in string.ascii_letters and s[i + 1] in string.ascii_letters:
|
||||
t += s[i]
|
||||
else:
|
||||
t += s[i]
|
||||
|
@ -60,6 +86,7 @@ def remove_punctuation(s): # 这里是s是形参 (parameter)。函数被调用
|
|||
return s
|
||||
|
||||
|
||||
|
||||
def sort_in_descending_order(lst):# 单词按频率降序排列
|
||||
lst2 = sorted(lst, reverse=True, key=lambda x: (x[1], x[0]))
|
||||
return lst2
|
||||
|
|
|
@ -2,4 +2,3 @@ Flask==1.1.2
|
|||
selenium==3.141.0
|
||||
PyYAML~=6.0
|
||||
pony==0.7.16
|
||||
snowballstemmer==2.2.0
|
||||
|
|
Loading…
Reference in New Issue