pull最新的snapshot-20230511,后更新了difficulty.py和Article.py的部分代码,提交了新的pickle文件
parent
d9f6df7fbe
commit
39d96014d9
|
@ -7,7 +7,7 @@ import random, glob
|
||||||
import hashlib
|
import hashlib
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from flask import Flask, request, redirect, render_template, url_for, session, abort, flash, get_flashed_messages
|
from flask import Flask, request, redirect, render_template, url_for, session, abort, flash, get_flashed_messages
|
||||||
from difficulty import get_difficulty_level, text_difficulty_level, user_difficulty_level
|
from difficulty import get_difficulty_level_for_user, text_difficulty_level, user_difficulty_level
|
||||||
|
|
||||||
|
|
||||||
path_prefix = '/var/www/wordfreq/wordfreq/'
|
path_prefix = '/var/www/wordfreq/wordfreq/'
|
||||||
|
@ -53,7 +53,7 @@ def get_today_article(user_word_list, visited_articles):
|
||||||
# Choose article according to reader's level
|
# Choose article according to reader's level
|
||||||
d1 = load_freq_history(path_prefix + 'static/frequency/frequency.p')
|
d1 = load_freq_history(path_prefix + 'static/frequency/frequency.p')
|
||||||
d2 = load_freq_history(path_prefix + 'static/words_and_tests.p')
|
d2 = load_freq_history(path_prefix + 'static/words_and_tests.p')
|
||||||
d3 = get_difficulty_level(d1, d2)
|
d3 = get_difficulty_level_for_user(d1, d2)
|
||||||
|
|
||||||
d = None
|
d = None
|
||||||
result_of_generate_article = "not found"
|
result_of_generate_article = "not found"
|
||||||
|
|
|
@ -8,6 +8,7 @@
|
||||||
import pickle
|
import pickle
|
||||||
import math
|
import math
|
||||||
from wordfreqCMD import remove_punctuation, freq, sort_in_descending_order, sort_in_ascending_order
|
from wordfreqCMD import remove_punctuation, freq, sort_in_descending_order, sort_in_ascending_order
|
||||||
|
import snowballstemmer
|
||||||
|
|
||||||
|
|
||||||
def load_record(pickle_fname):
|
def load_record(pickle_fname):
|
||||||
|
@ -17,40 +18,48 @@ def load_record(pickle_fname):
|
||||||
return d
|
return d
|
||||||
|
|
||||||
|
|
||||||
def difficulty_level_from_frequency(word, d):
|
def convert_test_type_to_difficulty_level(d):
|
||||||
level = 1
|
"""
|
||||||
if not word in d:
|
对原本的单词库中的单词进行难度评级
|
||||||
return level
|
:param d: 存储了单词库pickle文件中的单词的字典
|
||||||
|
:return:
|
||||||
|
"""
|
||||||
|
result = {}
|
||||||
|
L = list(d.keys()) # in d, we have test types (e.g., CET4,CET6,BBC) for each word
|
||||||
|
|
||||||
if 'what' in d:
|
for k in L:
|
||||||
ratio = (d['what']+1)/(d[word]+1) # what is a frequent word
|
if 'CET4' in d[k]:
|
||||||
level = math.log( max(ratio, 1), 2)
|
result[k] = 4 # CET4 word has level 4
|
||||||
|
elif 'OXFORD3000' in d[k]:
|
||||||
|
result[k] = 5
|
||||||
|
elif 'CET6' in d[k] or 'GRADUATE' in d[k]:
|
||||||
|
result[k] = 6
|
||||||
|
elif 'OXFORD5000' in d[k] or 'IELTS' in d[k]:
|
||||||
|
result[k] = 7
|
||||||
|
elif 'BBC' in d[k]:
|
||||||
|
result[k] = 8
|
||||||
|
|
||||||
level = min(level, 8)
|
return result # {'apple': 4, ...}
|
||||||
return level
|
|
||||||
|
|
||||||
|
|
||||||
def get_difficulty_level(d1, d2):
|
def get_difficulty_level_for_user(d1, d2):
|
||||||
d = {}
|
"""
|
||||||
L = list(d1.keys()) # in d1, we have freuqence for each word
|
d2 来自于词库的35511个已标记单词
|
||||||
L2 = list(d2.keys()) # in d2, we have test types (e.g., CET4,CET6,BBC) for each word
|
d1 用户不会的词
|
||||||
L.extend(L2)
|
在d2的后面添加单词,没有新建一个新的字典
|
||||||
L3 = list(set(L)) # L3 contains all words
|
"""
|
||||||
for k in L3:
|
d2 = convert_test_type_to_difficulty_level(d2) # 根据d2的标记评级{'apple': 4, 'abandon': 4, ...}
|
||||||
if k in d2:
|
stem = snowballstemmer.stemmer('english')
|
||||||
if 'CET4' in d2[k]:
|
|
||||||
d[k] = 4 # CET4 word has level 4
|
|
||||||
elif 'CET6' in d2[k]:
|
|
||||||
d[k] = 6
|
|
||||||
elif 'BBC' in d2[k]:
|
|
||||||
d[k] = 8
|
|
||||||
if k in d1: # BBC could contain easy words that are not in CET4 or CET6. So 4 is not reasonable. Recompute difficulty level.
|
|
||||||
d[k] = min(difficulty_level_from_frequency(k, d1), d[k])
|
|
||||||
elif k in d1:
|
|
||||||
d[k] = difficulty_level_from_frequency(k, d1)
|
|
||||||
|
|
||||||
return d
|
|
||||||
|
|
||||||
|
for k in d1: # 用户的词
|
||||||
|
if k in d2: # 如果用户的词以原型的形式存在于词库d2中
|
||||||
|
continue # 无需评级,跳过
|
||||||
|
elif stem.stemWord(k) in d2: # 如果用户的词的词根存在于词库d2的词根库中
|
||||||
|
d2[k] = d2[stem.stemWord(k)] # 按照词根进行评级
|
||||||
|
break
|
||||||
|
else:
|
||||||
|
d2[k] = 3 # 如果k的词根都不在,那么就当认为是3级
|
||||||
|
return d2
|
||||||
|
|
||||||
|
|
||||||
def revert_dict(d):
|
def revert_dict(d):
|
||||||
|
@ -62,12 +71,13 @@ def revert_dict(d):
|
||||||
for k in d:
|
for k in d:
|
||||||
if type(d[k]) is list: # d[k] is a list of dates.
|
if type(d[k]) is list: # d[k] is a list of dates.
|
||||||
lst = d[k]
|
lst = d[k]
|
||||||
elif type(d[k]) is int: # for backward compatibility. d was sth like {'word':1}. The value d[k] is not a list of dates, but a number representing how frequent this word had been added to the new word book.
|
elif type(d[
|
||||||
|
k]) is int: # for backward compatibility. d was sth like {'word':1}. The value d[k] is not a list of dates, but a number representing how frequent this word had been added to the new word book.
|
||||||
freq = d[k]
|
freq = d[k]
|
||||||
lst = freq*['2021082019'] # why choose this date? No particular reasons. I fix the bug in this date.
|
lst = freq * ['2021082019'] # why choose this date? No particular reasons. I fix the bug in this date.
|
||||||
|
|
||||||
for time_info in lst:
|
for time_info in lst:
|
||||||
date = time_info[:10] # until hour
|
date = time_info[:10] # until hour
|
||||||
if not date in d2:
|
if not date in d2:
|
||||||
d2[date] = [k]
|
d2[date] = [k]
|
||||||
else:
|
else:
|
||||||
|
@ -76,42 +86,43 @@ def revert_dict(d):
|
||||||
|
|
||||||
|
|
||||||
def user_difficulty_level(d_user, d):
|
def user_difficulty_level(d_user, d):
|
||||||
d_user2 = revert_dict(d_user) # key is date, and value is a list of words added in that date
|
d_user2 = revert_dict(d_user) # key is date, and value is a list of words added in that date
|
||||||
count = 0
|
count = 0
|
||||||
geometric = 1
|
geometric = 1
|
||||||
for date in sorted(d_user2.keys(), reverse=True): # most recently added words are more important while determining user's level
|
for date in sorted(d_user2.keys(),
|
||||||
lst = d_user2[date] # a list of words
|
reverse=True): # most recently added words are more important while determining user's level
|
||||||
lst2 = [] # a list of tuples, (word, difficulty level)
|
lst = d_user2[date] # a list of words
|
||||||
for word in lst:
|
lst2 = [] # a list of tuples, (word, difficulty level)
|
||||||
|
for word in lst:
|
||||||
if word in d:
|
if word in d:
|
||||||
lst2.append((word, d[word]))
|
lst2.append((word, d[word]))
|
||||||
|
|
||||||
lst3 = sort_in_ascending_order(lst2) # easiest tuple first
|
lst3 = sort_in_ascending_order(lst2) # easiest tuple first
|
||||||
#print(lst3)
|
# print(lst3)
|
||||||
for t in lst3:
|
for t in lst3:
|
||||||
word = t[0]
|
word = t[0]
|
||||||
hard = t[1]
|
hard = t[1]
|
||||||
#print('WORD %s HARD %4.2f' % (word, hard))
|
# print('WORD %s HARD %4.2f' % (word, hard))
|
||||||
geometric = geometric * (hard)
|
geometric = geometric * (hard)
|
||||||
count += 1
|
count += 1
|
||||||
if count >= 10:
|
if count >= 10:
|
||||||
return geometric**(1/count)
|
return geometric ** (1 / count)
|
||||||
|
|
||||||
return geometric**(1/max(count,1))
|
return geometric ** (1 / max(count, 1))
|
||||||
|
|
||||||
|
|
||||||
def text_difficulty_level(s, d):
|
def text_difficulty_level(s, d):
|
||||||
s = remove_punctuation(s)
|
s = remove_punctuation(s)
|
||||||
L = freq(s)
|
L = freq(s)
|
||||||
|
|
||||||
lst = [] # a list of tuples, each tuple being (word, difficulty level)
|
lst = [] # a list of tuples, each tuple being (word, difficulty level)
|
||||||
for x in L:
|
for x in L:
|
||||||
word = x[0]
|
word = x[0]
|
||||||
if word in d:
|
if word in d:
|
||||||
lst.append((word, d[word]))
|
lst.append((word, d[word]))
|
||||||
|
|
||||||
lst2 = sort_in_descending_order(lst) # most difficult words on top
|
lst2 = sort_in_descending_order(lst) # most difficult words on top
|
||||||
#print(lst2)
|
# print(lst2)
|
||||||
count = 0
|
count = 0
|
||||||
geometric = 1
|
geometric = 1
|
||||||
for t in lst2:
|
for t in lst2:
|
||||||
|
@ -119,24 +130,20 @@ def text_difficulty_level(s, d):
|
||||||
hard = t[1]
|
hard = t[1]
|
||||||
geometric = geometric * (hard)
|
geometric = geometric * (hard)
|
||||||
count += 1
|
count += 1
|
||||||
if count >= 20: # we look for n most difficult words
|
if count >= 20: # we look for n most difficult words
|
||||||
return geometric**(1/count)
|
return geometric ** (1 / count)
|
||||||
|
|
||||||
return geometric**(1/max(count,1))
|
|
||||||
|
|
||||||
|
return geometric ** (1 / max(count, 1))
|
||||||
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
if __name__ == '__main__':
|
||||||
|
|
||||||
|
|
||||||
d1 = load_record('frequency.p')
|
d1 = load_record('frequency.p')
|
||||||
#print(d1)
|
# print(d1)
|
||||||
|
|
||||||
d2 = load_record('words_and_tests.p')
|
d2 = load_record('words_and_tests.p')
|
||||||
#print(d2)
|
# print(d2)
|
||||||
|
|
||||||
|
d3 = get_difficulty_level_for_user(d1, d2)
|
||||||
d3 = get_difficulty_level(d1, d2)
|
|
||||||
|
|
||||||
s = '''
|
s = '''
|
||||||
South Lawn
|
South Lawn
|
||||||
|
@ -197,7 +204,6 @@ Amidst the aftermath of this shocking referendum vote, there is great uncertaint
|
||||||
|
|
||||||
'''
|
'''
|
||||||
|
|
||||||
|
|
||||||
s = '''
|
s = '''
|
||||||
British Prime Minister Boris Johnson walks towards a voting station during the Brexit referendum in Britain, June 23, 2016. (Photo: EPA-EFE)
|
British Prime Minister Boris Johnson walks towards a voting station during the Brexit referendum in Britain, June 23, 2016. (Photo: EPA-EFE)
|
||||||
|
|
||||||
|
@ -218,7 +224,6 @@ The prime minister was forced to ask for an extension to Britain's EU departure
|
||||||
Johnson has repeatedly pledged to finalize the first stage, a transition deal, of Britain's EU divorce battle by Oct. 31. A second stage will involve negotiating its future relationship with the EU on trade, security and other salient issues.
|
Johnson has repeatedly pledged to finalize the first stage, a transition deal, of Britain's EU divorce battle by Oct. 31. A second stage will involve negotiating its future relationship with the EU on trade, security and other salient issues.
|
||||||
'''
|
'''
|
||||||
|
|
||||||
|
|
||||||
s = '''
|
s = '''
|
||||||
Thank you very much. We have a Cabinet meeting. We’ll have a few questions after grace. And, if you would, Ben, please do the honors.
|
Thank you very much. We have a Cabinet meeting. We’ll have a few questions after grace. And, if you would, Ben, please do the honors.
|
||||||
|
|
||||||
|
@ -233,17 +238,11 @@ We need — for our farmers, our manufacturers, for, frankly, unions and non-uni
|
||||||
|
|
||||||
'''
|
'''
|
||||||
|
|
||||||
|
# f = open('bbc-fulltext/bbc/entertainment/001.txt')
|
||||||
|
|
||||||
|
|
||||||
#f = open('bbc-fulltext/bbc/entertainment/001.txt')
|
|
||||||
f = open('wordlist.txt')
|
f = open('wordlist.txt')
|
||||||
s = f.read()
|
s = f.read()
|
||||||
f.close()
|
f.close()
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
print(text_difficulty_level(s, d3))
|
print(text_difficulty_level(s, d3))
|
||||||
|
|
||||||
|
|
||||||
|
|
Binary file not shown.
Loading…
Reference in New Issue