fix BUG543
parent
cb576b40ed
commit
db66c8ed86
|
@ -7,7 +7,7 @@
|
||||||
|
|
||||||
import pickle
|
import pickle
|
||||||
import math
|
import math
|
||||||
from wordfreqCMD import remove_punctuation, freq, sort_in_descending_order, sort_in_ascending_order
|
from wordfreqCMD import remove_punctuation, freq, sort_in_descending_order, sort_in_ascending_order, map_percentages_to_levels
|
||||||
import snowballstemmer
|
import snowballstemmer
|
||||||
|
|
||||||
|
|
||||||
|
@ -94,30 +94,58 @@ def revert_dict(d):
|
||||||
return d2
|
return d2
|
||||||
|
|
||||||
|
|
||||||
def user_difficulty_level(d_user, d, calc_func=0):
    '''
    Estimate a user's difficulty level from the words in their word book.

    Only words that appear in ``d`` contribute; unknown words are ignored.

    d_user    -- the user's word book. It is passed to revert_dict(), whose
                 result is used as a mapping from a date to the list of
                 words added on that date.
    d         -- mapping from a word to its numeric difficulty level.
    calc_func -- 0 (default): weighted average of the difficulty levels,
                 with weights supplied by map_percentages_to_levels().
                 Any other value: geometric mean of the difficulty levels.

    Returns 1 (weighted average) or 1.0 (geometric mean) when none of the
    user's words is found in ``d``.

    The geometric mean goes through math.log(), so it raises ValueError if
    a difficulty level is zero or negative.
    '''
    words_by_date = revert_dict(d_user)

    # Difficulty levels of every known word in the word book.  Both
    # formulas are order-independent, so no per-date sorting is needed.
    levels = [
        d[word]
        for words in words_by_date.values()
        for word in words
        if word in d
    ]

    if calc_func != 0:
        # Geometric mean computed in log space so that a long word book
        # cannot overflow the running product.
        log_total = math.fsum(math.log(level) for level in levels)
        return math.exp(log_total / max(len(levels), 1))

    if not levels:
        return 1

    # Share of each difficulty level among the known words.
    level_counts = {}
    for level in levels:
        level_counts[level] = level_counts.get(level, 0) + 1
    total = len(levels)
    percentages = {level: n / total for level, n in level_counts.items()}

    weights = map_percentages_to_levels(percentages)
    return sum(weight * level for level, weight in weights.items())
|
|
||||||
|
|
||||||
|
|
||||||
def text_difficulty_level(s, d):
|
def text_difficulty_level(s, d):
|
||||||
|
|
|
@ -10,6 +10,32 @@ import operator
|
||||||
import os, sys # 引入模块sys,因为我要用里面的sys.argv列表中的信息来读取命令行参数。
|
import os, sys # 引入模块sys,因为我要用里面的sys.argv列表中的信息来读取命令行参数。
|
||||||
import pickle_idea
|
import pickle_idea
|
||||||
|
|
||||||
|
|
||||||
|
def map_percentages_to_levels(percentages):
    '''
    Purpose: turn the share of each difficulty level in a word book into
             normalized weights for a weighted-average difficulty score.
             The raw weight of a level is (10 - level) * share of that
             level; the raw weights are then scaled so they sum to 1.
    Input:   dict mapping a difficulty level (expected 3~8) to the
             proportion of words that have that level.
    Output:  dict mapping each difficulty level to its weight, with keys
             in ascending order.

    If the raw weights sum to zero (for example every word is level 10),
    normalization is impossible; a copy of the input proportions is
    returned instead of raising ZeroDivisionError.
    '''
    sorted_keys = sorted(percentages.keys())

    # Raw weight per level: easier levels (smaller keys) get a larger factor.
    raw_weights = {k: (10 - k) * percentages[k] for k in sorted_keys}

    total = 0
    for k in sorted_keys:
        total += raw_weights[k]

    if total == 0:
        # Nothing to normalize against; fall back to the plain proportions.
        return {k: percentages[k] for k in sorted_keys}

    # Normalize so that the weights sum to 1.
    return {k: raw_weights[k] / total for k in sorted_keys}
|
||||||
|
|
||||||
|
|
||||||
def freq(fruit):
|
def freq(fruit):
|
||||||
'''
|
'''
|
||||||
功能: 把字符串转成列表。 目的是得到每个单词的频率。
|
功能: 把字符串转成列表。 目的是得到每个单词的频率。
|
||||||
|
|
Loading…
Reference in New Issue