import math


# NOTE: in the patch this helper is added to app/wordfreqCMD.py and imported
# by app/difficulty.py; both definitions are shown together here.
def map_percentages_to_levels(percentages):
    """Turn a difficulty distribution into normalised per-level weights.

    Each level ``k`` receives the raw weight ``(10 - k) * percentages[k]``,
    so easier levels weigh more than harder ones.  The raw weights are then
    scaled so that they add up to 1.

    Args:
        percentages: dict mapping a difficulty level (documented by the
            original author as 3 to 8) to the share of words at that level.

    Returns:
        dict with the same keys (in ascending order) mapping each level to
        its normalised weight.  An empty input yields an empty dict.  When
        the raw weights add up to zero (for example every word is at level
        10) there is nothing to normalise by, so the input shares are
        returned unchanged instead of raising ``ZeroDivisionError``.
    """
    levels = sorted(percentages)
    raw_weights = {level: (10 - level) * percentages[level] for level in levels}
    total_weight = sum(raw_weights[level] for level in levels)

    if total_weight == 0:
        return {level: percentages[level] for level in levels}

    return {level: raw_weights[level] / total_weight for level in levels}


def user_difficulty_level(d_user, d, calc_func=0):
    """Estimate a user's vocabulary level from the words they have recorded.

    Args:
        d_user: the user's word record.  It is passed to ``revert_dict`` to
            obtain a dict keyed by date whose values are the lists of words
            added on that date.
        d: dict mapping a word to its difficulty level.  Recorded words that
            are missing from ``d`` are ignored.
        calc_func: ``0`` (the default) selects the weighted average; any
            other value selects the geometric mean.

    Returns:
        The estimated level.  When no recorded word has a known difficulty
        the weighted average returns ``1`` and the geometric mean ``1.0``.
    """
    # One entry per (date, word) pair, so a word recorded on several dates
    # counts several times, exactly as in the per-date loops it replaces.
    levels = [
        d[word]
        for words in revert_dict(d_user).values()
        for word in words
        if word in d
    ]

    if calc_func != 0:
        # Geometric mean, computed in log space.  The order of the terms does
        # not change the mean, so no sorting by date or difficulty is needed.
        # Non-positive levels have no logarithm and are left out rather than
        # letting math.log raise ValueError.
        positive_levels = [level for level in levels if level > 0]
        if not positive_levels:
            return 1.0
        log_total = math.fsum(math.log(level) for level in positive_levels)
        return math.exp(log_total / len(positive_levels))

    # Weighted average over the distribution of difficulty levels.
    if not levels:
        return 1

    occurrences = {}
    for level in levels:
        occurrences[level] = occurrences.get(level, 0) + 1

    total = len(levels)
    percentages = {level: n / total for level, n in occurrences.items()}
    weights = map_percentages_to_levels(percentages)
    return sum(weight * level for level, weight in weights.items())