forked from mrlan/EnglishPal
				
			fix BUG543
							parent
							
								
									cb576b40ed
								
							
						
					
					
						commit
						db66c8ed86
					
				|  | @ -7,7 +7,7 @@ | |||
| 
 | ||||
| import pickle | ||||
| import math | ||||
| from wordfreqCMD import remove_punctuation, freq, sort_in_descending_order, sort_in_ascending_order | ||||
| from wordfreqCMD import remove_punctuation, freq, sort_in_descending_order, sort_in_ascending_order, map_percentages_to_levels | ||||
| import snowballstemmer | ||||
| 
 | ||||
| 
 | ||||
|  | @ -94,30 +94,58 @@ def revert_dict(d): | |||
|     return d2 | ||||
| 
 | ||||
| 
 | ||||
| def user_difficulty_level(d_user, d): | ||||
| def user_difficulty_level(d_user, d, calc_func=0): | ||||
|     ''' | ||||
|     two ways to calculate difficulty_level | ||||
|     set calc_func!=0 to use sqrt, otherwise use weighted average | ||||
|     ''' | ||||
|     if calc_func != 0: | ||||
|         #  calculation function 1: sqrt | ||||
|         d_user2 = revert_dict(d_user)  # key is date, and value is a list of words added in that date | ||||
|         geometric = 0 | ||||
|         count = 0 | ||||
|         for date in sorted(d_user2.keys(), | ||||
|                            reverse=True):  # most recently added words are more important while determining user's level | ||||
|             lst = d_user2[date]  # a list of words | ||||
|             lst2 = []  # a list of tuples, (word, difficulty level) | ||||
|             for word in lst: | ||||
|                 if word in d: | ||||
|                     lst2.append((word, d[word])) | ||||
| 
 | ||||
|             lst3 = sort_in_ascending_order(lst2)  # easiest tuple first | ||||
|             # print(lst3) | ||||
|             for t in lst3: | ||||
|                 word = t[0] | ||||
|                 hard = t[1] | ||||
|                 # print('WORD %s HARD %4.2f' % (word, hard)) | ||||
|                 geometric = geometric + math.log(hard) | ||||
|                 count += 1 | ||||
|         return math.exp(geometric / max(count, 1)) | ||||
| 
 | ||||
|     #  calculation function 2: weighted average | ||||
|     d_user2 = revert_dict(d_user)  # key is date, and value is a list of words added in that date | ||||
|     count = 0 | ||||
|     geometric = 1 | ||||
|     for date in sorted(d_user2.keys(), | ||||
|                        reverse=True):  # most recently added words are more important while determining user's level | ||||
|     count = {}  # number of all kinds of words | ||||
|     percentages = {}  # percentages of all kinds of difficulties | ||||
|     total = 0  # total words | ||||
|     for date in d_user2.keys(): | ||||
|         lst = d_user2[date]  # a list of words | ||||
|         lst2 = []  # a list of tuples, (word, difficulty level) | ||||
|         for word in lst: | ||||
|             if word in d: | ||||
|                 lst2.append((word, d[word])) | ||||
|                 if d[word] not in count: | ||||
|                     count[d[word]] = 0 | ||||
|                 count[d[word]] += 1 | ||||
|                 total += 1 | ||||
| 
 | ||||
|         lst3 = sort_in_ascending_order(lst2)  # easiest tuple first | ||||
|         # print(lst3) | ||||
|         for t in lst3: | ||||
|             word = t[0] | ||||
|             hard = t[1] | ||||
|             # print('WORD %s HARD %4.2f' % (word, hard)) | ||||
|             geometric = geometric * (hard) | ||||
|             count += 1 | ||||
|             if count >= 10: | ||||
|                 return geometric ** (1 / count) | ||||
|     if total == 0: | ||||
|         return 1 | ||||
|     for k in count.keys(): | ||||
|         percentages[k] = count[k] / total | ||||
|     weight = map_percentages_to_levels(percentages) | ||||
|     sum = 0 | ||||
|     for k in weight.keys(): | ||||
|         sum += weight[k] * k | ||||
|     return sum | ||||
| 
 | ||||
|     return geometric ** (1 / max(count, 1)) | ||||
| 
 | ||||
| 
 | ||||
| def text_difficulty_level(s, d): | ||||
|  |  | |||
|  | @ -10,6 +10,32 @@ import operator | |||
| import os, sys # 引入模块sys,因为我要用里面的sys.argv列表中的信息来读取命令行参数。 | ||||
| import pickle_idea | ||||
| 
 | ||||
| 
 | ||||
def map_percentages_to_levels(percentages):
    '''
    Purpose: turn the share of words at each difficulty level into
        normalized weights, used to score a vocabulary list by weighted
        average difficulty.  The raw weight of level k is
        (10 - k) * percentages[k]; the raw weights are then divided by
        their total so that they add up to 1.
    Input: dict of shares; keys are difficulty levels (expected 3~8),
        values are the share of words at that level.
    Output: dict of weights with the same keys, in ascending key order.
        An empty input gives an empty dict.  If the raw weights total 0
        (for example every share is 0, or the only level present is 10),
        normalization is impossible, so the shares are returned unchanged
        instead of raising ZeroDivisionError.
    '''
    levels_proportions = {}
    total = 0  # called `total` so the builtin sum() is not shadowed
    for k in sorted(percentages):
        levels_proportions[k] = (10 - k) * percentages[k]
        total += levels_proportions[k]

    # Degenerate case: nothing to normalize by, fall back to the raw shares.
    if total == 0:
        return {k: percentages[k] for k in levels_proportions}

    # Normalize so that the weights sum to 1.
    for k in levels_proportions:
        levels_proportions[k] /= total

    return levels_proportions
| 
 | ||||
| 
 | ||||
| def freq(fruit): | ||||
|     ''' | ||||
|     功能: 把字符串转成列表。 目的是得到每个单词的频率。 | ||||
|  |  | |||
		Loading…
	
		Reference in New Issue