forked from mrlan/EnglishPal
				
			fix BUG543
							parent
							
								
									cb576b40ed
								
							
						
					
					
						commit
						db66c8ed86
					
				|  | @ -7,7 +7,7 @@ | ||||||
| 
 | 
 | ||||||
| import pickle | import pickle | ||||||
| import math | import math | ||||||
| from wordfreqCMD import remove_punctuation, freq, sort_in_descending_order, sort_in_ascending_order | from wordfreqCMD import remove_punctuation, freq, sort_in_descending_order, sort_in_ascending_order, map_percentages_to_levels | ||||||
| import snowballstemmer | import snowballstemmer | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
|  | @ -94,10 +94,16 @@ def revert_dict(d): | ||||||
|     return d2 |     return d2 | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def user_difficulty_level(d_user, d): | def user_difficulty_level(d_user, d, calc_func=0): | ||||||
|  |     ''' | ||||||
|  |     two ways to calculate difficulty_level | ||||||
|  |     set calc_func!=0 to use sqrt, otherwise use weighted average | ||||||
|  |     ''' | ||||||
|  |     if calc_func != 0: | ||||||
|  |         #  calculation function 1: sqrt | ||||||
|         d_user2 = revert_dict(d_user)  # key is date, and value is a list of words added in that date |         d_user2 = revert_dict(d_user)  # key is date, and value is a list of words added in that date | ||||||
|  |         geometric = 0 | ||||||
|         count = 0 |         count = 0 | ||||||
|     geometric = 1 |  | ||||||
|         for date in sorted(d_user2.keys(), |         for date in sorted(d_user2.keys(), | ||||||
|                            reverse=True):  # most recently added words are more important while determining user's level |                            reverse=True):  # most recently added words are more important while determining user's level | ||||||
|             lst = d_user2[date]  # a list of words |             lst = d_user2[date]  # a list of words | ||||||
|  | @ -112,12 +118,34 @@ def user_difficulty_level(d_user, d): | ||||||
|                 word = t[0] |                 word = t[0] | ||||||
|                 hard = t[1] |                 hard = t[1] | ||||||
|                 # print('WORD %s HARD %4.2f' % (word, hard)) |                 # print('WORD %s HARD %4.2f' % (word, hard)) | ||||||
|             geometric = geometric * (hard) |                 geometric = geometric + math.log(hard) | ||||||
|                 count += 1 |                 count += 1 | ||||||
|             if count >= 10: |         return math.exp(geometric / max(count, 1)) | ||||||
|                 return geometric ** (1 / count) | 
 | ||||||
|  |     #  calculation function 2: weighted average | ||||||
|  |     d_user2 = revert_dict(d_user)  # key is date, and value is a list of words added in that date | ||||||
|  |     count = {}  # number of all kinds of words | ||||||
|  |     percentages = {}  # percentages of all kinds of difficulties | ||||||
|  |     total = 0  # total words | ||||||
|  |     for date in d_user2.keys(): | ||||||
|  |         lst = d_user2[date]  # a list of words | ||||||
|  |         for word in lst: | ||||||
|  |             if word in d: | ||||||
|  |                 if d[word] not in count: | ||||||
|  |                     count[d[word]] = 0 | ||||||
|  |                 count[d[word]] += 1 | ||||||
|  |                 total += 1 | ||||||
|  | 
 | ||||||
|  |     if total == 0: | ||||||
|  |         return 1 | ||||||
|  |     for k in count.keys(): | ||||||
|  |         percentages[k] = count[k] / total | ||||||
|  |     weight = map_percentages_to_levels(percentages) | ||||||
|  |     sum = 0 | ||||||
|  |     for k in weight.keys(): | ||||||
|  |         sum += weight[k] * k | ||||||
|  |     return sum | ||||||
| 
 | 
 | ||||||
|     return geometric ** (1 / max(count, 1)) |  | ||||||
| 
 | 
 | ||||||
| 
 | 
 | ||||||
| def text_difficulty_level(s, d): | def text_difficulty_level(s, d): | ||||||
|  |  | ||||||
|  | @ -10,6 +10,32 @@ import operator | ||||||
| import os, sys # 引入模块sys,因为我要用里面的sys.argv列表中的信息来读取命令行参数。 | import os, sys # 引入模块sys,因为我要用里面的sys.argv列表中的信息来读取命令行参数。 | ||||||
| import pickle_idea | import pickle_idea | ||||||
| 
 | 
 | ||||||
|  | 
 | ||||||
def map_percentages_to_levels(percentages):
    '''
    Turn per-difficulty word proportions into normalized weights.

    Each difficulty level k gets the raw weight (10 - k) * percentages[k];
    the raw weights are then divided by their total so that they sum to 1.

    Input:  dict whose keys are numeric difficulty levels and whose values
            are the proportion of words at that level.
    Output: dict with the same keys (inserted in ascending key order) whose
            values are the normalized weights. An empty input gives {}.

    When the raw weights add up to zero (for example every word sits at
    level 10, so 10 - k == 0), dividing by the total would raise
    ZeroDivisionError. In that case the proportions themselves are
    normalized and used as weights; if they also add up to zero, every
    weight is 0.0.
    '''
    ordered_levels = sorted(percentages)

    # Raw weight per level, accumulated in ascending key order.
    raw_weights = {}
    weight_total = 0
    for level in ordered_levels:
        raw_weights[level] = (10 - level) * percentages[level]
        weight_total += raw_weights[level]

    if weight_total != 0:
        return {level: raw_weights[level] / weight_total
                for level in ordered_levels}

    # Degenerate case: the (10 - level) factor cancelled every weight.
    # Fall back to the plain proportions so the caller still gets a usable
    # distribution instead of a crash.
    share_total = 0
    for level in ordered_levels:
        share_total += percentages[level]
    if share_total != 0:
        return {level: percentages[level] / share_total
                for level in ordered_levels}

    # Nothing to distribute at all.
    return {level: 0.0 for level in ordered_levels}
|  | 
 | ||||||
|  | 
 | ||||||
| def freq(fruit): | def freq(fruit): | ||||||
|     ''' |     ''' | ||||||
|     功能: 把字符串转成列表。 目的是得到每个单词的频率。 |     功能: 把字符串转成列表。 目的是得到每个单词的频率。 | ||||||
|  |  | ||||||
		Loading…
	
		Reference in New Issue