fix BUG543

2024-07-03 14:36:01 +08:00 · 2024-07-03 14:36:01 +08:00 · db66c8ed86
parent cb576b40ed
commit db66c8ed86
2 changed files with 73 additions and 19 deletions
--- a/app/difficulty.py
+++ b/app/difficulty.py
@ -7,7 +7,7 @@
 import pickle
 import math
-from wordfreqCMD import remove_punctuation, freq, sort_in_descending_order, sort_in_ascending_order
+from wordfreqCMD import remove_punctuation, freq, sort_in_descending_order, sort_in_ascending_order, map_percentages_to_levels
 import snowballstemmer
@ -94,30 +94,58 @@ def revert_dict(d):
    return d2
-def user_difficulty_level(d_user, d):
+def user_difficulty_level(d_user, d, calc_func=0):
    '''
    Estimate a user's difficulty level from the words the user has saved.

    d_user is regrouped with revert_dict() so that words can be walked date by
    date; d maps a word to its difficulty level.  Words missing from d are
    ignored by both calculations.

    calc_func != 0: return the geometric mean of the difficulty levels of the
        user's words, computed as exp(mean(log(level))).  The result is 1.0 when
        no word is found in d.  math.log raises ValueError for a level <= 0.
    calc_func == 0 (default): count the words at each difficulty level, turn the
        counts into proportions, ask map_percentages_to_levels() for a weight per
        level and return the sum of weight * level.  Returns 1 when no word is
        found in d.
    '''
    if calc_func != 0:
        #  calculation function 1: geometric mean of the levels, accumulated as a sum of logs
        d_user2 = revert_dict(d_user)  # key is date, and value is a list of words added in that date
        geometric = 0
        count = 0
        for date in sorted(d_user2.keys(),
                           reverse=True):  # newest dates first; NOTE(review): every word is summed below, so this order does not select or favour any word
            lst = d_user2[date]  # a list of words
            lst2 = []  # a list of tuples, (word, difficulty level)
            for word in lst:
                if word in d:
                    lst2.append((word, d[word]))
            lst3 = sort_in_ascending_order(lst2)  # easiest tuple first
            # print(lst3)
            for t in lst3:
                word = t[0]
                hard = t[1]
                # print('WORD %s HARD %4.2f' % (word, hard))
                geometric = geometric + math.log(hard)
                count += 1
        return math.exp(geometric / max(count, 1))
    #  calculation function 2: weighted average
    d_user2 = revert_dict(d_user)  # key is date, and value is a list of words added in that date
-    count = 0
+    count = {}  # maps each difficulty level to the number of the user's words at that level
-    geometric = 1
+    percentages = {}  # maps each difficulty level to its share of the counted words
-    for date in sorted(d_user2.keys(),
+    total = 0  # number of the user's words that were found in d
-                       reverse=True):  # most recently added words are more important while determining user's level
+    for date in d_user2.keys():
        lst = d_user2[date]  # a list of words
        lst2 = []  # a list of tuples, (word, difficulty level)
        for word in lst:
            if word in d:
-                lst2.append((word, d[word]))
+                if d[word] not in count:
                    count[d[word]] = 0
                count[d[word]] += 1
                total += 1
-        lst3 = sort_in_ascending_order(lst2)  # easiest tuple first
+    if total == 0:
-        # print(lst3)
+        return 1
-        for t in lst3:
+    for k in count.keys():
-            word = t[0]
+        percentages[k] = count[k] / total
-            hard = t[1]
+    weight = map_percentages_to_levels(percentages)
-            # print('WORD %s HARD %4.2f' % (word, hard))
+    sum = 0
-            geometric = geometric * (hard)
+    for k in weight.keys():
-            count += 1
+        sum += weight[k] * k
-            if count >= 10:
+    return sum
                return geometric ** (1 / count)
    return geometric ** (1 / max(count, 1))
 def text_difficulty_level(s, d):
--- a/app/wordfreqCMD.py
+++ b/app/wordfreqCMD.py
@ -10,6 +10,32 @@ import operator
 import os, sys # 引入模块sys，因为我要用里面的sys.argv列表中的信息来读取命令行参数。
 import pickle_idea
 def map_percentages_to_levels(percentages):
    '''
    Turn the share of words at each difficulty level into normalized weights.

    Each level k starts from the raw weight (10 - k) * percentages[k], so a
    lower level gets a larger factor; the raw weights are then divided by
    their total so that the returned weights add up to 1.

    Input: dict whose keys are difficulty levels (3 to 8 according to the
        original note) and whose values are the proportion of words at each
        level.
    Output: new dict with the same keys, inserted in ascending key order, whose
        values are the normalized weights.  An empty input gives an empty
        dict.  When the raw weights total 0 (for example every word is at
        level 10) they cannot be normalized, so the input proportions are
        returned as the weights instead of raising ZeroDivisionError.
    '''
    # Keys in ascending order, so the result is built deterministically.
    sorted_keys = sorted(percentages.keys())
    raw_weights = {k: (10 - k) * percentages[k] for k in sorted_keys}
    # Accumulated in a plain loop (and not named `sum`) so the built-in is not
    # shadowed and the floating-point total is added up in the same order as before.
    weight_total = 0
    for k in sorted_keys:
        weight_total += raw_weights[k]
    if weight_total == 0:
        # Nothing to normalize by: fall back to the plain proportions.
        return {k: percentages[k] for k in sorted_keys}
    # Normalize so that the weights sum to 1.
    return {k: raw_weights[k] / weight_total for k in sorted_keys}
 def freq(fruit):
    '''
    功能： 把字符串转成列表。 目的是得到每个单词的频率。