fix BUG543

2024-07-03 14:36:01 +08:00 · 2024-07-03 14:36:01 +08:00 · db66c8ed86
parent cb576b40ed
commit db66c8ed86
2 changed files with 73 additions and 19 deletions
--- a/app/difficulty.py
+++ b/app/difficulty.py
@ -7,7 +7,7 @@

 import pickle
 import math
-from wordfreqCMD import remove_punctuation, freq, sort_in_descending_order, sort_in_ascending_order
+from wordfreqCMD import remove_punctuation, freq, sort_in_descending_order, sort_in_ascending_order, map_percentages_to_levels
 import snowballstemmer


@ -94,30 +94,58 @@ def revert_dict(d):
    return d2


-def user_difficulty_level(d_user, d):
+def user_difficulty_level(d_user, d, calc_func=0):
+    '''
+    Compute the user's difficulty level from the words of d_user that also appear in d (word -> difficulty), in one of two ways.
+    Set calc_func != 0 to use the geometric mean (exp of the mean log difficulty); otherwise use a weighted average.
+    '''
+    if calc_func != 0:
+        #  calculation function 1: geometric mean (NOTE(review): math.log raises ValueError if a difficulty is <= 0)
+        d_user2 = revert_dict(d_user)  # key is date, and value is a list of words added in that date
+        geometric = 0
+        count = 0
+        for date in sorted(d_user2.keys(),
+                           reverse=True):  # newest dates first (NOTE(review): every matched word is averaged, so this order no longer limits which words count)
+            lst = d_user2[date]  # a list of words
+            lst2 = []  # a list of tuples, (word, difficulty level)
+            for word in lst:
+                if word in d:
+                    lst2.append((word, d[word]))

+            lst3 = sort_in_ascending_order(lst2)  # easiest tuple first
+            # print(lst3)
+            for t in lst3:
+                word = t[0]
+                hard = t[1]
+                # print('WORD %s HARD %4.2f' % (word, hard))
+                geometric = geometric + math.log(hard)
+                count += 1
+        return math.exp(geometric / max(count, 1))

+    #  calculation function 2: weighted average
    d_user2 = revert_dict(d_user)  # key is date, and value is a list of words added in that date
-    count = 0
-    geometric = 1
-    for date in sorted(d_user2.keys(),
-                       reverse=True):  # most recently added words are more important while determining user's level
+    count = {}  # count[level] = number of matched words having that difficulty level
+    percentages = {}  # percentages[level] = share of matched words having that difficulty level
+    total = 0  # total number of matched words (words of d_user found in d)
+    for date in d_user2.keys():
        lst = d_user2[date]  # a list of words
-        lst2 = []  # a list of tuples, (word, difficulty level)
        for word in lst:
            if word in d:
-                lst2.append((word, d[word]))
+                if d[word] not in count:
+                    count[d[word]] = 0
+                count[d[word]] += 1
+                total += 1

-        lst3 = sort_in_ascending_order(lst2)  # easiest tuple first
-        # print(lst3)
-        for t in lst3:
-            word = t[0]
-            hard = t[1]
-            # print('WORD %s HARD %4.2f' % (word, hard))
-            geometric = geometric * (hard)
-            count += 1
-            if count >= 10:
-                return geometric ** (1 / count)
+    if total == 0:
+        return 1  # no word of d_user is in d: fall back to level 1
+    for k in count.keys():
+        percentages[k] = count[k] / total
+    weight = map_percentages_to_levels(percentages)
+    sum = 0  # weighted total of the levels (NOTE(review): shadows the builtin sum)
+    for k in weight.keys():
+        sum += weight[k] * k
+    return sum

-    return geometric ** (1 / max(count, 1))


 def text_difficulty_level(s, d):
--- a/app/wordfreqCMD.py
+++ b/app/wordfreqCMD.py
@ -10,6 +10,32 @@ import operator
 import os, sys # 引入模块sys，因为我要用里面的sys.argv列表中的信息来读取命令行参数。
 import pickle_idea

+
+def map_percentages_to_levels(percentages):
+    '''
+    Purpose: compute the weights used to score a vocabulary notebook by weighted-average difficulty; each weight is (10 - difficulty) * (share of words having that difficulty), and the weights are then normalized.
+    Input: dict of difficulty shares; keys are difficulty levels 3~8, values are the share of words at each level.
+    Output: dict of weights; keys are difficulty levels 3~8, values are the weight of the words at each level.
+    '''
+    # Keys in ascending order
+    sorted_keys = sorted(percentages.keys())
+
+    # Compute each weight and the total of all weights
+    sum = 0  # running total of the weights (NOTE(review): shadows the builtin sum)
+    levels_proportions = {}
+    for k in sorted_keys:
+        levels_proportions[k] = 10 - k
+    for k in sorted_keys:
+        levels_proportions[k] *= percentages[k]
+        sum += levels_proportions[k]
+
+    # Normalize so the weights add up to 1 (NOTE(review): raises ZeroDivisionError if the total is 0)
+    for k in sorted_keys:
+        levels_proportions[k] /= sum
+
+    return levels_proportions
+
+
 def freq(fruit):
    '''
    功能： 把字符串转成列表。 目的是得到每个单词的频率。