From 7a2d14901d26efdaacec21c8d3b8028c0e4b8b6f Mon Sep 17 00:00:00 2001
From: Hui Lan <lanhui@zjnu.edu.cn>
Date: Sat, 2 Nov 2019 13:16:21 +0800
Subject: app/difficulty.py: make the computed difficulty level more reasonable

If a word belongs to CET4, then it has level 1.
If a word belongs to CET6, then it has level 2.
If a word belongs to neither CET4 nor CET6, then its level is decided
using word frequency.
---
 app/difficulty.py | 15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/app/difficulty.py b/app/difficulty.py
index 58740f6..1c8a08f 100644
--- a/app/difficulty.py
+++ b/app/difficulty.py
@@ -20,29 +20,32 @@ def load_record(pickle_fname):
 
 def difficulty_level_from_frequency(word, d):
     level = 0
+    if not word in d:
+        return level
+    
     if 'what' in d:
-        ratio = (d['what']+1)/(d[word]+1)
+        ratio = (d['what']+1)/(d[word]+1) # what is a frequent word
         level = math.log( max(ratio, 1), 10)
 
-    level = min(level+1, 4) 
+    level = min(level, 4) 
     return level
 
 
 def get_difficulty_level(d1, d2):
     d = {}
-    L = list(d1.keys()) # in d1, we have freuqence for each word
+    L = list(d1.keys())  # in d1, we have the frequency of each word
     L2 = list(d2.keys()) # in d2, we have test types (e.g., CET4,CET6,BBC) for each word
     L.extend(L2)
-    L3 = list(set(L))
+    L3 = list(set(L)) # L3 contains all words
     for k in L3:
         if k in d2:
             if 'CET4' in d2[k]:
-                d[k] = 1
+                d[k] = 1 # CET4 word has level 1
             elif 'CET6' in d2[k]:
                 d[k] = 2
             elif 'BBC' in d2[k]:
                 d[k] = 4
-                if k in d1: # BBC could contain easy words not in CET4 or CET6.  So 4 is not reasonable.  Recompute difficulty level.
+                if k in d1: # BBC could contain easy words that are not in CET4 or CET6.  So 4 is not reasonable.  Recompute difficulty level.
                     d[k] = min(difficulty_level_from_frequency(k, d1), d[k])
         elif k in d1:
             d[k] = difficulty_level_from_frequency(k, d1)
-- 
cgit v1.2.1