From 0b6addd0b81098ad879731147c5f63045f8aa863 Mon Sep 17 00:00:00 2001
From: hid49776631 <3063528407@qq.com>
Date: Thu, 29 May 2025 14:58:11 +0800
Subject: [PATCH] =?UTF-8?q?=E4=BF=AE=E6=AD=A3=E5=90=8E=E7=9A=84=E4=BB=A3?=
 =?UTF-8?q?=E7=A0=81?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 VocabularyLevelEstimator.py | 127 +++++++++++++++++++++++++++++++++++
 test_vocabulary.py          | 128 ++++++++++++++++++++++++++++++++++++
 2 files changed, 255 insertions(+)
 create mode 100644 VocabularyLevelEstimator.py
 create mode 100644 test_vocabulary.py

diff --git a/VocabularyLevelEstimator.py b/VocabularyLevelEstimator.py
new file mode 100644
index 0000000..1bb54e5
--- /dev/null
+++ b/VocabularyLevelEstimator.py
@@ -0,0 +1,127 @@
+import pickle
+import math
+import snowballstemmer
+from pathlib import Path
+from typing import Dict, List
+from wordfreqCMD import (
+    remove_punctuation,
+    freq,
+    sort_in_descending_order,
+    sort_in_ascending_order,
+    map_percentages_to_levels,
+)
+
+class VocabularyLevelEstimator:
+    """Estimate text difficulty and user vocabulary level.
+
+    Levels are integers: 1-8 for the Oxford tags A1-D2, 0 for short words that
+    are not in the word list, and 6 as the guess for longer unlisted words.
+    """
+
+    # Oxford word-list tag -> numeric difficulty level.
+    _LEVELS = {"A1": 1, "A2": 2, "B1": 3, "B2": 4, "C1": 5, "C2": 6, "D1": 7, "D2": 8}
+    # Very common words that text_difficulty_level() always scores as level 1.
+    _STOP_WORDS = frozenset(
+        "the and of to what in there when them would will out his mr that up more your it now "
+        "very then could he any some with into you our man other time was than know about only like "
+        "how see is before such little two its as these may much down for well should those after same "
+        "must say first again us great where being come over good himself am never on old here way at "
+        "go upon have had without my day be but though from not too another this even still her yet "
+        "under by let just all because we always off yes so while why which me are or no if "
+        "an also thus who cannot she whether a".split()
+    )
+
+    def __init__(self):
+        self.ENGLISH_WORD_DIFFICULTY_DICT: Dict[str, int] = {}
+        self._stemmer = snowballstemmer.stemmer("english")
+
+    def load_record(self, pickle_fname: str):
+        """Load a record: unpickle ``.p``/``.pkl`` files, otherwise parse text.
+
+        Text lines ``word pos tag`` map each word to its lowest recognised level;
+        lines of any other shape, or with an unknown tag, are skipped.
+        """
+        path = Path(pickle_fname)
+        if path.suffix.lower() in {".p", ".pkl"}:
+            with path.open("rb") as fh:
+                return pickle.load(fh)  # SECURITY: only unpickle trusted files
+        levels: Dict[str, int] = {}
+        with path.open("r", encoding="utf-8") as fh:  # codec name in plain ASCII
+            for line in fh:
+                parts = line.split()
+                lvl = self._LEVELS.get(parts[2]) if len(parts) == 3 else None
+                if lvl:
+                    levels[parts[0]] = min(levels.get(parts[0], lvl), lvl)
+        return levels
+
+    def convert_test_type_to_difficulty_level(self, d):
+        """Store *d* as the instance's word-difficulty dict and return it."""
+        self.ENGLISH_WORD_DIFFICULTY_DICT = d
+        return d
+
+    def get_difficulty_level_for_user(self, d1, d2):
+        """Return a copy of the word -> level dict extended to every word in *d1*.
+
+        The base is the stored dict, else *d2* (stored on first use).  A missing
+        word gets its stem's level, else 0 if at most 3 characters long, else 6.
+        """
+        base = self.ENGLISH_WORD_DIFFICULTY_DICT or self.convert_test_type_to_difficulty_level(d2)
+        diff = dict(base)  # copy, so per-user guesses never leak into the shared dict
+        for word in d1:
+            if word not in diff:
+                fallback = 0 if len(word) <= 3 else 6
+                diff[word] = diff.get(self._stemmer.stemWord(word), fallback)
+        return diff
+
+    def revert_dict(self, d):
+        """Invert word -> timestamps (or a count of them) into timestamp[:10] -> words."""
+        by_key = {}
+        for w, dates in d.items():
+            dates = dates if isinstance(dates, list) else ["202108201900"] * dates
+            for ts in dates:
+                by_key.setdefault(ts[:10], []).append(w)
+        return by_key
+
+    def user_difficulty_level(self, d_user, d, calc_func=0):
+        """Estimate a user's level from *d_user*; words missing from *d* are ignored.
+
+        A truthy *calc_func* gives the geometric mean of the positive levels (1.0
+        if there are none); otherwise the rounded sum of weight * level over the
+        weights from map_percentages_to_levels(), or 0 when no word is known.
+        """
+        inverted = self.revert_dict(d_user)
+        if calc_func:  # geometric path
+            log_sum = count = 0
+            for date in sorted(inverted, reverse=True):
+                tuples = [(w, d[w]) for w in inverted[date] if w in d]
+                for _, lvl in sort_in_ascending_order(tuples):
+                    if lvl > 0:  # math.log() raises ValueError for a level of 0
+                        log_sum += math.log(lvl)
+                        count += 1
+            return math.exp(log_sum / max(count, 1))
+        # weighted-average path
+        known = [d[w] for words in inverted.values() for w in words if w in d]
+        bucket, total = {}, len(known)
+        for lvl in known:
+            bucket[lvl] = bucket.get(lvl, 0) + 1
+        print("count =", bucket, "total =", total)
+        if total == 0:
+            return 0
+        percentages = {k: v / total for k, v in bucket.items()}
+        print("percentages =", percentages)
+        weights = map_percentages_to_levels(percentages)
+        return round(sum(weights[k] * k for k in weights))
+
+    def text_difficulty_level(self, s, d):
+        """Return the difficulty level of text *s* given the word levels *d*.
+
+        Rounded geometric mean of the levels >= 2 among the first ten pairs from
+        sort_in_descending_order(), or 0 if none.  Stop words count as 1, unknown as 0.
+        """
+        words = freq(remove_punctuation(s).lower())
+        pairs = [(w, 1 if w in self._STOP_WORDS else d.get(w, 0)) for w, _ in words]
+        hard = [lvl for _, lvl in sort_in_descending_order(pairs)[:10] if lvl >= 2]
+        product = 1
+        for lvl in hard:
+            product *= lvl
+        return round(product ** (1 / len(hard))) if hard else 0
diff --git a/test_vocabulary.py b/test_vocabulary.py
new file mode 100644
index 0000000..676797c
--- /dev/null
+++ b/test_vocabulary.py
@@ -0,0 +1,128 @@
+# Run this test script on the command line:
+#   pytest test_vocabulary.py
+#
+# Last modified by Mr Lan Hui on 2025-05-08
+
+from difficulty import (
+    user_difficulty_level,
+    text_difficulty_level,
+    get_difficulty_level_for_user,
+    load_record
+)
+
+# Load the word list once for the whole module.  NOTE(review): the path is
+# relative, presumably to the working directory -- run pytest from this folder.
+d_word_test = load_record("../../../updated_source_code/English Pal/app/oxford_words.txt")
+d_difficulty_default = get_difficulty_level_for_user({}, d_word_test)
+# ----------------------------- Article difficulty tests -----------------------------
+
+def test_article_level_empty_content():
+    """An empty article must be rated level 0."""
+    assert text_difficulty_level('', d_difficulty_default) == 0
+
+def test_article_level_punctuation_only():
+    """An article made of punctuation only must be rated level 0."""
+    assert text_difficulty_level(',', d_difficulty_default) == 0
+
+def test_article_level_digit_only():
+    """An article made of a single digit must be rated level 0."""
+    assert text_difficulty_level('1', d_difficulty_default) == 0
+
+def test_article_level_single_word():
+    """The one-word article 'source' must be rated between 2 and 4."""
+    assert 2 <= text_difficulty_level('source', d_difficulty_default) <= 4
+
+def test_article_level_subset_vs_superset():
+    """Adding a word to an article must not lower its level."""
+    subset, superset = (text_difficulty_level(text, d_difficulty_default) for text in ('source', 'open source'))
+    print(f"Word: 'source', Difficulty: {d_difficulty_default.get('source', 'Unknown')}")
+    assert subset <= superset
+
+def test_article_level_multiple_words():
+    """A book title of about a dozen words must be rated between 3 and 5."""
+    content = 'Producing Open Source Software - How to Run a Successful Free Software Project'
+    assert 3 <= text_difficulty_level(content, d_difficulty_default) <= 5
+
+def test_article_level_short_paragraph():
+    """A short conversational paragraph must be rated between 4 and 6."""
+    content = ("At parties, people no longer give me a blank stare when I tell them I work in open source software. "
+               "‘Oh, yes — like Linux?’ they say. I nod eagerly in agreement. ‘Yes, exactly! That’s what I do.’ "
+               "It’s nice not to be completely fringe anymore...")
+    assert 4 <= text_difficulty_level(content, d_difficulty_default) <= 6
+
+def test_article_level_medium_paragraph():
+    """A sentence fragment (truncated for brevity) must be rated between 5 and 7."""
+    content = "In considering the Origin of Species  it is quite conceivable that a naturalist  reflecting on the mutual affinities of organic being "
+    level = text_difficulty_level(content, d_difficulty_default)
+    for token in content.split():
+        print(f"Word: {token} , Difficulty: {d_difficulty_default.get(token.lower(), 'Unknown')}")
+    assert 5 <= level <= 7
+
+def test_article_level_long_paragraph():
+    """A two-sentence excerpt (truncated for brevity) must be rated between 4 and 8."""
+    content = ("These several facts accord well with my theory. I believe in no fixed law of development, causing all the "
+               "inhabitants of a country to change abruptly...")
+    assert 4 <= text_difficulty_level(content, d_difficulty_default) <= 8
+
+# ----------------------------- User vocabulary difficulty tests -----------------------------
+
+def test_user_level_empty_dictionary():
+    """A user with no recorded words must be rated level 0."""
+    d_user = {}
+    d_diff = get_difficulty_level_for_user(d_user, d_word_test)
+    assert user_difficulty_level(d_user, d_diff) == 0
+
+def test_user_level_one_simple_word():
+    """A user who knows only 'simple' must be rated above 0 and at most 4."""
+    d_user = {'simple': ['202408050930']}
+    d_diff = get_difficulty_level_for_user(d_user, d_word_test)
+    assert 0 < user_difficulty_level(d_user, d_diff) <= 4
+
+def test_user_level_invalid_word():
+    """A user whose only word is the non-word 'xyz' must be rated level 0."""
+    d_user = {'xyz': ['202408050930']}
+    d_diff = get_difficulty_level_for_user(d_user, d_word_test)
+    assert user_difficulty_level(d_user, d_diff) == 0
+
+def test_user_level_one_hard_word():
+    """A user who knows only 'pasture' must be rated between 5 and 8."""
+    d_user = {'pasture': ['202408050930']}
+    d_diff = get_difficulty_level_for_user(d_user, d_word_test)
+    assert 5 <= user_difficulty_level(d_user, d_diff) <= 8
+
+def test_user_level_multiple_words():
+    """A user who knows 35 rare words must be rated between 6 and 8."""
+    rare_words = (
+        'sessile', 'putrid', 'prodigal',
+        'presumptuous', 'prehension', 'pied',
+        'pedunculated', 'pasture', 'parturition',
+        'ovigerous', 'orifice', 'aberration',
+        'obliterate', 'niggard', 'neuter',
+        'locomotion', 'lineal', 'glottis',
+        'frivolous', 'frena', 'flotation',
+        'ductus', 'dorsal', 'dearth',
+        'crustacean', 'cornea', 'contrivance',
+        'collateral', 'cirriped', 'canon',
+        'branchiae', 'auditory', 'articulata',
+        'alimentary', 'adduce',
+    )
+    d_user = {word: ['202408050930'] for word in rare_words}
+    assert 6 <= user_difficulty_level(d_user, get_difficulty_level_for_user(d_user, d_word_test)) <= 8
+
+def test_user_level_consider_only_most_recent_words_difficult_words_most_recent():
+    """Hard words with the newest timestamps must yield a level between 5 and 8."""
+    d_user = {
+        'pasture': ['202408050930'], 'putrid': ['202408040000'], 'frivolous': ['202408030000'],
+        'simple': ['202408020000'], 'apple': ['202408010000']
+    }
+    d_diff = get_difficulty_level_for_user(d_user, d_word_test)
+    assert 5 <= user_difficulty_level(d_user, d_diff) <= 8
+
+def test_user_level_consider_only_most_recent_words_easy_words_most_recent():
+    """Easy words with the newest timestamps must yield a level between 3 and 5."""
+    d_user = {
+        'simple': ['202408050930'], 'apple': ['202408040000'], 'happy': ['202408030000'],
+        'pasture': ['202408020000'], 'putrid': ['202408010000'], 'dearth': ['202407310000']
+    }
+    d_diff = get_difficulty_level_for_user(d_user, d_word_test)
+    assert 3 <= user_difficulty_level(d_user, d_diff) <= 5