# EnglishPal/VocabularyLevelEstimator.py

import pickle
import math
import snowballstemmer
from pathlib import Path
from typing import Dict, List
from wordfreqCMD import (
    remove_punctuation,
    freq,
    sort_in_descending_order,
    sort_in_ascending_order,
    map_percentages_to_levels,
)

class VocabularyLevelEstimator:
    """Estimate text difficulty and a user's vocabulary level.

    Difficulty levels are small integers.  Plain-text word lists map the
    tags ``A1`` .. ``D2`` to the levels 1 .. 8; words that cannot be resolved
    get a fallback level (see ``get_difficulty_level_for_user``).

    All public method names and signatures are kept stable so existing
    imports and calls continue to work.
    """

    # Tag -> numeric level used when parsing a plain-text word list.
    _LEVEL_BY_TAG = {
        "A1": 1, "A2": 2, "B1": 3, "B2": 4, "C1": 5, "C2": 6, "D1": 7, "D2": 8,
    }

    # File suffixes (lower-case) that are treated as pickle files.
    _PICKLE_SUFFIXES = frozenset({".p", ".pkl", ".pickle"})

    # Timestamp substituted when a user record stores a count, not a list.
    _DEFAULT_TIMESTAMP = "202108201900"

    # Fallback levels for words that are neither known nor stem to a known word.
    _SHORT_WORD_MAX_LEN = 3
    _UNKNOWN_SHORT_WORD_LEVEL = 0
    _UNKNOWN_LONG_WORD_LEVEL = 6

    # Very common words; ``text_difficulty_level`` always rates them level 1.
    _STOP_WORDS = frozenset({
        'the', 'and', 'of', 'to', 'what', 'in', 'there', 'when', 'them', 'would',
        'will', 'out', 'his', 'mr', 'that', 'up', 'more', 'your', 'it', 'now',
        'very', 'then', 'could', 'he', 'any', 'some', 'with', 'into', 'you', 'our',
        'man', 'other', 'time', 'was', 'than', 'know', 'about', 'only', 'like',
        'how', 'see', 'is', 'before', 'such', 'little', 'two', 'its', 'as', 'these',
        'may', 'much', 'down', 'for', 'well', 'should', 'those', 'after', 'same',
        'must', 'say', 'first', 'again', 'us', 'great', 'where', 'being', 'come',
        'over', 'good', 'himself', 'am', 'never', 'on', 'old', 'here', 'way', 'at',
        'go', 'upon', 'have', 'had', 'without', 'my', 'day', 'be', 'but', 'though',
        'from', 'not', 'too', 'another', 'this', 'even', 'still', 'her', 'yet',
        'under', 'by', 'let', 'just', 'all', 'because', 'we', 'always', 'off',
        'yes', 'so', 'while', 'why', 'which', 'me', 'are', 'or', 'no', 'if',
        'an', 'also', 'thus', 'who', 'cannot', 'she', 'whether', 'a',
    })

    def __init__(self):
        """Create an estimator with an empty difficulty cache and a stemmer."""
        # Cache of word -> difficulty level; filled by
        # ``convert_test_type_to_difficulty_level``.
        self.ENGLISH_WORD_DIFFICULTY_DICT: Dict[str, int] = {}
        self._stemmer = snowballstemmer.stemmer("english")

    def load_record(self, pickle_fname: str):
        """Load a record from the file *pickle_fname*.

        Files whose suffix is ``.p``, ``.pkl`` or ``.pickle`` (matched
        case-insensitively) are unpickled and returned unchanged.  Any other
        file is read as UTF-8 text with one ``word pos tag`` triple per line.
        Lines that do not have exactly three whitespace-separated fields, or
        whose tag is not one of ``A1`` .. ``D2``, are skipped.  When a word
        occurs more than once, its lowest level is kept.

        Returns:
            The unpickled object, or a dict mapping each word to its level.
        """
        path = Path(pickle_fname)
        if path.suffix.lower() in self._PICKLE_SUFFIXES:
            # SECURITY: pickle.load can execute arbitrary code embedded in the
            # file -- only use this on files from a trusted source.
            with path.open("rb") as fh:
                return pickle.load(fh)

        levels: Dict[str, int] = {}
        with path.open("r", encoding="utf-8") as fh:
            for line in fh:
                fields = line.split()
                if len(fields) != 3:
                    continue
                word, _pos, tag = fields
                level = self._LEVEL_BY_TAG.get(tag)
                if level is None:
                    continue
                levels[word] = min(levels.get(word, level), level)
        return levels

    def convert_test_type_to_difficulty_level(self, d):
        """Store *d* as the cached word -> level mapping and return it."""
        self.ENGLISH_WORD_DIFFICULTY_DICT = d
        return d

    def get_difficulty_level_for_user(self, d1, d2):
        """Return a word -> level mapping that covers every word in *d1*.

        The cached mapping is used when it is non-empty; otherwise *d2* is
        cached and used.  The chosen mapping is extended **in place**: a word
        missing from it takes the level of its stem when the stem is known,
        otherwise level 0 for words of at most three characters and level 6
        for longer ones.

        Args:
            d1: iterable of words (e.g. a dict keyed by word).
            d2: word -> level mapping used when nothing is cached yet.

        Returns:
            The (mutated) word -> level mapping.
        """
        levels = (
            self.ENGLISH_WORD_DIFFICULTY_DICT
            or self.convert_test_type_to_difficulty_level(d2)
        )
        for word in d1:
            if word in levels:
                continue
            stem = self._stemmer.stemWord(word)
            if stem in levels:
                levels[word] = levels[stem]
            elif len(word) <= self._SHORT_WORD_MAX_LEN:
                levels[word] = self._UNKNOWN_SHORT_WORD_LEVEL
            else:
                levels[word] = self._UNKNOWN_LONG_WORD_LEVEL
        return levels

    def revert_dict(self, d):
        """Invert a word -> timestamps mapping into timestamp-prefix -> words.

        Each value of *d* is either a list of timestamp strings or a count;
        a count ``n`` is treated as ``n`` copies of a fixed default timestamp.
        The key of the result is the first 10 characters of the timestamp
        (presumably ``YYYYMMDDHH`` -- confirm against the stored format).  A
        word appears once per timestamp that falls under a key.
        """
        words_by_date: Dict[str, List[str]] = {}
        for word, stamps in d.items():
            if not isinstance(stamps, list):
                stamps = [self._DEFAULT_TIMESTAMP] * stamps
            for stamp in stamps:
                words_by_date.setdefault(stamp[:10], []).append(word)
        return words_by_date

    def user_difficulty_level(self, d_user, d, calc_func=0):
        """Estimate the user's vocabulary level from their word record.

        Args:
            d_user: word -> timestamps (or count) mapping, see ``revert_dict``.
            d: word -> difficulty level mapping; words absent from it are
                ignored.
            calc_func: truthy selects the geometric mean of the levels,
                falsy (default) selects the weighted average.

        Returns:
            Geometric path: a float, ``1.0`` when no word contributes.
            Weighted path: a rounded number, ``0`` when no word is in *d*.
        """
        words_by_date = self.revert_dict(d_user)

        if calc_func:
            log_sum = 0.0
            count = 0
            for date in sorted(words_by_date, reverse=True):
                scored = [(w, d[w]) for w in words_by_date[date] if w in d]
                for _, level in sort_in_ascending_order(scored):
                    # Level 0 marks "unknown short word"; log(0) is undefined
                    # and would raise ValueError, so such words are skipped.
                    if level <= 0:
                        continue
                    log_sum += math.log(level)
                    count += 1
            return math.exp(log_sum / max(count, 1))

        # Weighted-average path: count how many recorded words fall on each level.
        bucket, total = {}, 0
        for words in words_by_date.values():
            for word in words:
                if word in d:
                    level = d[word]
                    bucket[level] = bucket.get(level, 0) + 1
                    total += 1
        print("count =", bucket, "total =", total)
        if total == 0:
            return 0
        percentages = {level: n / total for level, n in bucket.items()}
        print("percentages =", percentages)
        # The helper's result is consumed as a level -> weight mapping.
        weights = map_percentages_to_levels(percentages)
        return round(sum(weights[level] * level for level in weights))

    def text_difficulty_level(self, s, d):
        """Estimate the difficulty of the text *s*.

        The text is stripped of punctuation and lower-cased, then every word
        reported by ``freq`` is rated: stop words get level 1, other words
        their level in *d* (0 when absent).  Of the first ten entries returned
        by ``sort_in_descending_order`` (presumably the hardest words), those
        with level >= 2 are combined into a rounded geometric mean.

        Returns:
            The rounded geometric mean, or ``0`` when no considered word has
            level >= 2.
        """
        cleaned = remove_punctuation(s).lower()
        rated = [
            (word, 1 if word in self._STOP_WORDS else d.get(word, 0))
            for word, _ in freq(cleaned)
        ]
        hardest = sort_in_descending_order(rated)[:10]

        hard_levels = [level for _, level in hardest if level >= 2]
        if not hard_levels:
            return 0
        product = 1
        for level in hard_levels:
            product *= level
        return round(product ** (1 / len(hard_levels)))