EnglishPal/VocabularyLevelEstimator.py

128 lines
5.2 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters!

This file contains ambiguous Unicode characters that may be confused with others in your current locale. If your use case is intentional and legitimate, you can safely ignore this warning. Use the Escape button to highlight these characters.

import pickle
import math
import snowballstemmer
from pathlib import Path
from typing import Dict, List
from wordfreqCMD import (
remove_punctuation,
freq,
sort_in_descending_order,
sort_in_ascending_order,
map_percentages_to_levels,
)
class VocabularyLevelEstimator:
    """Estimate text difficulty and user vocabulary level.

    All *public* method names and signatures stay **identical** to the
    original version so existing imports / calls continue to work. Internals
    are refactored to match the newer logic (Oxford A1-D2 word-list loading,
    stem fallback, stopword handling, etc.).
    """

    # Level tags accepted in plain-text word lists, mapped to numeric levels.
    _LEVEL_MAP = {
        "A1": 1, "A2": 2, "B1": 3, "B2": 4, "C1": 5, "C2": 6, "D1": 7, "D2": 8,
    }

    # File suffixes (compared case-insensitively) that are read with pickle.
    _PICKLE_SUFFIXES = frozenset({".p", ".pkl", ".pickle"})

    # Timestamp substituted for each occurrence when a user record stores a
    # plain count instead of a list of timestamps.
    _DEFAULT_TIMESTAMP = "202108201900"

    # Levels given to words that are not in the dictionary and whose stem is
    # not in the dictionary either: short words (<= 3 chars) vs. longer ones.
    _UNKNOWN_SHORT_WORD_LEVEL = 0
    _UNKNOWN_LONG_WORD_LEVEL = 6

    # Words that text_difficulty_level always scores as level 1, regardless
    # of the difficulty dictionary. Built once instead of on every call.
    _STOP_WORDS = frozenset({
        'the', 'and', 'of', 'to', 'what', 'in', 'there', 'when', 'them', 'would',
        'will', 'out', 'his', 'mr', 'that', 'up', 'more', 'your', 'it', 'now',
        'very', 'then', 'could', 'he', 'any', 'some', 'with', 'into', 'you', 'our',
        'man', 'other', 'time', 'was', 'than', 'know', 'about', 'only', 'like',
        'how', 'see', 'is', 'before', 'such', 'little', 'two', 'its', 'as', 'these',
        'may', 'much', 'down', 'for', 'well', 'should', 'those', 'after', 'same',
        'must', 'say', 'first', 'again', 'us', 'great', 'where', 'being', 'come',
        'over', 'good', 'himself', 'am', 'never', 'on', 'old', 'here', 'way', 'at',
        'go', 'upon', 'have', 'had', 'without', 'my', 'day', 'be', 'but', 'though',
        'from', 'not', 'too', 'another', 'this', 'even', 'still', 'her', 'yet',
        'under', 'by', 'let', 'just', 'all', 'because', 'we', 'always', 'off',
        'yes', 'so', 'while', 'why', 'which', 'me', 'are', 'or', 'no', 'if',
        'an', 'also', 'thus', 'who', 'cannot', 'she', 'whether', 'a',
    })

    def __init__(self):
        """Create an estimator with an empty difficulty cache and a stemmer."""
        # Word -> difficulty level cache; filled by
        # convert_test_type_to_difficulty_level().
        self.ENGLISH_WORD_DIFFICULTY_DICT: Dict[str, int] = {}
        self._stemmer = snowballstemmer.stemmer("english")

    def load_record(self, pickle_fname: str):
        """Load a record from the file *pickle_fname*.

        Files whose suffix is ``.p``, ``.pkl`` or ``.pickle`` (any letter
        case) are unpickled and returned unchanged. Every other file is read
        as UTF-8 text (an optional byte-order mark is ignored) where each
        useful line consists of exactly three whitespace-separated fields,
        ``word pos tag``. Lines with a different number of fields, or with a
        tag outside ``A1`` .. ``D2``, are skipped.

        Returns:
            The unpickled object, or a ``{word: level}`` dict in which a word
            listed several times keeps its lowest level.

        Raises:
            OSError: if the file cannot be opened.
        """
        path = Path(pickle_fname)
        if path.suffix.lower() in self._PICKLE_SUFFIXES:
            # SECURITY: pickle.load can execute arbitrary code. Only load
            # files that come from a trusted source.
            with path.open("rb") as fh:
                return pickle.load(fh)

        levels: Dict[str, int] = {}
        # "utf-8-sig" reads plain UTF-8 as well, but also drops a leading BOM
        # that would otherwise become part of the first word.
        with path.open("r", encoding="utf-8-sig") as fh:
            for line in fh:
                parts = line.split()
                if len(parts) != 3:
                    continue
                word, _pos, tag = parts
                lvl = self._LEVEL_MAP.get(tag)
                if lvl is None:
                    continue
                levels[word] = min(levels.get(word, lvl), lvl)
        return levels

    def convert_test_type_to_difficulty_level(self, d):
        """Store *d* as this instance's difficulty dictionary and return it.

        The dictionary is kept by reference, not copied.
        """
        self.ENGLISH_WORD_DIFFICULTY_DICT = d
        return d

    def get_difficulty_level_for_user(self, d1, d2):
        """Return a difficulty dictionary that covers every word in *d1*.

        The starting point is the cached dictionary when it is non-empty;
        otherwise *d2* is cached and used. Each word of *d1* that is missing
        gets the level of its stem when the stem is known, else level 0 for
        words of at most three characters and level 6 for longer words.

        Args:
            d1: Iterable of the user's words (iterating a dict yields keys).
            d2: Fallback ``{word: level}`` dictionary, used only when nothing
                has been cached yet.

        Returns:
            A new dict. Neither the cached dictionary nor *d2* is modified, so
            guessed levels for unknown words do not leak into later calls.
        """
        base = (
            self.ENGLISH_WORD_DIFFICULTY_DICT
            or self.convert_test_type_to_difficulty_level(d2)
        )
        # Copy so that the guesses added below stay local to this result.
        diff = dict(base or {})
        for word in d1:
            if word in diff:
                continue
            stem = self._stemmer.stemWord(word)
            if stem in diff:
                diff[word] = diff[stem]
            elif len(word) <= 3:
                diff[word] = self._UNKNOWN_SHORT_WORD_LEVEL
            else:
                diff[word] = self._UNKNOWN_LONG_WORD_LEVEL
        return diff

    def revert_dict(self, d):
        """Invert ``{word: dates}`` into ``{date_key: [words]}``.

        *dates* is either a list of timestamp strings or, when it is not a
        list, a count; a count ``n`` is expanded to ``n`` copies of the
        default timestamp ``"202108201900"``. The date key is the first ten
        characters of a timestamp. A word appears once per timestamp, so it
        can occur several times under the same key.
        """
        by_date = {}
        for word, dates in d.items():
            if not isinstance(dates, list):
                dates = [self._DEFAULT_TIMESTAMP] * dates
            for ts in dates:
                by_date.setdefault(ts[:10], []).append(word)
        return by_date

    def user_difficulty_level(self, d_user, d, calc_func=0):
        """Estimate a user's vocabulary level.

        Args:
            d_user: ``{word: dates}`` record of the user, in the format
                accepted by revert_dict().
            d: ``{word: level}`` difficulty dictionary. Words of the user that
                are not in *d* are ignored.
            calc_func: Truthy selects the geometric mean of the levels; falsy
                (the default) selects the weighted-average path.

        Returns:
            Geometric path: a float, ``1.0`` when no usable level was found.
            Weighted path: ``0`` when none of the user's words is in *d*,
            otherwise the rounded sum of ``weight * level`` over the mapping
            returned by ``map_percentages_to_levels``.
        """
        inverted = self.revert_dict(d_user)

        # Geometric path: mean of log-levels, exponentiated.
        if calc_func:
            log_sum = 0.0
            count = 0
            for date in sorted(inverted, reverse=True):
                tuples = [(w, d[w]) for w in inverted[date] if w in d]
                for _, lvl in sort_in_ascending_order(tuples):
                    if lvl <= 0:
                        # math.log() raises ValueError for non-positive
                        # values, and level 0 is what unknown short words are
                        # given, so such entries are left out of the mean.
                        continue
                    log_sum += math.log(lvl)
                    count += 1
            return math.exp(log_sum / max(count, 1))

        # Weighted-average path: count how many known words fall on each level.
        per_level = {}
        total = 0
        for words in inverted.values():
            for w in words:
                if w in d:
                    lvl = d[w]
                    per_level[lvl] = per_level.get(lvl, 0) + 1
                    total += 1
        print("count =", per_level, "total =", total)
        if total == 0:
            return 0
        percentages = {k: v / total for k, v in per_level.items()}
        print("percentages =", percentages)
        weights = map_percentages_to_levels(percentages)
        return round(sum(weights[k] * k for k in weights))

    def text_difficulty_level(self, s, d):
        """Estimate the difficulty level of the text *s*.

        The text is passed through ``remove_punctuation`` and lower-cased.
        Every word reported by ``freq`` is scored 1 when it is a stop word,
        otherwise ``d.get(word, 0)``. The first ten entries returned by
        ``sort_in_descending_order`` are kept (presumably the hardest words;
        the ordering is defined by that helper).

        Returns:
            The rounded geometric mean of those kept levels that are >= 2, or
            ``0`` when none of them reaches level 2.
        """
        s = remove_punctuation(s).lower()
        pairs = [
            (word, 1 if word in self._STOP_WORDS else d.get(word, 0))
            for word, _ in freq(s)
        ]
        hardest = sort_in_descending_order(pairs)[:10]

        product = 1
        counted = 0
        for _, lvl in hardest:
            if lvl >= 2:
                product *= lvl
                counted += 1
        if counted == 0:
            return 0
        return round(product ** (1 / counted))