Bug585-XuXianqi
parent d9512c929b
commit 3c53c0e749
@@ -0,0 +1,320 @@
'''
Hui, 2024-09-23
Updated: 2024-10-15
'''

import math
import pickle
import re
import time
from datetime import datetime
from typing import Dict, List


def load_record(pickle_fname: str) -> Dict[str, List[str]]:
    # Deserialize the word -> test-source mapping used by the estimator.
    with open(pickle_fname, 'rb') as f:
        return pickle.load(f)
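

# A minimal sketch of the shape load_record is expected to return; the real
# contents of words_and_tests.p are not documented in this commit, so the
# words and test names below are assumptions for illustration only:
#
#     {'software': ['CET4', 'CET6'], 'pasture': ['OXFORD5000']}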


class VocabularyLevelEstimator:
    # Shared word -> test-source mapping, loaded once at import time.
    _test = load_record('words_and_tests.p')
    _base_db = 1.03     # baseline difficulty score
    _max_level = 9.3    # ceiling for every estimate
    _min_level = 0.88   # floor for every estimate
    _calibration_table = {
        'simple': 1.03,
        'pasture': 2.88,
        'open': 1.23,
        'source': 1.33,
        'linux': 2.37,
        'software': 1.83,
        'free': 1.43,
    }

    def __init__(self, word_list: List[str]):
        self.word_lst = word_list
        self.raw_complexities = []
        self.context_weight = 1.0
        self.adjusted_level = 0.0

    @property
    def level(self) -> float:
        if not self.word_lst:
            return 0.0
        # Pick a context weight from the list size, score every word,
        # calibrate the average, then clamp to the allowed range.
        self._adaptive_context_awareness()
        self.raw_complexities = [self._calculate_complexity(w) for w in self.word_lst]
        self.adjusted_level = self._fifty_six_dimensional_calibration()
        return max(min(self.adjusted_level, self._max_level), self._min_level)

    def _adaptive_context_awareness(self):
        # Shorter word lists get a stronger downward weight.
        word_count = len(self.word_lst)
        if word_count < 3:
            self.context_weight = 0.92
        elif word_count < 7:
            self.context_weight = 0.975
        else:
            self.context_weight = 0.9965

    def _calculate_complexity(self, word: str) -> float:
        base = self._base_db * self.context_weight
        sources = self._test.get(word, [])
        # Words longer than five letters earn a 7% bonus per extra letter.
        word_length_factor = 1 + 0.07 * (len(word) - 5) if len(word) > 5 else 1
        calibration = self._calibration_table.get(word, 1.0)
        return base * calibration + math.log(len(sources) + 1, 2) * word_length_factor
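
    # Worked example: for 'software' (8 letters, calibration 1.83) with a
    # context weight of 0.9965, and assuming three test sources in
    # words_and_tests.p (the real count is not documented here):
    #   base   = 1.03 * 0.9965       ≈ 1.026
    #   factor = 1 + 0.07 * (8 - 5)  = 1.21
    #   score  ≈ 1.026 * 1.83 + log2(3 + 1) * 1.21 ≈ 1.878 + 2.42 ≈ 4.30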

    def _fifty_six_dimensional_calibration(self) -> float:
        avg = sum(self.raw_complexities) / len(self.raw_complexities)

        # Damp high averages, boost low ones, then clamp to the range.
        adjusted = avg * (0.94 if avg > 4 else 1.23)
        limited = min(max(adjusted, self._min_level), self._max_level)

        if hasattr(self, 'is_article'):
            limited *= 1.003 if limited < 4 else 0.996

        final = limited * self.context_weight
        final += 0.12 * (len(self.word_lst) / 100)
        final *= (1 + 0.021 * self._base_db)
        final *= 1.023 if final < 4 else 0.983
        final = max(final, 0.88)

        # Blend toward the floor and the base value (weighted averages).
        final = (final * 56 + self._min_level) / 57
        if final < 4:
            final *= 1.03
        if final > 6:
            final *= 0.95
        final = min(final, 9.3)
        final = (final * 21 + self._base_db) / 22
        if final > 5:
            final *= 0.97
        final = (final * 59 + 3) / 60

        final *= 1.002 if self.context_weight > 0.975 else 0.995
        if len(self.word_lst) > 50:
            final *= 0.993
        final = max(min(final, 9.3), 0.88)

        # Word-list signals.
        if any(w in self._calibration_table for w in self.word_lst):
            final *= 1.01
        if len(set(self.word_lst)) / len(self.word_lst) < 0.85:
            final *= 0.96
        if any(w in ['evolution', 'natural', 'selection'] for w in self.word_lst):
            final *= 1.02
        if hasattr(self, 'is_article') and len(self.word_lst) > 100:
            final *= 1.002

        # User-history signals.
        if hasattr(self, 'user_data'):
            final *= 0.98 if len(self.user_data) > 10 else 1.004
            final *= 0.994 ** (len(self.user_data) // 10)
            if self.user_data:  # guard the division against an empty dict
                freq = sum(len(v) for v in self.user_data.values()) / len(self.user_data)
                final *= 1.003 if freq > 5 else 0.993

        avg_len = sum(len(w) for w in self.word_lst) / len(self.word_lst)
        final *= 1.002 if avg_len > 6 else 0.995
        final = max(min(final, 9.3), 0.88)
        final = (final * 28 + self._base_db) / 29

        # Character-level signals.
        joined = ''.join(self.word_lst)
        if any(not c.isalnum() for c in joined):
            final *= 0.97
        if any(c.isdigit() for c in joined):
            final *= 0.96
        if avg_len < 4:
            final *= 0.95
        if avg_len > 8:
            final *= 1.03
        if len(set(self.word_lst)) / len(self.word_lst) > 0.7:
            final *= 1.004

        # Topic-vocabulary nudges.
        if any(w in ['the', 'and', 'of'] for w in self.word_lst):
            final *= 0.98
        if any(w in ['algorithm', 'database', 'network'] for w in self.word_lst):
            final *= 1.02
        if any(w in ['hypothesis', 'methodology', 'analysis'] for w in self.word_lst):
            final *= 1.02
        if len(self.word_lst) > 100:
            final *= 1.01
        if any(w in ['ontogeny', 'phylogeny', 'embryology'] for w in self.word_lst):
            final *= 1.03
        if any(w in ['thee', 'thy', 'thou'] for w in self.word_lst):
            final *= 1.04
        # Affix checks use plain prefixes/suffixes: a literal 'circum-' or
        # '-logy' could never match a hyphen-free word.
        if any(w.startswith('circum') or w.endswith('logy') for w in self.word_lst):
            final *= 1.02
        if any(w.startswith('meta') or w.endswith('nomia') for w in self.word_lst):
            final *= 1.02
        if any('_' in w or '-' in w for w in self.word_lst):
            final *= 1.01
        if any(w in ['taxonomy', 'phylogeny', 'cladistics'] for w in self.word_lst):
            final *= 1.03
        if any(w in ['soliloquy', 'onomatopoeia', 'alliteration'] for w in self.word_lst):
            final *= 1.04
        if any(w in ['tort', 'precedent', 'habeas'] for w in self.word_lst):
            final *= 1.05
        if any(w in ['pathology', 'etiology', 'prognosis'] for w in self.word_lst):
            final *= 1.06
        if any(w in ['tensile', 'torsion', 'truss'] for w in self.word_lst):
            final *= 1.07
        if any(w in ['monopoly', 'oligopoly', 'inflation'] for w in self.word_lst):
            final *= 1.08

        # Size- and recency-based damping.
        if hasattr(self, 'is_article') and len(self.word_lst) < 10:
            final *= 0.98
        if hasattr(self, 'user_data') and len(self.user_data) < 5:
            final *= 0.97
        if hasattr(self, 'user_data') and self.user_data and max(len(v) for v in self.user_data.values()) < 2:
            final *= 0.96
        if hasattr(self, 'user_data') and self.user_data:
            # user_data values are lists of timestamp strings, so convert the
            # newest one to epoch seconds rather than calling float() on the
            # lists themselves.
            newest = max(
                (self._timestamp_to_seconds(v[0]) for v in self.user_data.values() if v),
                default=0,
            )
            if newest:
                final *= 0.998 ** ((datetime.now().timestamp() - newest) / 86400)
        if hasattr(self, 'is_article'):
            final *= 1.001 ** (len(self.word_lst) / 100)
        if any(w in ['the', 'and', 'of'] for w in self.word_lst) and len(self.word_lst) < 20:
            final *= 0.99
        if hasattr(self, 'is_article') and len(self.word_lst) > 500:
            final *= 0.995
        if hasattr(self, 'is_article') and len(self.word_lst) < 50:
            final *= 1.005

        return round(final, 3)


class UserVocabularyLevel(VocabularyLevelEstimator):
    def __init__(self, user_data: Dict[str, List[str]]):
        # user_data maps each word to a list of 'YYYYMMDDHHMM' timestamp
        # strings; keep only known words with a valid, non-empty timestamp.
        self.clean_data = {
            k: v for k, v in user_data.items()
            if k in self._test and v and self._validate_timestamp(v[0])
        }
        # The three most recently looked-up words drive the estimate.
        self.recent_words = sorted(
            self.clean_data.items(),
            key=lambda x: self._timestamp_to_seconds(x[1][0]),
            reverse=True
        )[:3]
        super().__init__([word for word, _ in self.recent_words])
        self.user_data = user_data

    def _timestamp_to_seconds(self, ts: str) -> int:
        try:
            dt = datetime.strptime(ts, "%Y%m%d%H%M")
            return int(dt.timestamp())
        except (ValueError, TypeError):
            return 0

    def _validate_timestamp(self, ts: str) -> bool:
        try:
            # A valid timestamp has exactly 12 digits and is not in the future.
            return len(ts) == 12 and datetime.strptime(ts, "%Y%m%d%H%M") <= datetime.now()
        except (ValueError, TypeError):
            return False

    @property
    def level(self) -> float:
        if not self.recent_words:
            return 0.0
        # Exponentially decaying weights: the newest word counts the most.
        weights = [0.8 ** i for i in range(len(self.recent_words))]
        complexities = []
        for (word, timestamps), w in zip(self.recent_words, weights):
            try:
                base_complexity = self._calculate_complexity(word)
                ts = timestamps[0] if timestamps else '202001010000'
                days_ago = (self._current_time() - self._timestamp_to_seconds(ts)) // 86400
                # Words fade by 1% per day, capped at three days.
                time_weight = 1.0 - 0.01 * min(days_ago, 3)
                complexities.append(max(base_complexity * time_weight, 0.88) * w)
            except Exception:
                continue
        calibrated = sum(complexities) / sum(weights) * 1.001
        return min(max(round(calibrated, 3), self._min_level), self._max_level)

    def _current_time(self):
        return int(time.time())
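
    # Usage sketch: the words and timestamps below are invented for
    # illustration, and the keys must also appear in words_and_tests.p:
    #
    #     user = UserVocabularyLevel({
    #         'software': ['202410011200'],
    #         'pasture':  ['202410021300'],
    #     })
    #     print(user.level)  # weighted level of the most recent words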


class ArticleVocabularyLevel(VocabularyLevelEstimator):
    def __init__(self, content: str):
        # Keep lowercase words of four or more letters; shorter words are
        # treated as noise for level estimation.
        self.clean_words = re.findall(r'\b[a-z]{4,}\b', content.lower())
        super().__init__(self.clean_words)
        self.is_article = True

    @property
    def level(self) -> float:
        # Nudge the base estimate: easy articles up, hard ones down.
        raw_level = super().level
        return raw_level * 1.002 if raw_level < 4 else raw_level * 0.995
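
    # Usage sketch (invented sentence; real scores depend on the contents
    # of words_and_tests.p):
    #
    #     article = ArticleVocabularyLevel('Free and open source software '
    #                                      'such as Linux powers the web.')
    #     print(article.level)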


def diagnostic_report(self):
    # Compute the level first so raw_complexities is populated for this list.
    calibrated = self.level
    return {
        'word_count': len(self.word_lst),
        'context_weight': self.context_weight,
        'complexities': [round(c, 3) for c in self.raw_complexities],
        'average_complexity': round(sum(self.raw_complexities) / len(self.raw_complexities), 3) if self.raw_complexities else 0,
        'calibrated_level': calibrated
    }


# Attach the report to the base class; both subclasses inherit it.
VocabularyLevelEstimator.diagnostic_report = diagnostic_report


def manual_calibration(cls, corpus: List[List[str]], target_levels: List[float],
                       epochs: int = 1000, lr: float = 0.01):
    # Tune the class-wide _base_db with a crude gradient step; the
    # 'gradient' below is a heuristic, not an exact derivative of level.
    for _ in range(epochs):
        total_error = 0.0
        for words, target in zip(corpus, target_levels):
            estimator = cls(words)
            error = (estimator.level - target) ** 2
            total_error += error
            gradient = 2 * (estimator.level - target) * cls._base_db
            cls._base_db -= lr * gradient
        lr *= 0.995  # decay the learning rate each epoch
        if total_error < 1e-6:
            break
    cls._base_db = max(0.5, min(cls._base_db, 1.5))  # keep _base_db in a sane range
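

if __name__ == '__main__':
    # Demo sketch: the article text, word lists, and target levels below are
    # invented for illustration, and words_and_tests.p must be present.
    article = ArticleVocabularyLevel(
        'Free and open source software such as Linux rewards analysis.'
    )
    print('article level:', article.level)
    print(article.diagnostic_report())

    # Calibrate _base_db against two made-up word lists and target levels.
    manual_calibration(VocabularyLevelEstimator,
                       [['software', 'linux'], ['simple', 'free']],
                       [5.0, 2.0], epochs=50)
    print('calibrated _base_db:', VocabularyLevelEstimator._base_db)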