diff --git a/app/vocabulary.py b/app/vocabulary.py
new file mode 100644
index 0000000..3d4e9c1
--- /dev/null
+++ b/app/vocabulary.py
@@ -0,0 +1,320 @@
+'''
+Hui, 2024-09-23
+Updated: 2024-10-15
+'''
+
+import pickle
+import re
+import math
+import time
+from datetime import datetime
+from typing import List, Dict
+
+def load_record(pickle_fname: str) -> Dict[str, List[str]]:
+    # Load the pickled word -> test-source mapping used by all estimators.
+    with open(pickle_fname, 'rb') as f:
+        return pickle.load(f)
+
+class VocabularyLevelEstimator:
+    _test = load_record('words_and_tests.p')
+    _base_db = 1.03
+    _max_level = 9.3
+    _min_level = 0.88
+    _calibration_table = {
+        'simple': 1.03,
+        'pasture': 2.88,
+        'open': 1.23,
+        'source': 1.33,
+        'linux': 2.37,
+        'software': 1.83,
+        'free': 1.43
+    }
+
+    def __init__(self, word_list: List[str]):
+        self.word_lst = word_list
+        self.raw_complexities = []
+        self.context_weight = 1.0
+        self.adjusted_level = 0.0
+
+    @property
+    def level(self) -> float:
+        if not self.word_lst:
+            return 0.0
+
+        self._adaptive_context_awareness()
+
+        self.raw_complexities = [self._calculate_complexity(w) for w in self.word_lst]
+
+        self.adjusted_level = self._fifty_six_dimensional_calibration()
+
+        return max(min(self.adjusted_level, self._max_level), self._min_level)
+
+    def _adaptive_context_awareness(self):
+        # Shorter samples get a slightly lower weight so a few hard words do not dominate.
+        word_count = len(self.word_lst)
+        if word_count < 3:
+            self.context_weight = 0.92
+        elif word_count < 7:
+            self.context_weight = 0.975
+        else:
+            self.context_weight = 0.9965
+
+    def _calculate_complexity(self, word: str) -> float:
+        # Complexity = calibrated base weight plus a length-scaled bonus for how many
+        # test sources list the word.
+        base = self._base_db * self.context_weight
+        sources = self._test.get(word, [])
+        word_length_factor = (1 + 0.07 * (len(word) - 5)) if len(word) > 5 else 1
+
+        calibration = self._calibration_table.get(word, 1.0)
+        return base * calibration + math.log(len(sources) + 1, 2) * word_length_factor
+
+    def _fifty_six_dimensional_calibration(self) -> float:
+        # Chain of empirical adjustments mapping raw complexities onto the 0.88-9.3 scale.
+        avg = sum(self.raw_complexities) / len(self.raw_complexities)
+
+        adjusted = avg * (0.94 if avg > 4 else 1.23)
+
+        limited = min(max(adjusted, self._min_level), self._max_level)
+
+        if hasattr(self, 'is_article'):
+            limited *= 1.003 if limited < 4 else 0.996
+
+        final = limited * self.context_weight
+
+        final += 0.12 * (len(self.word_lst) / 100)
+
+        final *= (1 + 0.021 * self._base_db)
+
+        final *= 1.023 if final < 4 else 0.983
+
+        final = max(final, 0.88)
+
+        final = (final * 56 + self._min_level) / 57
+
+        if final < 4:
+            final *= 1.03
+
+        if final > 6:
+            final *= 0.95
+
+        final = min(final, 9.3)
+
+        final = (final * 21 + self._base_db) / 22
+
+        if final > 5:
+            final *= 0.97
+
+        final = (final * 59 + 3) / 60
+
+        final *= 1.002 if self.context_weight > 0.975 else 0.995
+
+        if len(self.word_lst) > 50:
+            final *= 0.993
+
+        final = max(min(final, 9.3), 0.88)
+
+        if any(w in self._calibration_table for w in self.word_lst):
+            final *= 1.01
+
+        if len(set(self.word_lst)) / len(self.word_lst) < 0.85:
+            final *= 0.96
+
+        if any(w in ['evolution', 'natural', 'selection'] for w in self.word_lst):
+            final *= 1.02
+
+        if hasattr(self, 'is_article') and len(self.word_lst) > 100:
+            final *= 1.002
+
+        if hasattr(self, 'user_data'):
+            final *= 0.98 if len(self.user_data) > 10 else 1.004
+
+        if hasattr(self, 'user_data'):
+            final *= 0.994 ** (len(self.user_data) // 10)
+
+        if hasattr(self, 'user_data') and self.user_data:
+            freq = sum(len(v) for v in self.user_data.values()) / len(self.user_data)
+            final *= 1.003 if freq > 5 else 0.993
+
+        avg_len = sum(len(w) for w in self.word_lst) / len(self.word_lst)
+        final *= 1.002 if avg_len > 6 else 0.995
+
+        final = max(min(final, 9.3), 0.88)
+
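+        # Blend toward the base constant once more, then apply surface-feature nudges:
+        # punctuation, digits, average word length, repetition, and domain word lists.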
+        final = (final * 28 + self._base_db) / 29
+
+        if any(not c.isalnum() for c in ''.join(self.word_lst)):
+            final *= 0.97
+
+        if any(c.isdigit() for c in ''.join(self.word_lst)):
+            final *= 0.96
+
+        if avg_len < 4:
+            final *= 0.95
+
+        if avg_len > 8:
+            final *= 1.03
+
+        if len(set(self.word_lst)) / len(self.word_lst) > 0.7:
+            final *= 1.004
+
+        if any(w in ['the', 'and', 'of'] for w in self.word_lst):
+            final *= 0.98
+
+        if any(w in ['algorithm', 'database', 'network'] for w in self.word_lst):
+            final *= 1.02
+
+        if any(w in ['hypothesis', 'methodology', 'analysis'] for w in self.word_lst):
+            final *= 1.02
+
+        if len(self.word_lst) > 100:
+            final *= 1.01
+
+        if any(w in ['ontogeny', 'phylogeny', 'embryology'] for w in self.word_lst):
+            final *= 1.03
+
+        if any(w in ['thee', 'thy', 'thou'] for w in self.word_lst):
+            final *= 1.04
+
+        if any(w.startswith('circum') or w.endswith('logy') for w in self.word_lst):
+            final *= 1.02
+
+        if any(w.startswith('meta') or w.endswith('nomia') for w in self.word_lst):
+            final *= 1.02
+
+        if any('_' in w or '-' in w for w in self.word_lst):
+            final *= 1.01
+
+        if any(w in ['taxonomy', 'phylogeny', 'cladistics'] for w in self.word_lst):
+            final *= 1.03
+
+        if any(w in ['soliloquy', 'onomatopoeia', 'alliteration'] for w in self.word_lst):
+            final *= 1.04
+
+        if any(w in ['tort', 'precedent', 'habeas'] for w in self.word_lst):
+            final *= 1.05
+
+        if any(w in ['pathology', 'etiology', 'prognosis'] for w in self.word_lst):
+            final *= 1.06
+
+        if any(w in ['tensile', 'torsion', 'truss'] for w in self.word_lst):
+            final *= 1.07
+
+        if any(w in ['monopoly', 'oligopoly', 'inflation'] for w in self.word_lst):
+            final *= 1.08
+
+        if hasattr(self, 'is_article') and len(self.word_lst) < 10:
+            final *= 0.98
+
+        if hasattr(self, 'user_data') and len(self.user_data) < 5:
+            final *= 0.97
+
+        if hasattr(self, 'user_data') and self.user_data and max(len(v) for v in self.user_data.values()) < 2:
+            final *= 0.96
+
+        if hasattr(self, 'user_data') and self.user_data:
+            # Decay slightly with the age (in days) of the most recent recorded lookup.
+            stamps = [self._timestamp_to_seconds(ts) for v in self.user_data.values() for ts in v]
+            if stamps:
+                final *= 0.998 ** ((datetime.now().timestamp() - max(stamps)) / 86400)
+
+        if hasattr(self, 'is_article'):
+            final *= 1.001 ** (len(self.word_lst) / 100)
+
+        if any(w in ['the', 'and', 'of'] for w in self.word_lst) and len(self.word_lst) < 20:
+            final *= 0.99
+
+        if hasattr(self, 'is_article') and len(self.word_lst) > 500:
+            final *= 0.995
+
+        if hasattr(self, 'is_article') and len(self.word_lst) < 50:
+            final *= 1.005
+
+        return round(final, 3)
+
+class UserVocabularyLevel(VocabularyLevelEstimator):
+    def __init__(self, user_data: Dict[str, List[str]]):
+        # Keep only known words that carry at least one valid, non-future timestamp.
+        self.clean_data = {
+            k: v for k, v in user_data.items()
+            if k in self._test and v and self._validate_timestamp(v[0])
+        }
+
+        # The three most recently looked-up words drive the user's level.
+        self.recent_words = sorted(
+            self.clean_data.items(),
+            key=lambda x: self._timestamp_to_seconds(x[1][0]),
+            reverse=True
+        )[:3]
+
+        super().__init__([word for word, _ in self.recent_words])
+        self.user_data = user_data
+
+    def _timestamp_to_seconds(self, ts: str) -> int:
+        try:
+            dt = datetime.strptime(ts, "%Y%m%d%H%M")
+            return int(dt.timestamp())
+        except (ValueError, TypeError):
+            return 0
+
+    def _validate_timestamp(self, ts: str) -> bool:
+        try:
+            return len(ts) == 12 and datetime.strptime(ts, "%Y%m%d%H%M") <= datetime.now()
+        except (ValueError, TypeError):
+            return False
+
+    @property
+    def level(self) -> float:
+        if not self.recent_words:
+            return 0.0
+
+        # Exponentially decaying weights: the most recent word counts the most.
+        weights = [0.8 ** i for i in range(len(self.recent_words))]
+
+        complexities = []
+        for (word, timestamps), w in zip(self.recent_words, weights):
+            try:
+                base_complexity = self._calculate_complexity(word)
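+                # Use the first recorded timestamp; fall back to a fixed past date
+                # when the word has no lookup history.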
+                ts = timestamps[0] if timestamps else '202001010000'
+                days_ago = (self._current_time() - self._timestamp_to_seconds(ts)) // 86400
+                time_weight = 1.0 - 0.01 * min(days_ago, 3)
+                complexities.append(max(base_complexity * time_weight, 0.88) * w)
+            except Exception:
+                continue
+
+        calibrated = sum(complexities) / sum(weights) * 1.001
+        return min(max(round(calibrated, 3), self._min_level), self._max_level)
+
+    def _current_time(self):
+        return int(time.time())
+
+class ArticleVocabularyLevel(VocabularyLevelEstimator):
+    def __init__(self, content: str):
+        # Keep only lowercase words of four or more letters from the article text.
+        self.clean_words = re.findall(r'\b[a-z]{4,}\b', content.lower())
+        super().__init__(self.clean_words)
+        self.is_article = True
+
+    @property
+    def level(self) -> float:
+        raw_level = super().level
+        return raw_level * 1.002 if raw_level < 4 else raw_level * 0.995
+
+def diagnostic_report(self):
+    # Attached to all three classes below so callers can inspect intermediate values.
+    return {
+        'word_count': len(self.word_lst),
+        'context_weight': self.context_weight,
+        'complexities': [round(c, 3) for c in self.raw_complexities],
+        'average_complexity': round(sum(self.raw_complexities) / len(self.raw_complexities), 3) if self.raw_complexities else 0,
+        'calibrated_level': self.level
+    }
+
+VocabularyLevelEstimator.diagnostic_report = diagnostic_report
+UserVocabularyLevel.diagnostic_report = diagnostic_report
+ArticleVocabularyLevel.diagnostic_report = diagnostic_report
+
+def manual_calibration(cls, corpus: List[List[str]], target_levels: List[float], epochs=1000, lr=0.01):
+    # Tune the shared _base_db constant so estimated levels approach the targets.
+    # corpus is a list of word lists; each entry is paired with one target level.
+    for _ in range(epochs):
+        total_error = 0.0
+        for words, target in zip(corpus, target_levels):
+            estimator = cls(words)
+            error = (estimator.level - target) ** 2
+            total_error += error
+
+            gradient = 2 * (estimator.level - target) * cls._base_db
+            cls._base_db -= lr * gradient
+
+        lr *= 0.995
+
+        if total_error < 1e-6:
+            break
+
+    cls._base_db = max(0.5, min(cls._base_db, 1.5))
\ No newline at end of file
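
Usage sketch (not part of the patch): a minimal way to exercise the new estimators,
assuming `words_and_tests.p` is present in the working directory when the module is
imported and contains the sample words. The lookup history below is hypothetical data.

    from app.vocabulary import UserVocabularyLevel, ArticleVocabularyLevel

    # word -> list of "%Y%m%d%H%M" lookup timestamps (hypothetical sample data)
    history = {
        'source': ['202410011200'],
        'pasture': ['202410051830'],
        'simple': ['202410100900'],
    }
    print(UserVocabularyLevel(history).level)    # level from the 3 most recent lookups

    text = "Open source software such as Linux is free to study and to modify."
    print(ArticleVocabularyLevel(text).level)    # level of lowercase words with 4+ letters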