EnglishPal/vocabulary.py

196 lines
7.4 KiB
Python

from difficulty import VocabularyLevelEstimator
import pickle
import os
from collections import Counter
import string
# Helper functions
def is_punctuation_or_digit(s: str) -> bool:
    """Return True when *s* holds nothing but punctuation, digits and whitespace.

    Punctuation means the ASCII characters in ``string.punctuation``.
    An empty string also yields True, because there is no character that
    fails the check.

    Args:
        s: The text to inspect.

    Returns:
        False as soon as one character is neither punctuation, a digit nor
        whitespace; True otherwise.
    """
    return all(
        ch in string.punctuation or ch.isdigit() or ch.isspace()
        for ch in s
    )
def is_valid_word(word: str) -> bool:
    """Return True when *word* is non-empty and made up of letters only.

    This is a thin wrapper around ``str.isalpha``, so tokens containing
    digits, apostrophes, hyphens or whitespace are rejected, and so is the
    empty string.

    Args:
        word: The token to check.

    Returns:
        The result of ``word.isalpha()``.
    """
    return word.isalpha()
class UserVocabularyLevel(VocabularyLevelEstimator):
    """Estimates a user's vocabulary level based on their word history.

    Only the most recently seen words feed the estimate. The value is
    computed on first access of ``level`` and cached afterwards, so later
    changes to ``word_history`` are not reflected.
    """

    # Word-level data file handed to the base class when none is given.
    DEFAULT_WORD_DATA_PATH = 'db/oxford_words.txt'
    # Number of most recently seen distinct words the estimate is based on.
    RECENT_WORD_COUNT = 3
    # Floor applied to the average once any recent word has a non-zero level.
    MIN_ESTIMATED_LEVEL = 4

    def __init__(self, word_history, word_data_path=None):
        """
        Initialize with user's word history.

        Args:
            word_history (dict): Maps each word to its timestamps, i.e.
                ``{word: [timestamp1, timestamp2, ...]}``.
            word_data_path (str): Optional path to Oxford word level data;
                ``DEFAULT_WORD_DATA_PATH`` is used when omitted.
        """
        if word_data_path is None:
            word_data_path = self.DEFAULT_WORD_DATA_PATH
        super().__init__(word_data_path)
        self.word_history = word_history
        self._level = None  # Cache for computed level

    @property
    def level(self):
        """Calculate user's vocabulary level based on their word history.

        Returns:
            0 when there is no usable history or every recent word has
            level 0; otherwise the average level of the most recent words,
            raised to ``MIN_ESTIMATED_LEVEL`` if it falls below it.
        """
        if self._level is None:
            self._level = self._estimate_from_history()
        return self._level

    def _estimate_from_history(self):
        """Compute the (uncached) level from the current word history."""
        if not self.word_history:
            return 0

        recent_words = self._most_recent_words()
        if not recent_words:
            return 0

        # get_word_level is not defined in this class; presumably it is
        # inherited from VocabularyLevelEstimator.
        levels = [self.get_word_level(word) for word in recent_words]
        # A level of 0 marks a word without a usable level; if that is all
        # we have, there is nothing to average.
        if all(level == 0 for level in levels):
            return 0

        average = sum(levels) / len(levels)
        # Users whose recent words are all easy still start at the floor.
        if average >= self.MIN_ESTIMATED_LEVEL:
            return average
        return self.MIN_ESTIMATED_LEVEL

    def _most_recent_words(self):
        """Return up to RECENT_WORD_COUNT valid words, most recently seen first."""
        latest_seen = []
        for word, times in self.word_history.items():
            if not is_valid_word(word):
                continue
            # Only a word's newest timestamp decides its rank, so there is
            # no need to sort every single timestamp of every word.
            newest = max(times, default=None)
            if newest is not None:
                latest_seen.append((newest, word))
        # Ties on the timestamp fall back to the word itself (descending).
        latest_seen.sort(reverse=True)
        return [word for _, word in latest_seen[:self.RECENT_WORD_COUNT]]

    def get_level_distribution(self):
        """Returns distribution of word levels in user's vocabulary.

        Returns:
            collections.Counter: Maps each level to the number of valid
            words in the history that have it; empty when there is no
            history.
        """
        if not self.word_history:
            # Same type as the non-empty case; an empty Counter equals {}.
            return Counter()
        return Counter(
            self.get_word_level(word)
            for word in self.word_history
            if is_valid_word(word)
        )
class ArticleVocabularyLevel(VocabularyLevelEstimator):
    """Estimates vocabulary level of an article.

    The level is computed on first access of ``level`` and cached
    afterwards, so later changes to ``content`` are not reflected there.
    """

    # Word-level data file handed to the base class when none is given.
    DEFAULT_WORD_DATA_PATH = 'db/oxford_words.txt'
    # Texts with at most this many levelled words are scored by their
    # hardest word instead of an average.
    SHORT_TEXT_MAX_WORDS = 3
    # Number of hardest words averaged for longer texts.
    HARDEST_WORD_COUNT = 10

    def __init__(self, content, word_data_path=None):
        """
        Initialize with article content.

        Args:
            content (str): The article text
            word_data_path (str): Optional path to Oxford word level data;
                ``DEFAULT_WORD_DATA_PATH`` is used when omitted.
        """
        if word_data_path is None:
            word_data_path = self.DEFAULT_WORD_DATA_PATH
        super().__init__(word_data_path)
        self.content = content
        self._level = None  # Cache for computed level

    def _content_words(self):
        """Return the lower-cased alphabetic words of the content, in order.

        Tokens are split on whitespace and stripped of surrounding ASCII
        punctuation; anything that is not purely alphabetic afterwards
        (including tokens that end up empty) is dropped. Duplicates are kept.
        """
        if not self.content:
            return []
        stripped = (
            token.strip(string.punctuation).lower()
            for token in self.content.split()
        )
        return [word for word in stripped if is_valid_word(word)]

    @property
    def level(self):
        """Calculate article's vocabulary level.

        Returns:
            0 when the content has no word with a level above 0. With one
            such word, that word's level. With up to
            ``SHORT_TEXT_MAX_WORDS`` of them, the highest level plus 0.1 per
            additional word. Otherwise the average of the
            ``HARDEST_WORD_COUNT`` highest levels.
        """
        if self._level is None:
            self._level = self._estimate_level()
        return self._level

    def _estimate_level(self):
        """Compute the (uncached) level from the current content."""
        if not self.content or is_punctuation_or_digit(self.content):
            return 0

        # get_word_level is not defined in this class; presumably it is
        # inherited from VocabularyLevelEstimator. Words scored 0 are ignored.
        levels = [
            level
            for level in map(self.get_word_level, self._content_words())
            if level > 0
        ]
        if not levels:
            return 0
        if len(levels) == 1:
            return levels[0]
        if len(levels) <= self.SHORT_TEXT_MAX_WORDS:
            return max(levels) + 0.1 * (len(levels) - 1)

        hardest = sorted(levels, reverse=True)[:self.HARDEST_WORD_COUNT]
        return sum(hardest) / len(hardest)

    def get_difficult_words(self, threshold=6):
        """
        Returns words at or above a difficulty threshold.

        Args:
            threshold (int): Minimum difficulty level (default 6)

        Returns:
            list: ``(word, level)`` tuples for each distinct word whose
            level is at least ``threshold``, hardest first; words of equal
            level are ordered alphabetically so the result is stable between
            runs. Empty when there is no content.
        """
        scored = (
            (word, self.get_word_level(word))
            for word in set(self._content_words())  # one lookup per word
        )
        difficult_words = [
            (word, level) for word, level in scored if level >= threshold
        ]
        return sorted(difficult_words, key=lambda pair: (-pair[1], pair[0]))
def load_record(pickle_file):
    """Load user word history from pickle file.

    The file is looked up as ``static/frequency/<pickle_file>`` first under
    the current working directory and then under the directory that
    contains this module. When neither exists, a warning is printed, a
    built-in default history is written to the second location (creating
    the directories if needed) and that default is returned.

    Args:
        pickle_file (str): File name of the pickle, without directories.

    Returns:
        The unpickled object, or the default history dict
        ``{word: [date strings]}`` when no file was found.

    Note:
        Unpickling can execute arbitrary code; only load files this
        application wrote itself.
    """
    relative_path = os.path.join('static', 'frequency', pickle_file)
    # The module's own directory replaces a former machine-specific absolute
    # path, so the fallback works wherever the app is installed.
    app_dir = os.path.dirname(os.path.abspath(__file__))
    candidates = [
        os.path.join(os.getcwd(), relative_path),
        os.path.join(app_dir, relative_path),
    ]

    for file_path in candidates:
        try:
            with open(file_path, 'rb') as f:
                # SECURITY: pickle.load must never see untrusted data.
                return pickle.load(f)
        except FileNotFoundError:
            continue

    # file_path is now the last candidate, i.e. the app directory path.
    print(f"Warning: Could not find file: {file_path}")
    # Create default word history with advanced words
    default_history = {
        "sophisticated": ["20240101", "20240102", "20240103"],
        "analytical": ["20240101", "20240102", "20240103"],
        "comprehensive": ["20240101", "20240102"],
        "theoretical": ["20240101", "20240103"],
        "implementation": ["20240102", "20240103"],
        "algorithm": ["20240101", "20240102"],
        "methodology": ["20240101", "20240103"],
        "paradigm": ["20240102", "20240103"],
    }
    # Create directory if it doesn't exist
    os.makedirs(os.path.dirname(file_path), exist_ok=True)
    # Save default history so the next call finds a file
    with open(file_path, 'wb') as f:
        pickle.dump(default_history, f)
    return default_history
def _run_demo():
    """Print example output for a stored user history and a sample article.

    Kept in a function so the demo's temporaries do not become module
    globals when the file is run as a script.
    """
    # Example usage
    history = load_record('frequency_mr1an85.pickle')  # Just use the filename
    print("User word history:", history)

    # Test user vocabulary level
    user = UserVocabularyLevel(history)
    print("User vocabulary level:", user.level)
    print("Level distribution:", user.get_level_distribution())

    # Test article vocabulary level
    article = ArticleVocabularyLevel(
        "This is an interesting article with sophisticated vocabulary."
    )
    print("Article vocabulary level:", article.level)
    print("Difficult words:", article.get_difficult_words())


if __name__ == "__main__":
    _run_demo()