196 lines
7.4 KiB
Python
196 lines
7.4 KiB
Python
from difficulty import VocabularyLevelEstimator
|
|
import pickle
|
|
import os
|
|
from collections import Counter
|
|
import string
|
|
|
|
# Helper functions
|
|
|
|
def is_punctuation_or_digit(s: str) -> bool:
    """Return True when *s* holds only punctuation, digits, or whitespace.

    Args:
        s: The text to inspect.

    Returns:
        True if every character is in ``string.punctuation``, is a digit,
        or is whitespace. An empty string also yields True, because
        ``all`` over an empty iterable is vacuously true.
    """
    return all(
        c in string.punctuation or c.isdigit() or c.isspace()
        for c in s
    )
|
|
|
|
def is_valid_word(word: str) -> bool:
    """Return True when *word* is non-empty and consists solely of letters.

    Args:
        word: The candidate token.

    Returns:
        The result of ``word.isalpha()``; tokens containing digits,
        apostrophes, hyphens, or whitespace are rejected, as is ``""``.
    """
    return word.isalpha()
|
|
|
|
class UserVocabularyLevel(VocabularyLevelEstimator):
    """Estimates a user's vocabulary level based on their word history."""

    def __init__(self, word_history, word_data_path=None):
        """Initialize with the user's word history.

        Args:
            word_history (dict): Words the user has learned, shaped as
                ``{word: [timestamp1, timestamp2, ...]}``.
            word_data_path (str): Optional path to the Oxford word level
                data; ``'db/oxford_words.txt'`` is used when omitted.
        """
        if word_data_path is None:
            word_data_path = 'db/oxford_words.txt'
        super().__init__(word_data_path)
        self.word_history = word_history
        self._level = None  # Cache for the computed level

    @property
    def level(self):
        """Return the user's vocabulary level, computing it on first access.

        The result is cached in ``self._level``; later changes to
        ``word_history`` are not picked up.
        """
        if self._level is None:
            self._level = self._compute_level()
        return self._level

    def _compute_level(self):
        """Derive the level from the three most recently seen valid words."""
        if not self.word_history:
            return 0

        recent_words = self._most_recent_words()
        if not recent_words:
            return 0

        levels = [self.get_word_level(word) for word in recent_words]
        # A level of 0 is treated as "no usable level" for a word; if none of
        # the recent words has one, the user gets level 0 as well.
        if all(word_level == 0 for word_level in levels):
            return 0

        avg = sum(levels) / len(levels)
        # The estimate is floored at 4: easy recent words never pull it lower.
        return avg if avg >= 4 else 4

    def _most_recent_words(self, limit=3):
        """Return up to *limit* distinct valid words, most recent first."""
        events = [
            (timestamp, word)
            for word, timestamps in self.word_history.items()
            for timestamp in timestamps
        ]
        # Newest first; equal timestamps fall back to reverse word order.
        events.sort(reverse=True)

        recent = []
        for _, word in events:
            if word in recent or not is_valid_word(word):
                continue
            recent.append(word)
            if len(recent) == limit:
                break
        return recent

    def get_level_distribution(self):
        """Return a Counter mapping word level to number of valid words.

        An empty history yields an empty Counter (which compares equal to
        ``{}``), so the return type is the same on every path.
        """
        if not self.word_history:
            return Counter()
        return Counter(
            self.get_word_level(word)
            for word in self.word_history
            if is_valid_word(word)
        )
|
|
|
|
class ArticleVocabularyLevel(VocabularyLevelEstimator):
    """Estimates the vocabulary level of an article."""

    def __init__(self, content, word_data_path=None):
        """Initialize with the article content.

        Args:
            content (str): The article text.
            word_data_path (str): Optional path to the Oxford word level
                data; ``'db/oxford_words.txt'`` is used when omitted.
        """
        if word_data_path is None:
            word_data_path = 'db/oxford_words.txt'
        super().__init__(word_data_path)
        self.content = content
        self._level = None  # Cache for the computed level

    def _words(self):
        """Return the article's lowercase, purely alphabetic tokens.

        Tokens are split on whitespace and stripped of surrounding
        punctuation; empty or missing content yields an empty list.
        """
        if not self.content:
            return []
        stripped = (
            token.strip(string.punctuation).lower()
            for token in self.content.split()
        )
        return [word for word in stripped if is_valid_word(word)]

    @property
    def level(self):
        """Return the article's vocabulary level, computing it on first access.

        Only words whose level is greater than 0 count. With one such word
        the level is that word's level; with two or three it is the highest
        level plus 0.1 per additional word; otherwise it is the mean of the
        ten highest levels. Articles without any such word get level 0.
        """
        if self._level is None:
            self._level = self._compute_level()
        return self._level

    def _compute_level(self):
        """Compute the level described in :attr:`level` without caching."""
        if not self.content or is_punctuation_or_digit(self.content):
            return 0

        levels = [
            word_level
            for word_level in map(self.get_word_level, self._words())
            if word_level > 0
        ]
        if not levels:
            return 0
        if len(levels) == 1:
            return levels[0]
        if len(levels) <= 3:
            return max(levels) + 0.1 * (len(levels) - 1)

        hardest = sorted(levels, reverse=True)[:10]
        return sum(hardest) / len(hardest)

    def get_difficult_words(self, threshold=6):
        """Return the article's words at or above a difficulty threshold.

        Args:
            threshold (int): Minimum difficulty level (default 6).

        Returns:
            list: ``(word, level)`` tuples, one per distinct word, sorted by
            level from hardest to easiest; words sharing a level are in
            alphabetical order. Empty or missing content yields ``[]``.
        """
        difficult_words = []
        for word in set(self._words()):  # Use set to remove duplicates
            word_level = self.get_word_level(word)
            if word_level >= threshold:
                difficult_words.append((word, word_level))

        # Alphabetical first, then a stable sort by level: ties stay
        # alphabetical instead of following arbitrary set iteration order.
        difficult_words.sort()
        difficult_words.sort(key=lambda pair: pair[1], reverse=True)
        return difficult_words
|
|
|
|
def load_record(pickle_file, base_dirs=None):
    """Load a user's word history from a pickle file.

    The file is looked up as ``<base>/static/frequency/<pickle_file>`` in
    each base directory in turn. If none of them has it, a built-in default
    history is saved under the first base directory and returned.

    Args:
        pickle_file (str): File name of the pickle, without directories.
        base_dirs (iterable of str): Optional directories to search, in
            priority order. When omitted, the current working directory is
            tried first, then the legacy install path.

    Returns:
        The unpickled object, or the default word history
        ``{word: [timestamp, ...]}`` when no file was found.
    """
    if not base_dirs:
        # The second entry is a machine-specific legacy location; it is only
        # ever read from, never created.
        base_dirs = [os.getcwd(), r'C:\Users\ANNA\Desktop\app']

    candidates = [
        os.path.join(base, 'static', 'frequency', pickle_file)
        for base in base_dirs
    ]

    for file_path in candidates:
        try:
            with open(file_path, 'rb') as f:
                # SECURITY: unpickling can execute arbitrary code; only load
                # files that come from a trusted source.
                return pickle.load(f)
        except FileNotFoundError:
            continue

    # Nothing found: fall back to a default history and persist it in the
    # highest-priority location so the next call finds it.
    save_path = candidates[0]
    print(f"Warning: Could not find file: {save_path}")

    # Default word history with advanced words
    default_history = {
        "sophisticated": ["20240101", "20240102", "20240103"],
        "analytical": ["20240101", "20240102", "20240103"],
        "comprehensive": ["20240101", "20240102"],
        "theoretical": ["20240101", "20240103"],
        "implementation": ["20240102", "20240103"],
        "algorithm": ["20240101", "20240102"],
        "methodology": ["20240101", "20240103"],
        "paradigm": ["20240102", "20240103"],
    }

    try:
        os.makedirs(os.path.dirname(save_path), exist_ok=True)
        with open(save_path, 'wb') as f:
            pickle.dump(default_history, f)
    except OSError as exc:
        # Saving is best-effort; the caller still gets the default history.
        print(f"Warning: Could not save default word history: {exc}")

    return default_history
|
|
|
|
def main():
    """Run a small demo of the user and article level estimators."""
    # Example usage
    history = load_record('frequency_mr1an85.pickle')  # Just use the filename
    print("User word history:", history)

    # Test user vocabulary level
    user = UserVocabularyLevel(history)
    print("User vocabulary level:", user.level)
    print("Level distribution:", user.get_level_distribution())

    # Test article vocabulary level
    article = ArticleVocabularyLevel(
        "This is an interesting article with sophisticated vocabulary."
    )
    print("Article vocabulary level:", article.level)
    print("Difficult words:", article.get_difficult_words())


if __name__ == "__main__":
    main()