# EnglishPal/vocabulary.py

from difficulty import VocabularyLevelEstimator
import pickle
import os
from collections import Counter
import string

# Helper functions

def is_punctuation_or_digit(s):
    """Report whether ``s`` consists only of punctuation, digits and whitespace.

    Every character must be an ASCII punctuation mark (``string.punctuation``),
    a digit, or whitespace. An empty ``s`` has no offending character and
    therefore yields True.
    """
    for ch in s:
        if ch in string.punctuation:
            continue
        if ch.isdigit() or ch.isspace():
            continue
        # First character that is neither punctuation, digit nor whitespace.
        return False
    return True

def is_valid_word(word):
    """Report whether ``word`` is non-empty and made up solely of letters."""
    only_letters = word.isalpha()
    return only_letters

class UserVocabularyLevel(VocabularyLevelEstimator):
    """Estimates a user's vocabulary level based on their word history"""

    # How many of the most recently seen distinct words feed the estimate.
    RECENT_WORD_COUNT = 3
    # Lowest level reported once at least one recent word has a non-zero level.
    MIN_LEVEL = 4

    def __init__(self, word_history, word_data_path=None):
        """
        Initialize with user's word history

        Args:
            word_history (dict): Maps each word to a list of sortable
                timestamps, ``{word: [timestamp1, timestamp2, ...]}``
            word_data_path (str): Optional path to Oxford word level data;
                defaults to ``'db/oxford_words.txt'``
        """
        if word_data_path is None:
            word_data_path = 'db/oxford_words.txt'
        super().__init__(word_data_path)
        self.word_history = word_history
        self._level = None  # Cache for computed level

    @property
    def level(self):
        """
        Calculate user's vocabulary level based on their word history

        The estimate uses the ``RECENT_WORD_COUNT`` most recently seen
        distinct alphabetic words. The result is computed once and cached.

        Returns:
            0 when the history is empty, holds no alphabetic word, or every
            recent word has level 0; otherwise the mean level of the recent
            words, raised to ``MIN_LEVEL`` when the mean falls below it.
        """
        if self._level is not None:
            return self._level

        history = self.word_history or {}
        # One (timestamp, word) pair per sighting, newest first.
        sightings = sorted(
            ((stamp, word) for word, stamps in history.items() for stamp in stamps),
            reverse=True,
        )

        recent_words = []
        for _, word in sightings:
            if word in recent_words or not is_valid_word(word):
                continue
            recent_words.append(word)
            if len(recent_words) >= self.RECENT_WORD_COUNT:
                break

        levels = [self.get_word_level(word) for word in recent_words]
        if all(lvl == 0 for lvl in levels):
            # Also covers "no usable words at all" (empty list). A level of 0
            # presumably marks a word the estimator does not know.
            self._level = 0
        else:
            average = sum(levels) / len(levels)
            self._level = average if average >= self.MIN_LEVEL else self.MIN_LEVEL
        return self._level

    def get_level_distribution(self):
        """
        Returns distribution of word levels in user's vocabulary

        Returns:
            Counter: Maps each level to the number of alphabetic words in the
            history that have it. Empty when there is no history, so callers
            can always rely on the ``Counter`` API.
        """
        if not self.word_history:
            return Counter()
        return Counter(
            self.get_word_level(word)
            for word in self.word_history
            if is_valid_word(word)
        )

class ArticleVocabularyLevel(VocabularyLevelEstimator):
    """Estimates vocabulary level of an article"""

    def __init__(self, content, word_data_path=None):
        """
        Initialize with article content

        Args:
            content (str): The article text (``None`` or empty is tolerated)
            word_data_path (str): Optional path to Oxford word level data;
                defaults to ``'db/oxford_words.txt'``
        """
        if word_data_path is None:
            word_data_path = 'db/oxford_words.txt'
        super().__init__(word_data_path)
        self.content = content
        self._level = None  # Cache for computed level

    def _content_words(self):
        """Return the article's lowercase alphabetic tokens in reading order."""
        if not self.content:
            return []
        tokens = (
            raw.strip(string.punctuation).lower() for raw in self.content.split()
        )
        # Tokens that are empty after stripping, or contain digits or inner
        # punctuation, fail the alphabetic check and are dropped.
        return [token for token in tokens if is_valid_word(token)]

    @property
    def level(self):
        """
        Calculate article's vocabulary level

        Only words with a level above 0 count. The result is computed once
        and cached.

        Returns:
            0 when no word has a positive level; the word's own level when
            there is exactly one; ``max + 0.1 * (count - 1)`` for two or
            three; otherwise the mean of the ten highest levels.
        """
        if self._level is not None:
            return self._level

        levels = [
            lvl
            for lvl in (self.get_word_level(word) for word in self._content_words())
            if lvl > 0
        ]
        if not levels:
            self._level = 0
        elif len(levels) == 1:
            self._level = levels[0]
        elif len(levels) <= 3:
            # Short texts: hardest word plus a small bonus per extra word.
            self._level = max(levels) + 0.1 * (len(levels) - 1)
        else:
            hardest = sorted(levels, reverse=True)[:10]
            self._level = sum(hardest) / len(hardest)
        return self._level

    def get_difficult_words(self, threshold=6):
        """
        Returns words at or above difficulty threshold

        Args:
            threshold (int): Minimum difficulty level, inclusive (default 6)

        Returns:
            list: ``(word, level)`` tuples for each distinct word, hardest
            first; words of equal level are ordered alphabetically so the
            result is the same on every run. Empty when there is no content.
        """
        scored = (
            (word, self.get_word_level(word)) for word in set(self._content_words())
        )
        difficult_words = [pair for pair in scored if pair[1] >= threshold]
        return sorted(difficult_words, key=lambda pair: (-pair[1], pair[0]))

def load_record(pickle_file, search_dirs=None):
    """Load user word history from pickle file

    The file is looked up as ``<dir>/static/frequency/<pickle_file>`` in each
    search directory in turn. When none of them has it, a built-in default
    history is returned and, on a best-effort basis, saved under the first
    search directory so later calls find it.

    Args:
        pickle_file (str): File name of the pickle (no directory part).
        search_dirs (list): Optional base directories to search, in order.
            Defaults to the current working directory, then this module's
            directory, then the legacy developer path.

    Returns:
        dict: The unpickled history, or the default
        ``{word: [timestamp, ...]}`` history when no file was found.
    """
    if search_dirs is None:
        search_dirs = [os.getcwd()]
        module_file = globals().get('__file__')
        if module_file:
            search_dirs.append(os.path.dirname(os.path.abspath(module_file)))
        # Legacy developer-machine location: still read for backward
        # compatibility, but never written to.
        search_dirs.append(r'C:\Users\ANNA\Desktop\app')

    candidates = [
        os.path.join(base_dir, 'static', 'frequency', pickle_file)
        for base_dir in search_dirs
    ]

    for file_path in candidates:
        try:
            with open(file_path, 'rb') as f:
                # NOTE(security): pickle.load can execute arbitrary code;
                # only point this at files the application wrote itself.
                return pickle.load(f)
        except (FileNotFoundError, NotADirectoryError):
            continue

    file_path = candidates[-1] if candidates else pickle_file
    print(f"Warning: Could not find file: {file_path}")

    # Create default word history with advanced words
    default_history = {
        "sophisticated": ["20240101", "20240102", "20240103"],
        "analytical": ["20240101", "20240102", "20240103"],
        "comprehensive": ["20240101", "20240102"],
        "theoretical": ["20240101", "20240103"],
        "implementation": ["20240102", "20240103"],
        "algorithm": ["20240101", "20240102"],
        "methodology": ["20240101", "20240103"],
        "paradigm": ["20240102", "20240103"]
    }

    if candidates:
        save_path = candidates[0]
        try:
            os.makedirs(os.path.dirname(save_path), exist_ok=True)
            with open(save_path, 'wb') as f:
                pickle.dump(default_history, f)
        except OSError as err:
            # Persisting the default is a convenience only; the caller still
            # gets a usable history.
            print(f"Warning: Could not save default history to {save_path}: {err}")

    return default_history

if __name__ == "__main__":
    # Demo run: fetch a stored history by bare file name and show it.
    history = load_record('frequency_mr1an85.pickle')
    print("User word history:", history)

    # Learner side: overall estimate plus the per-level breakdown.
    learner = UserVocabularyLevel(history)
    print("User vocabulary level:", learner.level)
    print("Level distribution:", learner.get_level_distribution())

    # Article side: estimate for a one-sentence sample text.
    sample_text = "This is an interesting article with sophisticated vocabulary."
    sample = ArticleVocabularyLevel(sample_text)
    print("Article vocabulary level:", sample.level)
    print("Difficult words:", sample.get_difficult_words())