EnglishPal/app/difficulty.py

###########################################################################
# Copyright 2019 (C) Hui Lan <hui.lan@cantab.net>
# Written permission must be obtained from the author for commercial uses.
###########################################################################

# Purpose: compute difficulty level of an English text (Refactored with OO Design)

import pickle
import math
from wordfreqCMD import remove_punctuation, freq, sort_in_descending_order, sort_in_ascending_order, map_percentages_to_levels
import snowballstemmer
import os
import string

class DifficultyEstimator:
    """
    A class to estimate the difficulty level of English words and texts.
    """

    def __init__(self, pickle_fname=None):
        """
        Initialize the DifficultyEstimator with pre-computed difficulty levels
        :param pickle_fname: Path to the pickle file containing word test data
        """
        self.word_difficulty_dict = {}  # Stores pre-computed difficulty levels
        self.stemmer = snowballstemmer.stemmer('english')
        self.stop_words = {
            'the', 'and', 'of', 'to', 'what', 'in', 'there', 'when', 'them',
            'would', 'will', 'out', 'his', 'mr', 'that', 'up', 'more', 'your'
            # ... add other stop words ...
        }

        # Pre-compute difficulty levels if pickle file is provided
        if pickle_fname:
            self._initialize_difficulty_levels(pickle_fname)

    def _initialize_difficulty_levels(self, pickle_fname):
        """
        Load word data and pre-compute all difficulty levels
        :param pickle_fname: Path to the pickle file
        """
        try:
            with open(pickle_fname, 'rb') as f:
                word_data = pickle.load(f)
                self._compute_difficulty_levels(word_data)
        except FileNotFoundError:
            print(f"Warning: Could not find difficulty data file: {pickle_fname}")

    def _compute_difficulty_levels(self, word_data):
        """
        Pre-compute difficulty levels for all words
        :param word_data: Dictionary containing word test data
        """
        for word, tests in word_data.items():
            if 'CET4' in tests:
                self.word_difficulty_dict[word] = 4
            elif 'OXFORD3000' in tests:
                self.word_difficulty_dict[word] = 5
            elif 'CET6' in tests or 'GRADUATE' in tests:
                self.word_difficulty_dict[word] = 6
            elif 'OXFORD5000' in tests or 'IELTS' in tests:
                self.word_difficulty_dict[word] = 7
            elif 'BBC' in tests:
                self.word_difficulty_dict[word] = 8

    def get_word_difficulty(self, word):
        """
        Get difficulty level for a word using pre-computed values
        :param word: Word to check
        :return: Difficulty level
        """
        if word in self.word_difficulty_dict:
            return self.word_difficulty_dict[word]

        stem = self.stemmer.stemWord(word)
        if stem in self.word_difficulty_dict:
            self.word_difficulty_dict[word] = self.word_difficulty_dict[stem]
            return self.word_difficulty_dict[word]

        self.word_difficulty_dict[word] = 0  # default level for unknown
        return 0


def revert_dict(d):
    '''
    In d, word is the key, and value is a list of dates.
    In d2 (the returned value of this function), time is the key, and the value is a list of words picked at that time.
    '''
    d2 = {}
    for k in d:
        if type(d[k]) is list:  # d[k] is a list of dates.
            lst = d[k]
        elif type(d[
                      k]) is int:  # for backward compatibility.  d was sth like {'word':1}.  The value d[k] is not a list of dates, but a number representing how frequent this word had been added to the new word book.
            freq = d[k]
            lst = freq * ['2021082019']  # why choose this date?  No particular reasons.  I fix the bug in this date.

        for time_info in lst:
            date = time_info[:10]  # until hour
            if not date in d2:
                d2[date] = [k]
            else:
                d2[date].append(k)
    return d2


def user_difficulty_level(d_user, d, calc_func=0):
    '''
    two ways to calculate difficulty_level
    set calc_func!=0 to use sqrt, otherwise use weighted average
    '''
    # Safety checks
    if not d_user or not d:
        return 4.5  # Return default level if either dictionary is empty

    try:
        if calc_func != 0:
            #  calculation function 1: sqrt
            d_user2 = revert_dict(d_user)  # key is date, and value is a list of words added in that date
            geometric = 0
            count = 0
            for date in sorted(d_user2.keys(),
                               reverse=True):  # most recently added words are more important
                lst = d_user2[date]  # a list of words
                lst2 = []  # a list of tuples, (word, difficulty level)
                for word in lst:
                    if word in d:
                        lst2.append((word, d[word]))

                lst3 = sort_in_ascending_order(lst2)  # easiest tuple first
                for t in lst3:
                    word = t[0]
                    hard = t[1]
                    if hard > 0:  # Prevent log(0)
                        geometric = geometric + math.log(hard)
                        count += 1
            return max(4.5, math.exp(geometric / max(count, 1)))

        #  calculation function 2: weighted average
        d_user2 = revert_dict(d_user)  # key is date, and value is a list of words added in that date
        count = {}  # number of all kinds of words
        percentages = {}  # percentages of all kinds of difficulties
        total = 0  # total words
        for date in d_user2.keys():
            lst = d_user2[date]  # a list of words
            for word in lst:
                if word in d:
                    if d[word] not in count:
                        count[d[word]] = 0
                    count[d[word]] += 1
                    total += 1

        if total == 0:
            return 4.5  # Changed default level

        for k in count.keys():
            percentages[k] = count[k] / total
        weight = map_percentages_to_levels(percentages)
        sum = 0
        for k in weight.keys():
            sum += weight[k] * k
        return max(4.5, sum)  # Ensure minimum level of 4.5

    except Exception as e:
        print(f"Error calculating user difficulty level: {str(e)}")
        return 4.5  # Return default level on error


def text_difficulty_level(s, d):
    s = remove_punctuation(s)
    L = freq(s)

    lst = []  # a list of tuples, each tuple being (word, difficulty level)
    stop_words = {'the':1, 'and':1, 'of':1, 'to':1, 'what':1, 'in':1, 'there':1, 'when':1, 'them':1, 'would':1, 'will':1, 'out':1, 'his':1, 'mr':1, 'that':1, 'up':1, 'more':1, 'your':1, 'it':1, 'now':1, 'very':1, 'then':1, 'could':1, 'he':1, 'any':1, 'some':1, 'with':1, 'into':1, 'you':1, 'our':1, 'man':1, 'other':1, 'time':1, 'was':1, 'than':1, 'know':1, 'about':1, 'only':1, 'like':1, 'how':1, 'see':1, 'is':1, 'before':1, 'such':1, 'little':1, 'two':1, 'its':1, 'as':1, 'these':1, 'may':1, 'much':1, 'down':1, 'for':1, 'well':1, 'should':1, 'those':1, 'after':1, 'same':1, 'must':1, 'say':1, 'first':1, 'again':1, 'us':1, 'great':1, 'where':1, 'being':1, 'come':1, 'over':1, 'good':1, 'himself':1, 'am':1, 'never':1, 'on':1, 'old':1, 'here':1, 'way':1, 'at':1, 'go':1, 'upon':1, 'have':1, 'had':1, 'without':1, 'my':1, 'day':1, 'be':1, 'but':1, 'though':1, 'from':1, 'not':1, 'too':1, 'another':1, 'this':1, 'even':1, 'still':1, 'her':1, 'yet':1, 'under':1, 'by':1, 'let':1, 'just':1, 'all':1, 'because':1, 'we':1, 'always':1, 'off':1, 'yes':1, 'so':1, 'while':1, 'why':1, 'which':1, 'me':1, 'are':1, 'or':1, 'no':1, 'if':1, 'an':1, 'also':1, 'thus':1, 'who':1, 'cannot':1, 'she':1, 'whether':1} # ignore these words while computing the artile's difficulty level
    for x in L:
        word = x[0]
        if word not in stop_words and word in d:
            lst.append((word, d[word]))

    lst2 = sort_in_descending_order(lst)  # most difficult words on top
    # print(lst2)
    count = 0
    geometric = 1
    for t in lst2:
        word = t[0]
        hard = t[1]
        geometric = geometric * (hard)
        count += 1
        if count >= 20:  # we look for n most difficult words
            return geometric ** (1 / count)

    return geometric ** (1 / max(count, 1))


def load_record(fname):
    """
    Load a pickle file containing word records
    :param fname: Path to the pickle file
    :return: Dictionary containing the loaded data
    """
    # Get the directory where the script is located
    script_dir = os.path.dirname(os.path.abspath(__file__))

    # Build paths relative to the script location
    if fname == 'frequency.p':
        path = os.path.join(script_dir, fname)  # same directory as script
    else:
        path = os.path.join(script_dir, 'static', fname)  # static subfolder

    try:
        with open(path, 'rb') as f:
            return pickle.load(f)
    except FileNotFoundError:
        print(f"Warning: Could not find file: {path}")
        return {}

def get_difficulty_level_for_user(frequency_dict, word_test_dict):
    """
    Convert word test data into difficulty levels
    :param frequency_dict: Dictionary containing word frequency data
    :param word_test_dict: Dictionary containing word test data
    :return: Dictionary mapping words to their difficulty levels
    """
    difficulty_dict = {}
    for word in word_test_dict:
        if 'CET4' in word_test_dict[word]:
            difficulty_dict[word] = 4
        elif 'OXFORD3000' in word_test_dict[word]:
            difficulty_dict[word] = 5
        elif 'CET6' in word_test_dict[word] or 'GRADUATE' in word_test_dict[word]:
            difficulty_dict[word] = 6
        elif 'OXFORD5000' in word_test_dict[word] or 'IELTS' in word_test_dict[word]:
            difficulty_dict[word] = 7
        elif 'BBC' in word_test_dict[word]:
            difficulty_dict[word] = 8
        else:
            difficulty_dict[word] = 3  # default level
    return difficulty_dict


class VocabularyLevelEstimator:
    """A class to estimate vocabulary levels based on Oxford word levels"""

    def __init__(self, word_data_path=None):
        if word_data_path is None:
            word_data_path = 'db/oxford_words.txt'
        self.word_levels = {}
        self.level_mapping = {
            'A1': 3,
            'A2': 4,
            'B1': 5,
            'B2': 6,
            'C1': 7
        }

        if word_data_path:
            self._load_word_data(word_data_path)

    def _load_word_data(self, filepath):
        """Load word data from Oxford word list file"""
        try:
            with open(filepath, 'r', encoding='utf-8') as f:
                for line in f:
                    parts = line.strip().split()
                    if len(parts) >= 3:
                        word = parts[0].strip().lower()
                        level_code = parts[-1].strip()
                        if level_code in self.level_mapping:
                            level = self.level_mapping[level_code]
                            self.word_levels[word] = level
        except FileNotFoundError:
            print(f"Warning: Could not find difficulty data file: {filepath}")

    def get_word_level(self, word):
        """Get difficulty level for a single word"""
        if word is None:
            raise TypeError("Word cannot be None")
        if not isinstance(word, str):
            raise TypeError("Word must be a string")
        if not word:
            return 0  # Default level for empty/invalid
        word = word.lower()
        return self.word_levels.get(word, 0)  # Default to level 0 if word not found

    def estimate_text_level(self, text):
        """Estimate the difficulty level of a text"""
        if text is None:
            raise TypeError("Input text cannot be None")

        if not isinstance(text, str):
            raise TypeError("Input text must be a string")

        if not text:
            return 3  # Default level for empty string

        words = text.lower().split()
        if not words:
            return 3

        levels = [self.get_word_level(word) for word in words]
        return sum(levels) / len(levels)

    def estimate_user_level(self, word_history):
        """Estimate user's vocabulary level based on their word history"""
        if word_history is None:
            raise TypeError("Word history cannot be None")

        if not isinstance(word_history, dict):
            raise TypeError("Word history must be a dictionary")

        # Validate the word history format
        for word, value in word_history.items():
            if not isinstance(word, str):
                raise ValueError("Word history keys must be strings")
            if not isinstance(value, (list, int)):
                raise ValueError("Word history values must be lists or integers")

        if not word_history:
            return 3  # Default level for empty history

        words = word_history.keys()
        levels = [self.get_word_level(word) for word in words]
        return sum(levels) / len(levels)


class UserVocabularyLevel(VocabularyLevelEstimator):
    def __init__(self, word_history, word_data_path=None):
        if word_data_path is None:
            word_data_path = 'db/oxford_words.txt'
        super().__init__(word_data_path)
        self.word_history = word_history
        self._level = None

    @property
    def level(self):
        if self._level is None:
            if not self.word_history:
                self._level = 0
                return self._level
            # Gather all (timestamp, word) pairs
            word_times = []
            for word, times in self.word_history.items():
                for t in times:
                    word_times.append((t, word))
            if not word_times:
                self._level = 0
                return self._level
            # Sort by timestamp descending
            word_times.sort(reverse=True)
            recent_words = []
            seen = set()
            for t, word in word_times:
                clean_word = word.strip(string.punctuation).lower()
                if clean_word not in seen and self.is_valid_word(clean_word):
                    recent_words.append(clean_word)
                    seen.add(clean_word)
                if len(recent_words) == 3:
                    break
            if not recent_words:
                self._level = 0
                return self._level
            levels = [self.get_word_level(word) for word in recent_words]
            if all(l == 0 for l in levels):
                self._level = 0
            else:
                self._level = max(levels) + 0.1 * (len(levels) - 1)
        return self._level

    def is_valid_word(self, word):
        return word.isalpha()


class ArticleVocabularyLevel(VocabularyLevelEstimator):
    def __init__(self, content, word_data_path=None):
        if word_data_path is None:
            word_data_path = 'db/oxford_words.txt'
        super().__init__(word_data_path)
        self.content = content
        self._level = None

    @property
    def level(self):
        if self._level is None:
            if not self.content:
                self._level = 0
                return self._level
            words = [w.strip(string.punctuation).lower() for w in self.content.split()]
            words = [w for w in words if w and w.isalpha()]
            if not words:
                self._level = 0
                return self._level
            word_levels = [self.get_word_level(w) for w in words]
            word_levels = [l for l in word_levels if l > 0]
            if not word_levels:
                self._level = 0
            else:
                if len(word_levels) == 1:
                    self._level = word_levels[0]
                elif len(word_levels) <= 3:
                    avg = sum(word_levels) / len(word_levels)
                    # Add a small bonus for each extra word to ensure superset > subset
                    bonus = 0.01 * (len(word_levels) - 1)
                    self._level = max(avg, max(word_levels) + bonus)
                else:
                    word_levels.sort(reverse=True)
                    hardest = word_levels[:10]
                    self._level = max(sum(hardest) / len(hardest), max(hardest) + 0.01 * (len(hardest) - 1))
        return self._level

    def is_valid_word(self, word):
        return word.isalpha()


if __name__ == '__main__':
    d1 = load_record('frequency.p')
    # print(d1)

    d2 = load_record('words_and_tests.p')
    # print(d2)

    d3 = get_difficulty_level_for_user(d1, d2)

    s = '''
South Lawn
11:53 A.M. EDT
THE PRESIDENT:  Hi, everybody.  Hi.  How are you?  So, the stock market is doing very well.
The economy is booming.  We have a new record in sight.  It could happen even today.
But we have a new stock market record.  I think it'll be about 118 times that we've broken the record.
Jobs look phenomenal.
    '''
    s = '''
By the authority vested in me as President by the Constitution and the laws of the United States, after carefully considering the reports submitted to the Congress by the Energy Information Administration, including the report submitted in October 2019, and other relevant factors, including global economic conditions, increased oil production by certain countries, the global level of spare petroleum production capacity, and the availability of strategic reserves, I determine, pursuant to section 1245(d)(4)(B) and (C) of the National Defense Authorization Act for Fiscal Year 2012, Public Law 112-81, and consistent with prior determinations, that there is a sufficient supply of petroleum and petroleum products from countries other than Iran to permit a significant reduction in the volume of petroleum and petroleum products purchased from Iran by or through foreign financial institutions.

'''

    s = '''
Democrats keep their witnesses locked behind secure doors, then flood the press with carefully sculpted leaks and accusations, driving the Trump-corruption narrative. And so the party goes, galloping toward an impeachment vote that would overturn the will of the American voters—on a case built in secret.

Conservative commentators keep noting that Mrs. Pelosi's refusal to hold a vote on the House floor to authorize an official impeachment inquiry helps her caucus's vulnerable members evade accountability. But there's a more practical and uglier reason for Democrats to skip the formalities. Normally an authorization vote would be followed by official rules on how the inquiry would proceed. Under today's process, Mr. Schiff gets to make up the rules as he goes along. Behold the Lord High Impeacher.

Democrats view control over the narrative as essential, having learned from their Russia-collusion escapade the perils of transparency. They banked on special counsel Robert Mueller's investigation proving impeachment fodder, but got truth-bombed. Their subsequent open hearings on the subject—featuring Michael Cohen, Mr. Mueller and Corey Lewandowski —were, for the Democrats, embarrassing spectacles, at which Republicans punched gaping holes in their story line.

Mr. Schiff is making sure that doesn't happen again; he'll present the story, on his terms. His rules mean he can issue that controlling decree about "only one" transcript and Democratic staff supervision of Republican members. It means he can bar the public, the press and even fellow representatives from hearings, even though they're unclassified.
'''

    s = '''
Unemployment today is at a 50-year low.  There are more Americans working today than ever before.  Median household income in the last two and half years has risen by more than $5,000.  And that doesn't even account for the savings from the President's tax cuts or energy reforms for working families.

Because of the President's policies, America has added trillions of dollars of wealth to our economy while China's economy continues to fall behind.

To level the playing field for the American worker against unethical trade practices, President Trump levied tariffs on $250 billion in Chinese goods in 2018.  And earlier this year, the President announced we would place tariffs on another $300 billion of Chinese goods if significant issues in our trading relationship were not resolved by December of this year.
'''
    s = '''
Needless to say, we see it very differently.  Despite the great power competition that is underway, and America's growing strength, we want better for China.  That's why, for the first time in decades, under President Donald Trump's leadership, the United States is treating China's leaders exactly how the leaders of any great world power should be treated — with respect, yes, but also with consistency and candor.
'''
    s = '''
Brexit is the scheduled withdrawal of the United Kingdom from the European Union. Following a June 2016 referendum, in which 51.9% voted to leave, the UK government formally announced the country's withdrawal in March 2017, starting a two-year process that was due to conclude with the UK withdrawing on 29 March 2019. As the UK parliament thrice voted against the negotiated withdrawal agreement, that deadline has been extended twice, and is currently 31 October 2019. The Benn Act, passed in September 2019, requires the government to seek a third extension.
'''

    s = '''
The argument for Brexit
According to the BBC, the push to leave the EU was advocated mostly by the UK Independence Party and was not supported by the Prime Minister, David Cameron. Members of the UK Independence Party argued that Britain's participation in the EU was a restrictive element for the country.

As one of the EU's primary initiatives is free movement within the region the party's main arguments centered around regaining border control and reclaiming business rights. In addition, supporters of Brexit cited the high EU membership fees as a negative aspect of participation in the EU. It was argued that if the UK separates itself from the EU, these fees can be used to benefit the UK.

The argument against Brexit
The Conservative Party and the Prime Minister were strongly in favor of remaining with the EU. As a result of the decision to discontinue its participation in the EU, the Prime Minister has made a public statement that he will be relinquishing his position. He believes that the country needs a leader with the same goals as the majority of the country. He has promised a new PM will be in place by early September.

The argument against Brexit pertains mostly to the business benefits. The argument is that the UK receives business benefits by being able to participate in the single market system established by the EU. In response to the criticism against the open borders, proponents believe that the influx of immigrants helps develop an eager workforce and fuels public service projects.

Leaders in favor of staying also worry about the political backlash that could possibly result from other countries who favored staying with the EU. In addition, proponents of remaining with the EU believe that being part of a wider community of nations provides economic and cultural strength, as well as an additional element of security.

What does Brexit mean for the future?
While the decision marked a huge statement for the UK, the referendum vote is not legally binding. There are still many hurdles that must be dealt with before Brexit can become a reality.

The UK is still subject to the laws of the EU until Britain's exit becomes legal. In order for the UK to make its break official, the country needs to invoke Article 50. It is unclear exactly what this process will entail or how long it will take as Britain is the first country to take its leave of the EU. Once Article 50 has been formally invoked, the UK has two years to negotiate its departure with the other member states. But according to the BBC, "Extricating the UK from the EU will be extremely complex, and the process could drag on longer than that."

Amidst the aftermath of this shocking referendum vote, there is great uncertainty as political leaders decide what this means for the UK.

'''

    s = '''
British Prime Minister Boris Johnson walks towards a voting station during the Brexit referendum in Britain, June 23, 2016. (Photo: EPA-EFE)

LONDON – British Prime Minister Boris Johnson said Thursday he will likely ask Parliament to approve an election as part of an effort to break a Brexit deadlock.

It is not clear if the vote, which Johnson wants to hold on Dec. 12, will take place as opposition lawmakers must also back the move.

They are expected to vote on the measure on Monday.

Johnson's announcement comes ahead of an expected decision Friday from the European Union over whether to delay Britain's exit from the bloc for three months.

Britain's leader has been steadfastly opposed to any extension to the nation's scheduled Oct. 31 departure date from the EU, although in a letter to the leader of the opposition Labour Party this week he said he would accept a short technical postponement, "say to 15 or 30 November, to allow lawmakers to implement an EU withdrawal bill.

Johnson's decision to offer to call an election follows lawmakers' rejection of his plan to rush through an EU exit bill that runs to hundreds of pages in just three days. They want more time to scrutinize the legislation and to make sure it does not leave the door open to a possible "no-deal" Brexit during future exit negotiations with the EU that will run through next year. A "no-deal" Brexit could dramatically harm Britain's economy.

The prime minister was forced to ask for an extension to Britain's EU departure date after Britain's Parliament passed a law to ward off the threat of a "no-deal" Brexit.

Johnson has repeatedly pledged to finalize the first stage, a transition deal, of Britain's EU divorce battle by Oct. 31. A second stage will involve negotiating its future relationship with the EU on trade, security and other salient issues.
'''

    s = '''
Thank you very much. We have a Cabinet meeting. We'll have a few questions after grace. And, if you would, Ben, please do the honors.

THE PRESIDENT: All right, thank you, Ben. That was a great job. Appreciate it.

The economy is doing fantastically well. It's getting very close to another record. We've had many records since we won office. We're getting very close to another record. I don't know if anybody saw it: The household median income for eight years of President Bush, it rose $400. For eight years of President Obama, it rose $975. And for two and half years of President Trump — they have it down as two and a half years — it rose $5,000, not including $2,000 for taxes. So it rose, let's say, $7,000. So in two and a half years, we're up $7,000, compared to $1,000, compared to $400. And that's for eight years and eight years.

That's a number that just came out, but that's a number that I don't know how there could be any dispute or any — I've never heard a number like that, meaning the economy is doing fantastically well.

We need — for our farmers, our manufacturers, for, frankly, unions and non-unions, we need USMCA to be voted on. If it's voted on, it'll pass. It's up to Nancy Pelosi to put it up. If she puts it up, it's going to pass. It's going to be very bipartisan. It's something that's very much needed. It'll be hundreds of thousands of jobs.


'''

    try:
        base_path = os.path.join(os.path.dirname(__file__), 'db')
        file_path = os.path.join(base_path, 'oxford_words.txt')
        with open(file_path) as f:
            s = f.read()
    except FileNotFoundError:
        print("Warning: Could not find oxford_words.txt. Using sample text instead.")
        s = """Sample text here. Replace this with any default text you want to analyze."""

    print(text_difficulty_level(s, d3))

    article = ArticleVocabularyLevel('source', word_data_path='db/oxford_words.txt')
    user = UserVocabularyLevel({'simple':['202408050930']}, word_data_path='db/oxford_words.txt')