import pickle
import math
from wordfreqCMD import remove_punctuation, freq, sort_in_descending_order, sort_in_ascending_order, map_percentages_to_levels
import snowballstemmer


class WordDifficultyEvaluator:
    """Rate the difficulty of words, of a user's word book, and of a text.

    Difficulty levels are small integers. Words found in the tagged word
    bank get a level from 4 to 8 (see ``_TAG_LEVELS``); user words that
    cannot be matched get ``_UNKNOWN_WORD_LEVEL``.
    """

    # Ordered (tags, level) pairs: the first group containing a matching tag
    # decides the level, so e.g. a word tagged both CET4 and BBC is rated 4.
    _TAG_LEVELS = (
        (('CET4',), 4),
        (('OXFORD3000',), 5),
        (('CET6', 'GRADUATE'), 6),
        (('OXFORD5000', 'IELTS'), 7),
        (('BBC',), 8),
    )

    # Level given to a user word when neither it nor its stem is rated.
    _UNKNOWN_WORD_LEVEL = 3

    # Placeholder timestamp used for legacy records that store a plain
    # count instead of a list of timestamps.
    _LEGACY_TIMESTAMP = '2021082019'

    # Words ignored when rating a text.
    _STOP_WORDS = frozenset({
        'the', 'and', 'of', 'to', 'what', 'in', 'there', 'when',
        'them', 'would', 'will', 'out', 'his', 'mr', 'that', 'up',
        'more', 'your', 'it', 'now', 'very', 'then', 'could', 'he',
        'any', 'some', 'with', 'into', 'you', 'our', 'man', 'other',
        'time', 'was', 'than', 'know', 'about', 'only', 'like', 'how',
        'see', 'is', 'before', 'such', 'little', 'two', 'its', 'as',
        'these', 'may', 'much', 'down', 'for', 'well', 'should', 'those',
        'after', 'same', 'must', 'say', 'first', 'again', 'us', 'great',
        'where', 'being', 'come', 'over', 'good', 'himself', 'am', 'never',
        'on', 'old', 'here', 'way', 'at', 'go', 'upon', 'have', 'had',
        'without', 'my', 'day', 'be', 'but', 'though', 'from', 'not',
        'too', 'another', 'this', 'even', 'still', 'her', 'yet', 'under',
        'by', 'let', 'just', 'all', 'because', 'we', 'always', 'off',
        'yes', 'so', 'while', 'why', 'which', 'me', 'are', 'or', 'no',
        'if', 'an', 'also', 'thus', 'who', 'cannot', 'she', 'whether',
    })

    def __init__(self):
        # Cache of word -> difficulty level, filled by
        # convert_test_type_to_difficulty_level().
        self.ENGLISH_WORD_DIFFICULTY_DICT = {}

    def load_record(self, pickle_fname):
        """Return the object stored in the pickle file ``pickle_fname``.

        :param pickle_fname: path of the pickle file to read
        :return: the unpickled object
        """
        # SECURITY: unpickling can execute arbitrary code; only load files
        # that come from a trusted source.
        with open(pickle_fname, 'rb') as f:
            return pickle.load(f)

    def convert_test_type_to_difficulty_level(self, d):
        """Rate the words of the word bank by the tests they are tagged with.

        The result is also stored in ``self.ENGLISH_WORD_DIFFICULTY_DICT``.

        :param d: dict loaded from the word-bank pickle file; maps each word
            to a container of tags such as 'CET4' or 'BBC'
        :return: dict mapping each word to its level (4-8); words carrying
            none of the known tags are left out
        """
        result = {}
        for word, tags in d.items():
            for group, level in self._TAG_LEVELS:
                if any(tag in tags for tag in group):
                    result[word] = level
                    break

        self.ENGLISH_WORD_DIFFICULTY_DICT = result
        return result

    def get_difficulty_level_for_user(self, d1, d2):
        """Return a word -> level dict that also covers the words in ``d1``.

        The rated word bank is computed from ``d2`` on first use and taken
        from the cache afterwards (``d2`` is then ignored). Words of ``d1``
        missing from it are added IN PLACE to that dict, which is the cached
        ``self.ENGLISH_WORD_DIFFICULTY_DICT``: they get the level of their
        stem when the stem is rated, otherwise ``_UNKNOWN_WORD_LEVEL``.

        :param d1: iterable of the user's words (e.g. a dict keyed by word)
        :param d2: tagged word bank, as accepted by
            convert_test_type_to_difficulty_level()
        :return: the extended word -> level dict
        """
        if not self.ENGLISH_WORD_DIFFICULTY_DICT:
            levels = self.convert_test_type_to_difficulty_level(d2)
        else:
            levels = self.ENGLISH_WORD_DIFFICULTY_DICT

        stemmer = snowballstemmer.stemmer('english')

        for word in d1:
            if word in levels:
                continue
            stem = stemmer.stemWord(word)
            levels[word] = levels.get(stem, self._UNKNOWN_WORD_LEVEL)
        return levels

    def revert_dict(self, d):
        """Invert a word -> timestamps dict into a date -> words dict.

        :param d: dict whose values are either a list of timestamp strings
            or, for legacy records, an integer count; a count of n is
            treated as n occurrences of a fixed placeholder timestamp
        :return: dict mapping the first 10 characters of each timestamp to
            the list of words recorded with it (one entry per occurrence)
        """
        d2 = {}
        for word, value in d.items():
            if isinstance(value, list):
                timestamps = value
            else:
                # The placeholder list is repeated `value` times; the
                # operands must be (list, int), never (list, list).
                timestamps = [self._LEGACY_TIMESTAMP] * value
            for time_info in timestamps:
                d2.setdefault(time_info[:10], []).append(word)
        return d2

    def user_difficulty_level(self, d_user, d, calc_func=0):
        """Compute the difficulty level of a user's word book.

        :param d_user: the user's words, in the format accepted by
            revert_dict()
        :param d: dict mapping words to difficulty levels
        :param calc_func: non-zero selects the geometric mean of the levels
            of the user's rated words; 0 (default) selects the sum of
            level * weight, with the weights returned by
            map_percentages_to_levels() for the share of each level
        :return: the computed level; 1 (or 1.0 for the geometric mean) when
            none of the user's words is rated in ``d``
        """
        d_user2 = self.revert_dict(d_user)
        if calc_func != 0:
            log_levels = [
                math.log(d[word])
                for date in sorted(d_user2, reverse=True)
                for word in d_user2[date]
                if word in d
            ]
            # max(..., 1) avoids dividing by zero when nothing is rated.
            return math.exp(sum(log_levels) / max(len(log_levels), 1))

        # Number of rated user words (with repetitions) per difficulty level.
        count = {}
        for date in d_user2:
            for word in d_user2[date]:
                if word in d:
                    difficulty = d[word]
                    count[difficulty] = count.get(difficulty, 0) + 1
        total = sum(count.values())
        if total == 0:
            return 1
        percentages = {level: n / total for level, n in count.items()}
        weight = map_percentages_to_levels(percentages)
        return sum(weight[k] * k for k in weight)

    def text_difficulty_level(self, s, d):
        """Compute the difficulty level of the text ``s``.

        Stop words and words not rated in ``d`` are ignored. The result is
        the geometric mean of the levels of the first (at most) 20 entries
        returned by sort_in_descending_order() -- presumably the hardest
        words; confirm against wordfreqCMD.

        :param s: the text to rate
        :param d: dict mapping words to difficulty levels
        :return: the geometric mean described above; 1.0 when no word of
            the text is rated
        """
        s = remove_punctuation(s)
        word_freqs = freq(s)  # iterated as (word, <ignored>) pairs
        rated = [
            (word, d[word])
            for word, _ in word_freqs
            if word not in self._STOP_WORDS and word in d
        ]
        top = sort_in_descending_order(rated)[:20]
        geometric = math.prod(hard for _, hard in top)
        return geometric ** (1 / max(len(top), 1))


if __name__ == '__main__':
    # Demo run: rate the text in wordlist.txt using the two pickled records.
    evaluator = WordDifficultyEvaluator()
    frequency_record = evaluator.load_record('frequency.p')
    tagged_record = evaluator.load_record('words_and_tests.p')
    word_levels = evaluator.get_difficulty_level_for_user(
        frequency_record, tagged_record
    )

    with open('wordlist.txt') as text_file:
        text = text_file.read()

    text_level = evaluator.text_difficulty_level(text, word_levels)
    print(text_level)