# EnglishPal/app/Article.py

from WordFreq import WordFreq
from AAA_VocabularyLevelEstimator import VocabularyLevelEstimator
from difficulty import WordDifficultyEvaluator
from wordfreqCMD import youdao_link, sort_in_descending_order
import pickle_idea, pickle_idea2
import os
import random, glob
import hashlib
from datetime import datetime
from flask import Flask, request, redirect, render_template, url_for, session, abort, flash, get_flashed_messages
from model.article import get_all_articles, get_article_by_id, get_number_of_articles
import logging
import re

# Base directory prepended to the 'static/words_and_tests.p' pickle path in get_today_article.
path_prefix = './'
db_path_prefix = './db/'  # Comment out this line when deploying
# Word list file handed to load_oxford_words, which reads whitespace-separated
# word / part-of-speech / level fields from each line.
oxford_words_path = './db/oxford_words.txt'


def count_oxford_words(text, oxford_words):
    """Count how many word tokens of *text* appear in *oxford_words*.

    The text is lower-cased and tokenised with the ``\\b\\w+\\b`` pattern;
    every occurrence of a token is counted, not just distinct words.

    Args:
        text: The text to analyse.
        oxford_words: Container supporting ``in`` membership tests.

    Returns:
        A ``(oxford_word_count, total_words)`` tuple.
    """
    tokens = re.findall(r'\b\w+\b', text.lower())
    matched = [token for token in tokens if token in oxford_words]
    return len(matched), len(tokens)


def calculate_ratio(oxford_word_count, total_words):
    """Return the share of Oxford words among all words.

    Args:
        oxford_word_count: Number of tokens found in the Oxford list.
        total_words: Total number of tokens.

    Returns:
        ``oxford_word_count / total_words``, or ``0`` when there are no
        words at all (avoids dividing by zero).
    """
    return 0 if total_words == 0 else oxford_word_count / total_words


def load_oxford_words(file_path):
    """Load a word list file into a dictionary keyed by word.

    Each usable line must hold at least three whitespace-separated fields,
    read in order as word, part of speech and level; any further fields
    are ignored. Blank lines and lines with fewer than three fields are
    skipped instead of raising ``IndexError``.

    Args:
        file_path: Path of a UTF-8 encoded text file.

    Returns:
        A dict mapping each word to ``{'pos': ..., 'level': ...}``. When a
        word appears on several lines, the last one wins.

    Raises:
        OSError: If the file cannot be opened.
    """
    oxford_words = {}
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            parts = line.split()
            if len(parts) < 3:
                # Blank or malformed entry: nothing usable to record.
                continue
            word, pos, level = parts[:3]
            oxford_words[word] = {'pos': pos, 'level': level}
    return oxford_words


def total_number_of_essays():
    """Return the article count reported by ``get_number_of_articles``."""
    article_count = get_number_of_articles()
    return article_count


def get_article_title(s):
    """Return the first line of *s*, i.e. everything before the first newline.

    If *s* contains no newline, the whole string is returned.
    """
    title, _, _ = s.partition('\n')
    return title


def get_article_body(s):
    """Return *s* without its first line (the title).

    Everything after the first newline is returned unchanged; a string
    without any newline yields an empty body.
    """
    _, _, body = s.partition('\n')
    return body


def get_today_article(user_word_list, visited_articles):
    """Select the article for the current position in the reading history.

    When ``visited_articles["index"]`` points past the end of
    ``visited_articles["article_ids"]``, a new, not yet visited article is
    searched whose estimated difficulty lies above the user's level by at
    most ``(8.0 - user_level) * factor``, with ``factor`` drawn from
    ``random.gauss(0.8, 0.1)``; up to three passes over the shuffled
    articles are made. Otherwise the article already recorded at that
    index is loaded again.

    Args:
        user_word_list: Path handed to ``load_freq_history`` to load the
            user's unknown-word record.
        visited_articles: Dict with the keys ``"index"`` and
            ``"article_ids"``, or ``None`` to start a fresh history. The
            dict and its id list are modified in place.

    Returns:
        A ``(visited_articles, today_article, result_of_generate_article)``
        tuple. ``today_article`` is a dict describing the chosen article or
        ``None``; the last element is ``"found"``, ``"not found"`` or
        ``"had read all articles"``.
    """
    if visited_articles is None:
        visited_articles = {
            "index": 0,
            "article_ids": []
        }

    article_ids = visited_articles["article_ids"]

    # A 'null' entry is the placeholder appended after an unsuccessful
    # search. Drop it and step back one position. The index is clamped at 0
    # so that a history consisting only of the placeholder falls through to
    # a fresh search instead of indexing an empty list.
    if (visited_articles["index"] <= len(article_ids) - 1
            and article_ids[visited_articles["index"]] == 'null'):
        article_ids.pop()
        visited_articles["index"] = max(visited_articles["index"] - 1, 0)

    needs_new_article = visited_articles["index"] > len(article_ids) - 1
    if needs_new_article:
        result = get_all_articles()
    else:
        article_id = article_ids[visited_articles["index"]]
        result = get_article_by_id(article_id)

    random.shuffle(result)

    # Initialise the VocabularyLevelEstimator.
    vocabularyLevelEstimator = VocabularyLevelEstimator(
        word_vectors_path='./wiki-news-300d-1M.vec',
        words_and_tests_pickle=path_prefix + 'static/words_and_tests.p'
    )

    # Load the record of words the user does not know.
    d_user_unknown_words = load_freq_history(user_word_list)

    # Estimate the user's vocabulary level from those words.
    user_level = vocabularyLevelEstimator.estimate_user_vocabulary_level(d_user_unknown_words.keys())

    d = None
    result_of_generate_article = "not found"
    text_level = 0

    if needs_new_article:
        if len(article_ids) == len(result):
            result_of_generate_article = "had read all articles"
        else:
            # Each pass draws fresh random factors, so an article rejected
            # in one pass may still be accepted in a later one.
            for _ in range(3):
                for reading in result:
                    # Check the history first so no difficulty estimate is
                    # computed for articles that cannot be chosen anyway.
                    if reading['article_id'] in article_ids:
                        continue
                    text_level = vocabularyLevelEstimator.estimate_text_difficulty(reading['text'])
                    factor = random.gauss(0.8, 0.1)
                    if within_range(text_level, user_level, (8.0 - user_level) * factor):
                        d = reading
                        article_ids.append(d['article_id'])
                        result_of_generate_article = "found"
                        break
                if result_of_generate_article == "found":
                    break
        if result_of_generate_article != "found":
            # Record the failed attempt so the history keeps a slot for it.
            article_ids.append('null')
    else:
        d = random.choice(result)
        text_level = vocabularyLevelEstimator.estimate_text_difficulty(d['text'])
        result_of_generate_article = "found"

    today_article = None
    if d:
        oxford_words = load_oxford_words(oxford_words_path)
        oxford_word_count, total_words = count_oxford_words(d['text'], oxford_words)
        ratio = calculate_ratio(oxford_word_count, total_words)
        today_article = {
            "user_level": '%4.1f' % user_level,
            "text_level": '%4.1f' % text_level,
            "date": d['date'],
            "article_title": get_article_title(d['text']),
            "article_body": get_article_body(d['text']),
            "source": d["source"],
            "question": get_question_part(d['question']),
            "answer": get_answer_part(d['question']),
            "ratio": ratio
        }

    return visited_articles, today_article, result_of_generate_article


def load_freq_history(path):
    """Load the record stored at *path* via ``pickle_idea.load_record``.

    Returns an empty dict when *path* does not exist.
    """
    if not os.path.exists(path):
        return {}
    return pickle_idea.load_record(path)


def within_range(x, y, r):
    """Tell whether *x* is strictly greater than *y* by at most *r*."""
    if not x > y:
        return False
    return abs(x - y) <= r


def get_question_part(s):
    """Extract the question section from a combined question/answer text.

    Lines are stripped of surrounding whitespace. Collection starts at a
    line equal to ``QUESTION`` (that marker line is kept) and stops at a
    line equal to ``ANSWER`` (that marker line is dropped).

    Returns:
        The collected lines joined with newlines.
    """
    collected = []
    inside_question = False
    for stripped in map(str.strip, s.split('\n')):
        if stripped == 'ANSWER':
            inside_question = False
            continue
        if stripped == 'QUESTION':
            inside_question = True
        if inside_question:
            collected.append(stripped)
    return '\n'.join(collected)


def get_answer_part(s):
    """Extract the answer section from a combined question/answer text.

    Lines are stripped of surrounding whitespace. Everything after the
    first line equal to ``ANSWER`` is returned; the marker itself and any
    later ``ANSWER`` lines are left out.

    Returns:
        The collected lines joined with newlines, or an empty string when
        there is no ``ANSWER`` marker.
    """
    stripped_lines = [line.strip() for line in s.split('\n')]
    if 'ANSWER' not in stripped_lines:
        return ''
    first_after_marker = stripped_lines.index('ANSWER') + 1
    return '\n'.join(
        line for line in stripped_lines[first_after_marker:] if line != 'ANSWER'
    )