2022-01-26 21:10:09 +08:00
|
|
|
|
import os
|
2023-06-04 00:35:43 +08:00
|
|
|
|
import random
|
|
|
|
|
import pickle_idea
|
2023-05-18 23:29:38 +08:00
|
|
|
|
from difficulty import get_difficulty_level_for_user, text_difficulty_level, user_difficulty_level
|
2023-06-04 00:35:43 +08:00
|
|
|
|
from UseSqlite import RecordQuery
|
2022-01-26 21:10:09 +08:00
|
|
|
|
|
|
|
|
|
# Root directory of the application when deployed.
path_prefix = '/var/www/wordfreq/wordfreq/'
# Local override: this second assignment wins while it is present, so the
# deployed path above only takes effect once this line is commented out.
path_prefix = './' # comment this line in deployment
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def total_number_of_essays():
    '''
    Count the articles stored in the database.

    return: number of rows returned by selecting everything from the
            article table
    '''
    query = RecordQuery(path_prefix + 'static/wordfreqapp.db')
    query.instructions("SELECT * FROM article")
    query.do()
    # Every row is fetched; the total is simply how many rows came back.
    return len(query.get_results())
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_article_title(s):
    '''
    Return the title of an article, i.e. the first line of its text.

    s: full article text; everything before the first newline is the title
    '''
    title, _, _ = s.partition('\n')
    return title
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_article_body(s):
    '''
    Return the body of an article, i.e. its text without the first line.

    s: full article text; everything after the first newline is the body
    '''
    # The first line is the title; keep only what follows it.
    _, _, body = s.partition('\n')
    return body
|
|
|
|
|
|
|
|
|
|
|
2023-04-25 17:47:51 +08:00
|
|
|
|
def get_today_article(user_word_list, visited_articles):
    '''
    Choose the article to display from the user's word list and reading history.

    user_word_list: path of the user's saved word record (it is passed to
        load_freq_history)
    visited_articles: dict with the keys "index" (position inside
        "article_ids") and "article_ids" (ids of the articles shown so far,
        newest last; the string 'null' marks an attempt that produced no
        article), or None to start a fresh history. The dict is updated
        in place.

    return: tuple (visited_articles, today_article, result_of_generate_article)
        today_article is a dict with the article's details, or None when
        no article could be selected; result_of_generate_article is one of
        "found", "not found" or "had read all articles".
    '''
    rq = RecordQuery(path_prefix + 'static/wordfreqapp.db')
    if visited_articles is None:
        visited_articles = {
            "index": 0,  # position inside article_ids
            "article_ids": []  # ids of the articles shown so far, newest last
        }
    article_ids = visited_articles["article_ids"]

    # Refreshing the page can leave the index pointing at a 'null'
    # placeholder; in that case behave like a "previous article" request.
    if (visited_articles["index"] <= len(article_ids) - 1
            and article_ids[visited_articles["index"]] == 'null'):
        visited_articles["index"] -= 1
        article_ids.pop()
        if visited_articles["index"] < 0:
            # There is no previous article to step back to. Restart at 0
            # instead of indexing the list with -1, which used to raise
            # IndexError once the list had become empty.
            visited_articles["index"] = 0

    # An index past the end of the history means a new article is wanted;
    # otherwise an article that was already shown is displayed again.
    generate_new = visited_articles["index"] > len(article_ids) - 1

    if generate_new:
        rq.instructions("SELECT * FROM article")
    else:
        # NOTE(review): the id is interpolated straight into the SQL text.
        # If visited_articles can be influenced by the client, switch to a
        # parameterized query once RecordQuery's parameter API is confirmed.
        rq.instructions(
            f'SELECT * FROM article WHERE article_id='
            f'{article_ids[visited_articles["index"]]}'
        )
    rq.do()
    result = rq.get_results()
    random.shuffle(result)

    # Choose article according to reader's level
    d1 = load_freq_history(path_prefix + 'static/frequency/frequency.p')
    d2 = load_freq_history(path_prefix + 'static/words_and_tests.p')
    d3 = get_difficulty_level_for_user(d1, d2)

    d = None
    result_of_generate_article = "not found"
    d_user = load_freq_history(user_word_list)
    # More could be considered here: user behaviour changes over time, so a
    # time factor should eventually be taken into account.
    user_level = user_difficulty_level(d_user, d3)
    text_level = 0
    if generate_new:
        # Every existing article has been read when the history is as long
        # as the list of articles in the database.
        if len(article_ids) == len(result):
            result_of_generate_article = "had read all articles"
        else:
            for _ in range(3):  # try at most 3 times
                for reading in result:
                    # Only articles that were not shown before qualify.
                    if reading['article_id'] in article_ids:
                        continue
                    text_level = text_difficulty_level(reading['text'], d3)
                    # Random factor drawn from a Gaussian distribution with
                    # mean 0.8 and standard deviation 0.1.
                    factor = random.gauss(0.8, 0.1)
                    # The article must be harder than the user's level, but
                    # only within the allowed margin.
                    if within_range(text_level, user_level, (8.0 - user_level) * factor):
                        d = reading
                        article_ids.append(d['article_id'])  # remember the new article
                        result_of_generate_article = "found"
                        break
                if result_of_generate_article == "found":  # leave the retry loop as well
                    break
        # All articles read, or three passes without a suitable article:
        # record a 'null' placeholder in the history.
        if result_of_generate_article != "found":
            article_ids.append('null')
    elif result:
        # Show an article that was read before; the query selected it by id.
        d = random.choice(result)
        text_level = text_difficulty_level(d['text'], d3)
        result_of_generate_article = "found"
    # else: the stored id matched no row, so the result stays "not found"
    # instead of random.choice raising IndexError on an empty list.

    today_article = None
    if d:
        today_article = {
            "user_level": f'{user_level:4.2f}',
            "text_level": f'{text_level:4.2f}',
            "date": d['date'],
            "article_title": get_article_title(d['text']),
            "article_body": get_article_body(d['text']),
            "source": d["source"],
            "question": get_question_part(d['question']),
            "answer": get_answer_part(d['question'])
        }

    return visited_articles, today_article, result_of_generate_article
|
2022-01-26 21:10:09 +08:00
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def load_freq_history(path):
    '''
    Load the record saved at path.

    path: location of the saved record file
    return: the record loaded through pickle_idea.load_record, or an empty
            dict when no file exists at path
    '''
    if not os.path.exists(path):
        return {}
    return pickle_idea.load_record(path)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def within_range(x, y, r):
    '''
    Tell whether x is greater than y and exceeds it by at most r
    (x > y and x - y <= r).
    '''
    if not x > y:
        return False
    gap = abs(x - y)
    return gap <= r
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_question_part(s):
    '''
    Extract the question section of a combined question/answer text.

    Lines are stripped of surrounding whitespace. Each 'QUESTION' marker
    line and the lines that follow it are kept, up to (and excluding) the
    next 'ANSWER' marker line.
    '''
    kept = []
    in_question = False
    for raw in s.strip().split('\n'):
        text = raw.strip()
        if text == 'QUESTION':
            # The marker itself is part of the output.
            in_question = True
            kept.append(text)
            continue
        if text == 'ANSWER':
            in_question = False
            continue
        if in_question:
            kept.append(text)
    return '\n'.join(kept)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_answer_part(s):
    '''
    Extract the answer section of a combined question/answer text.

    Lines are stripped of surrounding whitespace. Everything after the
    first 'ANSWER' marker line is kept, except further 'ANSWER' marker
    lines; the marker itself is never part of the output.
    '''
    lines = [raw.strip() for raw in s.strip().split('\n')]
    if 'ANSWER' not in lines:
        return ''
    first_answer_line = lines.index('ANSWER') + 1
    return '\n'.join(
        text for text in lines[first_answer_line:] if text != 'ANSWER'
    )
|