# EnglishPal/app/test_estimator.py

import pytest
from vocabulary import Vocabulary

# Sample word -> difficulty-level mapping shared by the tests in this module.
# The trailing comment on each entry is the original author's label for that level.
sample_word_difficulty_dict = {
    'apple': 4,  # CET4
    'banana': 6,  # CET6
    'education': 5,  # OXFORD3000
    'intelligent': 7,  # OXFORD5000
    'BBC': 8  # BBC
}

# Module-level Vocabulary instance built from the sample mapping; the tests
# below query it through get_word_difficulty() and get_text_difficulty().
estimator = Vocabulary(sample_word_difficulty_dict)

# Tests: normal input
def test_get_word_difficulty():
    """Each known word must report the level stored for it in the sample mapping."""
    expected_levels = (
        ('apple', 4),
        ('banana', 6),
        ('education', 5),
        ('intelligent', 7),
        ('BBC', 8),
    )
    # Known words are looked up directly through get_word_difficulty().
    for word, level in expected_levels:
        assert estimator.get_word_difficulty(word) == level

def test_get_text_difficulty():
    """Check get_text_difficulty() on a paragraph made of the five sample words."""
    levels = (4, 6, 5, 7, 8)
    # Offset subtracted from the arithmetic mean of the five levels; the original
    # author describes the resulting value as the geometric-mean difficulty.
    # NOTE(review): mean - offset is roughly 4.7894, which numerically matches the
    # geometric mean of (4, 6, 5, 7, 3) rather than of the levels above - confirm
    # against Vocabulary.get_text_difficulty how 'BBC' is scored in running text.
    offset = 1.2106110468130113
    expected_avg_difficulty = sum(levels) / len(levels) - offset

    avg_difficulty = estimator.get_text_difficulty('apple banana education intelligent BBC')

    # Allow a small absolute tolerance on the comparison.
    assert abs(avg_difficulty - expected_avg_difficulty) < 1e-2


# Tests: boundary input
def test_empty_paragraph():
    """An empty paragraph is expected to score a difficulty of 0."""
    assert estimator.get_text_difficulty('') == 0


def test_single_word():
    """A one-word paragraph is expected to score that word's own level."""
    only_word = 'apple'
    # 'apple' is mapped to level 4 in the sample dictionary.
    assert estimator.get_text_difficulty(only_word) == 4


# Tests: unexpected input
def test_word_not_in_dict():
    """A word missing from the mapping is expected to get the default level 3."""
    fallback_level = 3
    reported = estimator.get_word_difficulty('unknown_word')
    assert reported == fallback_level


def test_paragraph_with_unknown_words():
    """Average the per-word levels of a paragraph that contains an unknown word.

    Every word, including the unknown one, is looked up through the estimator,
    so the fallback level that gets averaged is the one the estimator reports
    rather than a literal supplied by the test itself.
    """
    paragraph = 'apple banana unknown_word'

    difficulties = [estimator.get_word_difficulty(word) for word in paragraph.split()]
    avg_difficulty = sum(difficulties) / len(difficulties)
    # 'unknown_word' is expected to contribute the default level 3.
    expected_avg_difficulty = (4 + 6 + 3) / 3

    assert abs(avg_difficulty - expected_avg_difficulty) < 1e-2


# Additional test cases

# 1. A large block of text
def test_large_paragraph():
    """Average the per-word levels of the five sample words repeated 1000 times."""
    base_words = ['apple', 'banana', 'education', 'intelligent', 'BBC']
    paragraph = ' '.join(base_words * 1000)

    # Every token is one of the sample words, so no fallback branch is needed:
    # each lookup goes straight to the estimator.
    difficulties = [estimator.get_word_difficulty(word) for word in paragraph.split()]
    avg_difficulty = sum(difficulties) / len(difficulties)
    # Repeating the same five words leaves their arithmetic mean unchanged.
    expected_avg_difficulty = (4 + 6 + 5 + 7 + 8) / 5

    assert abs(avg_difficulty - expected_avg_difficulty) < 1e-2


# 2. Words that carry more than one difficulty tag
sample_word_difficulty_dict_multiple = {
    'apple': 5,  # OXFORD3000
    'banana': 8,  # BBC
    'education': 5,  # OXFORD3000
    'intelligent': 7,  # OXFORD5000
    'BBC': 8,  # BBC
}


def test_multiple_difficulty_tags():
    """Check the levels recorded in sample_word_difficulty_dict_multiple.

    NOTE(review): only the dictionary literal above is inspected; nothing here
    calls Vocabulary, so the "keep the highest tag" rule described by the
    original author is not exercised by this test.
    """
    expected_levels = (
        ('apple', 5),        # author's note: in CET4 and OXFORD3000, higher level kept
        ('banana', 8),       # BBC, the highest level
        ('education', 5),    # OXFORD3000
        ('intelligent', 7),  # OXFORD5000
        ('BBC', 8),          # BBC
    )
    for word, level in expected_levels:
        assert sample_word_difficulty_dict_multiple[word] == level


# 3. All words share the same difficulty
def test_all_words_same_difficulty():
    """Check that five words all mapped to level 4 average out to exactly 4.

    NOTE(review): the mapping is read directly and is never passed to
    Vocabulary, so this test checks plain dict arithmetic only.
    """
    words = ('apple', 'banana', 'education', 'intelligent', 'BBC')
    difficulty_dict = dict.fromkeys(words, 4)

    for word in words:
        assert difficulty_dict[word] == 4

    paragraph = 'apple banana education intelligent BBC'
    # A token absent from the mapping would fall back to level 3.
    levels = [difficulty_dict.get(token, 3) for token in paragraph.split()]

    assert sum(levels) / len(levels) == 4


# 4. Longer text mixing known and unknown words
def test_mixed_difficulty_text():
    """Average the per-word levels of a paragraph mixing known and unknown words.

    All words are passed to the estimator, so the two unknown words contribute
    whatever fallback level the estimator reports instead of a literal chosen
    by the test.
    """
    paragraph = 'apple banana unknown_word random_word BBC intelligent education'

    difficulties = [estimator.get_word_difficulty(word) for word in paragraph.split()]
    avg_difficulty = sum(difficulties) / len(difficulties)
    # The two unknown words are each expected to contribute the default level 3.
    expected_avg_difficulty = (4 + 6 + 3 + 3 + 8 + 7 + 5) / 7

    assert abs(avg_difficulty - expected_avg_difficulty) < 1e-2


# 5. Repeated get_word_difficulty calls for the same word
def test_repeated_get_word_difficulty():
    """Two consecutive lookups of the same word must report the same level."""
    word = 'banana'
    difficulty_first = estimator.get_word_difficulty(word)
    difficulty_second = estimator.get_word_difficulty(word)

    # The estimator must be consistent across calls.
    assert difficulty_first == difficulty_second


# 6. Default level for unknown words
def test_default_difficulty_for_unknown_words():
    """A word absent from the sample mapping is expected to get level 3."""
    unknown_word = 'xyz'
    difficulty = estimator.get_word_difficulty(unknown_word)

    # 3 is the default level asserted throughout this module.
    assert difficulty == 3


# 7. A more complex paragraph
def test_complex_paragraph_difficulty():
    """Average the per-word levels of a paragraph with known and unknown words.

    Each word is looked up through the estimator so its fallback for unknown
    words is part of what is checked.

    NOTE(review): this uses the same paragraph and expectation as
    test_mixed_difficulty_text; consider differentiating or merging the two.
    """
    paragraph = 'apple banana unknown_word random_word BBC intelligent education'

    difficulties = [estimator.get_word_difficulty(word) for word in paragraph.split()]
    avg_difficulty = sum(difficulties) / len(difficulties)
    # Unknown words are expected to contribute the default level 3 each.
    expected_avg_difficulty = (4 + 6 + 3 + 3 + 8 + 7 + 5) / 7

    assert abs(avg_difficulty - expected_avg_difficulty) < 1e-2


# 8. Special characters and punctuation
def test_paragraph_with_punctuation():
    """Average per-word levels after the test strips punctuation from each token.

    NOTE(review): punctuation is removed here, before the estimator is called,
    so this does not show how the estimator itself treats punctuation.
    """
    paragraph = 'apple, banana; education! intelligent... BBC?'

    # Split on whitespace, then trim leading/trailing punctuation from each
    # token; punctuation inside a token would be left in place.
    word_list = [token.strip('.,;!?') for token in paragraph.split()]

    # After stripping, every token is one of the sample words, so each lookup
    # goes straight to the estimator.
    difficulties = [estimator.get_word_difficulty(word) for word in word_list]
    avg_difficulty = sum(difficulties) / len(difficulties)
    expected_avg_difficulty = (4 + 6 + 5 + 7 + 8) / 5

    assert abs(avg_difficulty - expected_avg_difficulty) < 1e-2


# Run this module's tests when the file is executed directly.
if __name__ == '__main__':
    # pytest.main() returns an exit code instead of exiting the process, so it
    # is forwarded here to give a failing run a non-zero status. Passing
    # __file__ limits collection to this module.
    raise SystemExit(pytest.main([__file__]))