bc1d56d1a4
P0.6: gcov infrastructure P1: extract_structure output expansion (11 new feature fields) P2: Confusion group rule engine (8 pairs + contradiction + backtrack) P3: 4-factor confidence calculation + quality gate update P4: 33+2 COBOL program type test samples (22 files, 7 categories) P5: parametrized/ test data generation engine P6: japanese_data.py lookup tables P7-10: Type-specific test suites (~159 parametrized tests) P11: Full classification pipeline (classify_program) + orchestrator integration P12: Documentation (module-interfaces, test-plan v3.0, coverage-matrix) Architecture decisions: - classification_pipeline/ merged to hina/pipeline/ - parametrized/ as independent module - japanese_data.py as root-level file - hina/__all__ only exports classify_program() Co-Authored-By: Claude <noreply@anthropic.com>
200 lines
7.3 KiB
Python
200 lines
7.3 KiB
Python
"""Phase 7: 匹配系测试 — 基于 parametrized 生成匹配数据。
|
|
|
|
测试覆盖:
|
|
- 1:1 / 1:N / N:1 基本匹配(含内容校验)
|
|
- 不平衡场景(主 > 从 / 从 > 主)
|
|
- gcov 验证入口(需要 cobc 环境)
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import pytest
|
|
|
|
from parametrized import generate_matching_data, generate_keybreak_data
|
|
|
|
|
|
# ============================================================
# 1:1 matching
# ============================================================
|
|
|
|
class TestMatchingOneToOne:
    """1:1 matching: each main record hits at most one sub record.

    Every test calls ``generate_matching_data("1:1", main_count, sub_count,
    match_ratio)`` and inspects the returned ``(main, sub)`` record lists.
    """

    def test_1to1_equal_counts_all_matched(self):
        """Equal counts at ratio 1.0 produce identical KEY sets on both sides."""
        main_recs, sub_recs = generate_matching_data("1:1", 10, 10, 1.0)
        assert len(main_recs) == 10
        assert len(sub_recs) == 10
        keys_of_main = {rec["KEY"] for rec in main_recs}
        keys_of_sub = {rec["KEY"] for rec in sub_recs}
        assert keys_of_main == keys_of_sub, "全部匹配时主从 KEY 集合应一致"

    def test_1to1_equal_counts_partial_50(self):
        """Ratio 0.5 over 10/10 records leaves 5 sub records with a MAIN key."""
        main_recs, sub_recs = generate_matching_data("1:1", 10, 10, 0.5)
        assert len(main_recs) == 10
        assert len(sub_recs) == 10
        # Matched sub records are recognised by their "MAIN" key prefix.
        hits = [rec for rec in sub_recs if rec["KEY"].startswith("MAIN")]
        assert len(hits) == 5, "50% 匹配应有 5 条从件命中"

    def test_1to1_unbalanced_main_more(self):
        """With 20 main and 5 sub records, only 5 main records find a partner."""
        main_recs, sub_recs = generate_matching_data("1:1", 20, 5, 1.0)
        assert len(main_recs) == 20
        assert len(sub_recs) == 5
        keys_of_sub = {rec["KEY"] for rec in sub_recs}
        hits = [rec for rec in main_recs if rec["KEY"] in keys_of_sub]
        assert len(hits) == 5, "主件多于从件时最多只能匹配从件数"

    def test_1to1_unbalanced_sub_more(self):
        """With 5 main and 20 sub records, only 5 sub records carry a MAIN key."""
        main_recs, sub_recs = generate_matching_data("1:1", 5, 20, 1.0)
        assert len(main_recs) == 5
        assert len(sub_recs) == 20
        hits = [rec for rec in sub_recs if rec["KEY"].startswith("MAIN")]
        assert len(hits) == 5, "从件多于主件时最多只能匹配主件数"

    def test_1to1_no_match(self):
        """Ratio 0.0 yields main and sub KEY sets with no overlap."""
        main_recs, sub_recs = generate_matching_data("1:1", 10, 10, 0.0)
        keys_of_main = {rec["KEY"] for rec in main_recs}
        keys_of_sub = {rec["KEY"] for rec in sub_recs}
        assert keys_of_main.isdisjoint(keys_of_sub), "ratio=0 时主从 KEY 应无交集"

    def test_1to1_ratio_boundary(self):
        """Boundary values: match_ratio=0.0 and match_ratio=1.0."""
        none_main, none_sub = generate_matching_data("1:1", 5, 5, 0.0)
        full_main, full_sub = generate_matching_data("1:1", 5, 5, 1.0)

        # Lower boundary: no key is shared between the two sides.
        none_main_keys = {rec["KEY"] for rec in none_main}
        none_sub_keys = {rec["KEY"] for rec in none_sub}
        assert none_main_keys.isdisjoint(none_sub_keys)

        # Upper boundary: both sides hold exactly the same keys.
        full_main_keys = {rec["KEY"] for rec in full_main}
        full_sub_keys = {rec["KEY"] for rec in full_sub}
        assert full_main_keys == full_sub_keys

    def test_1to1_content_integrity(self):
        """Every main and sub record exposes the KEY, DATA and SEQ fields."""
        main_recs, sub_recs = generate_matching_data("1:1", 5, 5, 1.0)
        required_fields = ("KEY", "DATA", "SEQ")
        # Main records are checked first, then sub records, field by field.
        for records in (main_recs, sub_recs):
            for rec in records:
                for field_name in required_fields:
                    assert field_name in rec
|
|
|
|
|
|
# ============================================================
# 1:N matching
# ============================================================
|
|
|
|
class TestMatchingOneToMany:
    """1:N matching: one main record may hit several sub records."""

    def test_1toN_one_main_many_sub(self):
        """A single main record is the match target of all ten sub records."""
        main_recs, sub_recs = generate_matching_data("1:N", 1, 10, 1.0)
        assert len(main_recs) == 1
        assert len(sub_recs) == 10
        assert main_recs[0]["KEY"] == "MAIN-0000"
        # Stops at the first sub record pointing anywhere else.
        for rec in sub_recs:
            assert rec["KEY"] == "MAIN-0000", "全部从件应匹配同一主件"

    def test_1toN_mixed_unmatched(self):
        """Ratio 0.6 produces both MAIN-keyed and UNMATCHED-keyed sub records."""
        main_recs, sub_recs = generate_matching_data("1:N", 5, 10, 0.6)
        assert len(main_recs) == 5
        assert len(sub_recs) == 10
        hit_count = sum(1 for rec in sub_recs if rec["KEY"].startswith("MAIN"))
        miss_count = sum(
            1 for rec in sub_recs if rec["KEY"].startswith("UNMATCHED")
        )
        assert hit_count > 0
        assert miss_count > 0

    def test_1toN_all_main_unmatched(self):
        """Ratio 0.0 gives every sub record an UNMATCHED-prefixed KEY."""
        _, sub_recs = generate_matching_data("1:N", 5, 10, 0.0)
        for rec in sub_recs:
            assert rec["KEY"].startswith("UNMATCHED")
|
|
|
|
|
|
# ============================================================
# N:1 matching
# ============================================================
|
|
|
|
class TestMatchingManyToOne:
    """N:1 matching: one sub record may hit several main records."""

    def test_Nto1_many_main_one_sub(self):
        """The single sub record has a MAIN key found in at least one main record."""
        main_recs, sub_recs = generate_matching_data("N:1", 10, 1, 1.0)
        assert len(main_recs) == 10
        assert len(sub_recs) == 1
        target_key = sub_recs[0]["KEY"]
        assert target_key.startswith("MAIN")
        hits = [rec for rec in main_recs if rec["KEY"] == target_key]
        assert len(hits) >= 1

    def test_Nto1_unbalanced(self):
        """100 main vs. 20 sub records at ratio 0.5 keep the requested sizes."""
        main_recs, sub_recs = generate_matching_data("N:1", 100, 20, 0.5)
        assert len(main_recs) == 100
        assert len(sub_recs) == 20
        hits = [rec for rec in sub_recs if rec["KEY"].startswith("MAIN")]
        # NOTE(review): this bound already follows from len(sub_recs) == 20;
        # consider asserting the exact expected count once the generator's
        # N:1 ratio semantics are confirmed.
        assert len(hits) <= 20

    def test_Nto1_all_unmatched(self):
        """Ratio 0.0 leaves no main KEY present among the sub KEYs."""
        main_recs, sub_recs = generate_matching_data("N:1", 10, 5, 0.0)
        keys_of_sub = {rec["KEY"] for rec in sub_recs}
        assert keys_of_sub.isdisjoint(rec["KEY"] for rec in main_recs)
|
|
|
|
|
|
# ============================================================
# KEY control break
# ============================================================
|
|
|
|
class TestKeybreak:
    """Control break triggered by a change of KEY value / AT END / BREAK."""

    def test_keybreak_three_groups(self):
        """Three groups of two records come back grouped as KEY-A, KEY-B, KEY-C."""
        records = generate_keybreak_data(3, 2)
        assert len(records) == 6
        expected_keys = [
            "KEY-A", "KEY-A",
            "KEY-B", "KEY-B",
            "KEY-C", "KEY-C",
        ]
        assert [rec["KEY"] for rec in records] == expected_keys

    def test_keybreak_many_groups(self):
        """Ten single-record groups yield ten distinct KEY values."""
        records = generate_keybreak_data(10, 1)
        assert len(records) == 10
        distinct_keys = {rec["KEY"] for rec in records}
        assert len(distinct_keys) == 10

    def test_keybreak_field_accumulate(self):
        """In "accumulate" mode FIELD takes the values checked below."""
        records = generate_keybreak_data(3, 2, "accumulate")
        # Record index -> expected FIELD value, checked in this order.
        expected_by_index = {0: 101, 1: 102, 2: 201, 5: 302}
        for index, expected in expected_by_index.items():
            assert records[index]["FIELD"] == expected

    def test_keybreak_field_aggregate(self):
        """In "aggregate" mode each group of three shares one FIELD value."""
        records = generate_keybreak_data(3, 3, "aggregate")
        group_size = 3
        # (start offset of the group, FIELD value expected across that group)
        for start, expected in ((0, 100), (3, 200), (6, 300)):
            group = records[start:start + group_size]
            assert all(rec["FIELD"] == expected for rec in group)

    def test_keybreak_field_mark(self):
        """In "mark" mode the four groups carry MARK-A through MARK-D."""
        records = generate_keybreak_data(4, 1, "mark")
        expected_marks = ["MARK-A", "MARK-B", "MARK-C", "MARK-D"]
        assert [rec["FIELD"] for rec in records] == expected_marks
|
|
|
|
|
|
# ============================================================
# gcov verification (optional, requires cobc)
# ============================================================
|
|
|
|
class TestGcovVerification:
    """gcov verification; the real run requires the cobc compiler."""

    @pytest.mark.skip(reason="需要 cobc 编译器才能运行真实的 gcov 验证")
    def test_gcov_with_cobc(self):
        """Placeholder for gcov coverage verification on real COBOL builds.

        The test is skipped unconditionally by the decorator, so the body is
        never executed by pytest. The former in-body ``pytest.skip(...)`` call
        was unreachable and carried a second, conflicting reason string, so it
        has been removed; the decorator's reason is the one reported.
        """

    def test_gcov_coverage_data_structure(self):
        """Check the record structure gcov relies on, without needing cobc.

        Builds records from a two-field definition and verifies that a single
        record comes back holding both fields, with AMOUNT equal to 0.
        """
        # Imported locally, as in the original test body.
        from parametrized.common import generate_minimal_records

        field_specs = [
            {"name": "KEY", "type": "string", "length": 10},
            {"name": "AMOUNT", "type": "numeric"},
        ]
        records = generate_minimal_records(field_specs)
        assert len(records) == 1

        first_record = records[0]
        assert "KEY" in first_record
        assert "AMOUNT" in first_record
        assert first_record["AMOUNT"] == 0
|