feat: Phase 2 complete — 13 Phases of COBOL type classification and test benchmark

P0.6: gcov infrastructure
P1: extract_structure output expansion (11 new feature fields)
P2: Confusion group rule engine (8 pairs + contradiction + backtrack)
P3: 4-factor confidence calculation + quality gate update
P4: 33+2 COBOL program type test samples (22 files, 7 categories)
P5: parametrized/ test data generation engine
P6: japanese_data.py lookup tables
P7-10: Type-specific test suites (~159 parametrized tests)
P11: Full classification pipeline (classify_program) + orchestrator integration
P12: Documentation (module-interfaces, test-plan v3.0, coverage-matrix)

Architecture decisions:
- classification_pipeline/ merged to hina/pipeline/
- parametrized/ as independent module
- japanese_data.py as root-level file
- hina/__all__ only exports classify_program()

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
hangshuo652
2026-06-19 23:51:55 +08:00
parent 63b5284715
commit bc1d56d1a4
129 changed files with 19378 additions and 261 deletions
+202
View File
@@ -0,0 +1,202 @@
"""Phase 8: SORT / MERGE 系测试 — 基于 parametrized 生成数据。
测试覆盖:
- SORT 排序正确性(升序 / 降序 / 多键 / 稳定性)
- MERGE 合并逻辑(均匀 / 不均 / 重复键)
"""
from __future__ import annotations
import pytest
from parametrized import generate_sorted_records, generate_duplicate_keys
# ── 排序辅助 ──
def _sort_descending(records: list[dict], key_field: str = "KEY") -> list[dict]:
    """Return a new list of ``records`` ordered from highest to lowest key.

    Args:
        records: Record dicts; every one must contain ``key_field``.
        key_field: Name of the dict entry used as the sort key.

    Returns:
        A freshly built list; ``records`` itself is not reordered.
    """

    def _key_of(record: dict):
        return record[key_field]

    # Copy first so the in-place sort never touches the caller's list.
    ordered = list(records)
    ordered.sort(key=_key_of, reverse=True)
    return ordered
def _sort_by_multiple_keys(
    records: list[dict],
    keys: list[str],
    ascending: bool = True,
) -> list[dict]:
    """Return a new list of ``records`` ordered by several fields at once.

    Args:
        records: Record dicts; every one must contain each name in ``keys``.
        keys: Field names in priority order (first entry is the major key).
        ascending: ``True`` for ascending order, ``False`` for descending.

    Returns:
        A new sorted list; ``records`` is left as it was.
    """

    def _composite_key(record: dict) -> tuple:
        # Tuples compare element by element, giving major-to-minor ordering.
        return tuple(record[name] for name in keys)

    descending = not ascending
    return sorted(records, key=_composite_key, reverse=descending)
def _merge_sorted(
    left: list[dict],
    right: list[dict],
    key_field: str = "KEY",
) -> list[dict]:
    """Merge two already-sorted record lists into one sorted list.

    Args:
        left: Records sorted ascending by ``key_field``.
        right: Records sorted ascending by ``key_field``.
        key_field: Name of the dict entry the lists are ordered by.

    Returns:
        A new list holding every record from both inputs. When keys are
        equal the record from ``left`` is emitted first.
    """
    merged: list[dict] = []
    li, ri = 0, 0
    left_len, right_len = len(left), len(right)

    while li < left_len and ri < right_len:
        left_rec, right_rec = left[li], right[ri]
        # "<=" (not "<") is what makes ties favour the left input.
        take_left = left_rec[key_field] <= right_rec[key_field]
        if take_left:
            merged.append(left_rec)
            li += 1
        else:
            merged.append(right_rec)
            ri += 1

    # At most one of these tails is non-empty once the loop ends.
    merged += left[li:]
    merged += right[ri:]
    return merged
# ============================================================
# SORT
# ============================================================
class TestSortAscending:
    """Single-key ordering checks on generated records."""

    def test_sort_basic_ascending(self):
        """Re-sorting generated records by KEY must leave them unchanged."""
        generated = generate_sorted_records(10)
        resorted = sorted(generated, key=lambda rec: rec["KEY"])
        assert resorted == generated, "generate_sorted_records 应已按 KEY 升序排列"

    def test_sort_descending(self):
        """Descending order puts the highest generated key first."""
        generated = generate_sorted_records(5)
        reversed_order = _sort_descending(generated)
        first, last = reversed_order[0], reversed_order[-1]
        assert first["KEY"] == "KEY-0004"
        assert last["KEY"] == "KEY-0000"

    def test_sort_single_record(self):
        """A request for one record yields exactly one, keyed KEY-0000."""
        only = generate_sorted_records(1)
        assert len(only) == 1
        assert only[0]["KEY"] == "KEY-0000"
class TestSortMultipleKeys:
    """Multi-key ordering via ``_sort_by_multiple_keys``.

    Inputs are deliberately out of order, and every key level is made to
    break at least one tie, so a helper that returned its input unchanged
    or ignored a trailing key would fail these tests.
    """

    def test_sort_two_keys(self):
        """Order by KEY, then by SUB within equal KEY."""
        records = [
            {"KEY": "K001", "SUB": "B", "DATA": "y"},
            {"KEY": "K002", "SUB": "A", "DATA": "z"},
            {"KEY": "K001", "SUB": "A", "DATA": "x"},
        ]
        sorted_recs = _sort_by_multiple_keys(records, ["KEY", "SUB"])
        assert [(r["KEY"], r["SUB"]) for r in sorted_recs] == [
            ("K001", "A"),
            ("K001", "B"),
            ("K002", "A"),
        ]
        # DATA travels with its record, so whole records moved, not just keys.
        assert [r["DATA"] for r in sorted_recs] == ["x", "y", "z"]

    def test_sort_two_keys_descending(self):
        """``ascending=False`` reverses the composite-key order."""
        records = [
            {"KEY": "K001", "SUB": "A", "DATA": "x"},
            {"KEY": "K002", "SUB": "A", "DATA": "z"},
            {"KEY": "K001", "SUB": "B", "DATA": "y"},
        ]
        sorted_recs = _sort_by_multiple_keys(
            records, ["KEY", "SUB"], ascending=False
        )
        assert [(r["KEY"], r["SUB"]) for r in sorted_recs] == [
            ("K002", "A"),
            ("K001", "B"),
            ("K001", "A"),
        ]

    def test_sort_three_keys(self):
        """Order by KEY, SUB, then TERT; TERT must break a KEY+SUB tie."""
        records = [
            {"KEY": "K002", "SUB": "A", "TERT": "Y"},
            # These two tie on KEY and SUB and appear in the wrong TERT order,
            # so only a sort that honours the third key puts X before Z.
            {"KEY": "K001", "SUB": "A", "TERT": "Z"},
            {"KEY": "K001", "SUB": "B", "TERT": "Y"},
            {"KEY": "K001", "SUB": "A", "TERT": "X"},
        ]
        sorted_recs = _sort_by_multiple_keys(records, ["KEY", "SUB", "TERT"])
        assert [(r["KEY"], r["SUB"], r["TERT"]) for r in sorted_recs] == [
            ("K001", "A", "X"),
            ("K001", "A", "Z"),
            ("K001", "B", "Y"),
            ("K002", "A", "Y"),
        ]
class TestSortDuplicates:
    """Sorting when several records share a key."""

    def test_sort_with_duplicate_keys(self):
        """Records duplicated per key are ordered by SEQ within equal KEY."""
        originals = generate_sorted_records(5)
        doubled = generate_duplicate_keys(originals)
        assert len(doubled) == 10

        ordered = sorted(doubled, key=lambda rec: (rec["KEY"], rec["SEQ"]))
        head, runner_up = ordered[0], ordered[1]
        assert head["KEY"] == runner_up["KEY"]  # same KEY
        assert head["SEQ"] < runner_up["SEQ"]

    def test_sort_duplicate_all_same_key(self):
        """With one shared KEY, sorting on SEQ restores creation order."""
        pool = [{"KEY": "SAME", "DATA": str(i), "SEQ": i} for i in range(5)]
        scrambled = [pool[idx] for idx in (3, 0, 2, 4, 1)]
        restored = sorted(scrambled, key=lambda rec: rec["SEQ"])
        data_column = [rec["DATA"] for rec in restored]
        assert data_column == ["0", "1", "2", "3", "4"]
class TestSortEdgeCases:
    """Boundary conditions for sorting and record generation."""

    def test_sort_empty(self):
        """Sorting an empty list yields an empty list."""
        nothing: list[dict] = []
        assert sorted(nothing, key=lambda rec: rec.get("KEY", "")) == []

    def test_sort_invalid_count(self):
        """A record count of zero is rejected with a ValueError."""
        with pytest.raises(ValueError, match="record_count"):
            generate_sorted_records(0)

    def test_sort_custom_key_field(self):
        """``key_field`` renames the key entry of every generated record."""
        generated = generate_sorted_records(3, key_field="MYKEY")
        for rec in generated:
            assert "MYKEY" in rec
        key_column = [rec["MYKEY"] for rec in generated]
        assert key_column == ["KEY-0000", "KEY-0001", "KEY-0002"]
# ============================================================
# MERGE
# ============================================================
class TestMergeBasic:
    """Basic merge scenarios."""

    def test_merge_two_equal_files(self):
        """Two equally sized inputs merge into one ordered list of all records."""
        first_file = generate_sorted_records(5)
        second_file = generate_sorted_records(5)
        combined = _merge_sorted(first_file, second_file)
        assert len(combined) == 10
        merged_keys = [rec["KEY"] for rec in combined]
        assert merged_keys == sorted(merged_keys)

    def test_merge_one_empty(self):
        """Merging with an empty right side returns the left records."""
        populated = generate_sorted_records(3)
        combined = _merge_sorted(populated, [])
        assert len(combined) == 3
        assert combined == populated

    def test_merge_both_empty(self):
        """Merging two empty inputs gives an empty result."""
        assert _merge_sorted([], []) == []
class TestMergeUneven:
    """Merging inputs of different lengths."""

    @staticmethod
    def _merge_generated(left_count: int, right_count: int) -> list[dict]:
        """Generate two record lists of the given sizes and merge them."""
        left_file = generate_sorted_records(left_count)
        right_file = generate_sorted_records(right_count)
        return _merge_sorted(left_file, right_file)

    def test_merge_left_larger(self):
        """Ten left records plus three right records merge in key order."""
        combined = self._merge_generated(10, 3)
        assert len(combined) == 13
        merged_keys = [rec["KEY"] for rec in combined]
        assert merged_keys == sorted(merged_keys)

    def test_merge_right_larger(self):
        """Two left records plus eight right records merge in key order."""
        combined = self._merge_generated(2, 8)
        assert len(combined) == 10
        merged_keys = [rec["KEY"] for rec in combined]
        assert merged_keys == sorted(merged_keys)
class TestMergeDuplicates:
    """Merging inputs that share keys."""

    def test_merge_with_duplicate_keys(self):
        """A key present on both sides appears twice, adjacent, in the output."""
        left_file = [{"KEY": "K001", "DATA": "L1"}, {"KEY": "K002", "DATA": "L2"}]
        right_file = [{"KEY": "K001", "DATA": "R1"}, {"KEY": "K003", "DATA": "R3"}]
        combined = _merge_sorted(left_file, right_file)
        assert len(combined) == 4
        first, second = combined[0], combined[1]
        assert first["KEY"] == "K001"
        assert second["KEY"] == "K001"

    def test_merge_stability(self):
        """Stability: on equal KEY the left file's record comes first."""
        left_file = [{"KEY": "K001", "DATA": "LEFT"}, {"KEY": "K003", "DATA": "LEFT"}]
        right_file = [{"KEY": "K001", "DATA": "RIGHT"}]
        combined = _merge_sorted(left_file, right_file)
        first, second = combined[0], combined[1]
        assert first["DATA"] == "LEFT"
        assert second["DATA"] == "RIGHT"