feat: Phase 2 complete — 13 Phases of COBOL type classification and test benchmark
P0.6: gcov infrastructure P1: extract_structure output expansion (11 new feature fields) P2: Confusion group rule engine (8 pairs + contradiction + backtrack) P3: 4-factor confidence calculation + quality gate update P4: 33+2 COBOL program type test samples (22 files, 7 categories) P5: parametrized/ test data generation engine P6: japanese_data.py lookup tables P7-10: Type-specific test suites (~159 parametrized tests) P11: Full classification pipeline (classify_program) + orchestrator integration P12: Documentation (module-interfaces, test-plan v3.0, coverage-matrix) Architecture decisions: - classification_pipeline/ merged to hina/pipeline/ - parametrized/ as independent module - japanese_data.py as root-level file - hina/__all__ only exports classify_program() Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,202 @@
|
||||
"""Phase 8: SORT / MERGE 系测试 — 基于 parametrized 生成数据。
|
||||
|
||||
测试覆盖:
|
||||
- SORT 排序正确性(升序 / 降序 / 多键 / 稳定性)
|
||||
- MERGE 合并逻辑(均匀 / 不均 / 重复键)
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import pytest
|
||||
|
||||
from parametrized import generate_sorted_records, generate_duplicate_keys
|
||||
|
||||
|
||||
# ── Sort / merge helpers ──
|
||||
|
||||
|
||||
def _sort_descending(records: list[dict], key_field: str = "KEY") -> list[dict]:
|
||||
"""按 KEY 降序排列记录。"""
|
||||
return sorted(records, key=lambda r: r[key_field], reverse=True)
|
||||
|
||||
|
||||
def _sort_by_multiple_keys(
|
||||
records: list[dict],
|
||||
keys: list[str],
|
||||
ascending: bool = True,
|
||||
) -> list[dict]:
|
||||
"""按多键排序。"""
|
||||
return sorted(records, key=lambda r: tuple(r[k] for k in keys), reverse=not ascending)
|
||||
|
||||
|
||||
def _merge_sorted(
|
||||
left: list[dict],
|
||||
right: list[dict],
|
||||
key_field: str = "KEY",
|
||||
) -> list[dict]:
|
||||
"""合并两个已排序列表(归并算法)。"""
|
||||
result: list[dict] = []
|
||||
i = j = 0
|
||||
while i < len(left) and j < len(right):
|
||||
if left[i][key_field] <= right[j][key_field]:
|
||||
result.append(left[i])
|
||||
i += 1
|
||||
else:
|
||||
result.append(right[j])
|
||||
j += 1
|
||||
result.extend(left[i:])
|
||||
result.extend(right[j:])
|
||||
return result
|
||||
|
||||
|
||||
# ============================================================
|
||||
# SORT
|
||||
# ============================================================
|
||||
|
||||
class TestSortAscending:
    """Basic ordering checks on records produced by ``generate_sorted_records``."""

    def test_sort_basic_ascending(self):
        """Re-sorting the generated records by KEY must leave them unchanged."""
        generated = generate_sorted_records(10)
        resorted = sorted(generated, key=lambda rec: rec["KEY"])
        assert resorted == generated, "generate_sorted_records 应已按 KEY 升序排列"

    def test_sort_descending(self):
        """Descending sort puts the highest generated key first, the lowest last."""
        desc = _sort_descending(generate_sorted_records(5))
        highest_key = desc[0]["KEY"]
        assert highest_key == "KEY-0004"
        lowest_key = desc[-1]["KEY"]
        assert lowest_key == "KEY-0000"

    def test_sort_single_record(self):
        """A request for one record yields exactly one, keyed ``KEY-0000``."""
        generated = generate_sorted_records(1)
        assert len(generated) == 1
        only = generated[0]
        assert only["KEY"] == "KEY-0000"
|
||||
|
||||
|
||||
class TestSortMultipleKeys:
    """Multi-key sorting through ``_sort_by_multiple_keys``."""

    def test_sort_two_keys(self):
        """KEY is the primary key, SUB breaks ties within the same KEY."""
        # Deliberately out of order: with pre-sorted input a no-op "sort"
        # would pass this test.
        records = [
            {"KEY": "K002", "SUB": "A", "DATA": "z"},
            {"KEY": "K001", "SUB": "B", "DATA": "y"},
            {"KEY": "K001", "SUB": "A", "DATA": "x"},
        ]
        sorted_recs = _sort_by_multiple_keys(records, ["KEY", "SUB"])
        assert sorted_recs[0]["SUB"] == "A"
        assert sorted_recs[1]["SUB"] == "B"
        assert sorted_recs[2]["SUB"] == "A"
        # Pin the complete ordering, not just the secondary key column.
        assert [(r["KEY"], r["SUB"]) for r in sorted_recs] == [
            ("K001", "A"),
            ("K001", "B"),
            ("K002", "A"),
        ]

    def test_sort_two_keys_descending(self):
        """``ascending=False`` reverses the combined (KEY, SUB) ordering."""
        records = [
            {"KEY": "K001", "SUB": "A", "DATA": "x"},
            {"KEY": "K002", "SUB": "A", "DATA": "z"},
            {"KEY": "K001", "SUB": "B", "DATA": "y"},
        ]
        sorted_recs = _sort_by_multiple_keys(records, ["KEY", "SUB"], ascending=False)
        assert [(r["KEY"], r["SUB"]) for r in sorted_recs] == [
            ("K002", "A"),
            ("K001", "B"),
            ("K001", "A"),
        ]

    def test_sort_three_keys(self):
        """TERT decides the order when both KEY and SUB are equal."""
        # The last two records share KEY and SUB and arrive with TERT out of
        # order, so the third key is the only thing that can order them.
        records = [
            {"KEY": "K002", "SUB": "A", "TERT": "Z"},
            {"KEY": "K001", "SUB": "B", "TERT": "Y"},
            {"KEY": "K001", "SUB": "A", "TERT": "X"},
            {"KEY": "K001", "SUB": "A", "TERT": "W"},
        ]
        sorted_recs = _sort_by_multiple_keys(records, ["KEY", "SUB", "TERT"])
        assert [r["TERT"] for r in sorted_recs] == ["W", "X", "Y", "Z"]
|
||||
|
||||
|
||||
class TestSortDuplicates:
    """Sorting when several records share the same KEY."""

    def test_sort_with_duplicate_keys(self):
        """Records with equal KEY are ordered by SEQ."""
        base = generate_sorted_records(5)
        # NOTE(review): presumably generate_duplicate_keys returns each base
        # record plus one duplicate carrying a SEQ field; the assertions
        # below rely on that. Confirm against the parametrized module.
        with_dups = generate_duplicate_keys(base)
        assert len(with_dups) == 10
        sorted_all = sorted(with_dups, key=lambda r: (r["KEY"], r["SEQ"]))
        assert sorted_all[0]["KEY"] == sorted_all[1]["KEY"]  # same KEY
        assert sorted_all[0]["SEQ"] < sorted_all[1]["SEQ"]

    def test_sort_duplicate_all_same_key(self):
        """All records share one KEY: SEQ orders them, KEY alone must not."""
        records = [{"KEY": "SAME", "DATA": str(i), "SEQ": i} for i in range(5)]
        shuffled = [records[3], records[0], records[2], records[4], records[1]]

        sorted_recs = sorted(shuffled, key=lambda r: r["SEQ"])
        assert [r["DATA"] for r in sorted_recs] == ["0", "1", "2", "3", "4"]

        # Stability: sorting on the shared KEY has nothing to distinguish the
        # records, so the input order must be preserved exactly.
        by_key = sorted(shuffled, key=lambda r: r["KEY"])
        assert [r["DATA"] for r in by_key] == ["3", "0", "2", "4", "1"]
|
||||
|
||||
|
||||
class TestSortEdgeCases:
    """Boundary conditions for sorting and for record generation."""

    def test_sort_empty(self):
        """Sorting an empty list yields an empty list."""
        no_records: list[dict] = []
        result = sorted(no_records, key=lambda rec: rec.get("KEY", ""))
        assert result == []

    def test_sort_invalid_count(self):
        """A record count of zero is expected to raise ValueError naming record_count."""
        expected_failure = pytest.raises(ValueError, match="record_count")
        with expected_failure:
            generate_sorted_records(0)

    def test_sort_custom_key_field(self):
        """With ``key_field="MYKEY"`` the generated keys live under that name."""
        generated = generate_sorted_records(3, key_field="MYKEY")
        assert all("MYKEY" in rec for rec in generated)
        custom_keys = [rec["MYKEY"] for rec in generated]
        assert custom_keys == ["KEY-0000", "KEY-0001", "KEY-0002"]
|
||||
|
||||
|
||||
# ============================================================
|
||||
# MERGE
|
||||
# ============================================================
|
||||
|
||||
class TestMergeBasic:
    """Basic merge behaviour of ``_merge_sorted``."""

    def test_merge_two_equal_files(self):
        """Two equally sized inputs merge into one KEY-ordered list."""
        left = generate_sorted_records(5)
        right = generate_sorted_records(5)
        merged = _merge_sorted(left, right)
        assert len(merged) == 10
        keys = [r["KEY"] for r in merged]
        assert keys == sorted(keys)

    def test_merge_one_empty(self):
        """Merging with an empty input returns the other input's records."""
        left = generate_sorted_records(3)
        right: list[dict] = []
        merged = _merge_sorted(left, right)
        assert len(merged) == 3
        assert merged == left
        # Symmetric case: the empty list on the left side must behave the same.
        assert _merge_sorted(right, left) == left

    def test_merge_both_empty(self):
        """Two empty inputs merge into an empty list."""
        merged = _merge_sorted([], [])
        assert merged == []
|
||||
|
||||
|
||||
class TestMergeUneven:
    """Merging inputs of different lengths."""

    def test_merge_left_larger(self):
        """A 10-record left input and a 3-record right input give 13 ordered records."""
        merged = _merge_sorted(generate_sorted_records(10), generate_sorted_records(3))
        assert len(merged) == 13
        merged_keys = [rec["KEY"] for rec in merged]
        assert merged_keys == sorted(merged_keys)

    def test_merge_right_larger(self):
        """A 2-record left input and an 8-record right input give 10 ordered records."""
        merged = _merge_sorted(generate_sorted_records(2), generate_sorted_records(8))
        assert len(merged) == 10
        merged_keys = [rec["KEY"] for rec in merged]
        assert merged_keys == sorted(merged_keys)
|
||||
|
||||
|
||||
class TestMergeDuplicates:
    """Merging when the same KEY occurs in both inputs."""

    def test_merge_with_duplicate_keys(self):
        """Both K001 records survive the merge and come out first."""
        left = [
            {"KEY": "K001", "DATA": "L1"},
            {"KEY": "K002", "DATA": "L2"},
        ]
        right = [
            {"KEY": "K001", "DATA": "R1"},
            {"KEY": "K003", "DATA": "R3"},
        ]
        merged = _merge_sorted(left, right)
        assert len(merged) == 4
        first_key = merged[0]["KEY"]
        assert first_key == "K001"
        second_key = merged[1]["KEY"]
        assert second_key == "K001"

    def test_merge_stability(self):
        """Stability: for equal KEYs the record from the left input comes first."""
        left = [
            {"KEY": "K001", "DATA": "LEFT"},
            {"KEY": "K003", "DATA": "LEFT"},
        ]
        right = [{"KEY": "K001", "DATA": "RIGHT"}]
        merged = _merge_sorted(left, right)
        first_origin = merged[0]["DATA"]
        assert first_origin == "LEFT"
        second_origin = merged[1]["DATA"]
        assert second_origin == "RIGHT"
|
||||
Reference in New Issue
Block a user