feat: Phase 2 complete — 13 Phases of COBOL type classification and test benchmark

P0.6: gcov infrastructure
P1: extract_structure output expansion (11 new feature fields)
P2: Confusion group rule engine (8 pairs + contradiction + backtrack)
P3: 4-factor confidence calculation + quality gate update
P4: 33+2 COBOL program type test samples (22 files, 7 categories)
P5: parametrized/ test data generation engine
P6: japanese_data.py lookup tables
P7-10: Type-specific test suites (~159 parametrized tests)
P11: Full classification pipeline (classify_program) + orchestrator integration
P12: Documentation (module-interfaces, test-plan v3.0, coverage-matrix)

Architecture decisions:
- classification_pipeline/ merged to hina/pipeline/
- parametrized/ as independent module
- japanese_data.py as root-level file
- hina/__all__ only exports classify_program()

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
hangshuo652
2026-06-19 23:51:55 +08:00
parent 63b5284715
commit bc1d56d1a4
129 changed files with 19378 additions and 261 deletions
+354
View File
@@ -0,0 +1,354 @@
"""测试: 确信度 4 因子计算 + 质量门禁评分 + 覆盖率比较"""
import pytest
from hina.confidence import compute_confidence_v2
from hina.gate import compute_quality_score, check as gate_check
from coverage.compare_coverage import compare_coverage
# ── compute_confidence_v2 判定阈值测试 ──
def test_auto_judgment():
    """A confidence of at least 0.90 yields the "auto" judgment."""
    # Expected product of the four factors: 1.0 * 1.0 * 1.0 * 1.0 = 1.0
    outcome = compute_confidence_v2(
        {"base_confidence": 1.0, "match_count": 3},
        {"structure_match_score": 5},
    )
    assert outcome["confidence"] == 1.0
    assert outcome["judgment"] == "auto"
    assert outcome["needs_review"] is False
def test_review_judgment():
    """A confidence in [0.70, 0.90) yields the "review" judgment."""
    # The structure factor only takes discrete values, so the review band is
    # reached through the consistency factor instead: a 0.95 base combined
    # with one resolved contradiction (consistency 0.90) gives
    # 0.95 * 1.0 * 0.90 * 1.0 = 0.855.
    resolved_conflict = {"type": "type_mismatch", "resolved": True}
    outcome = compute_confidence_v2(
        {"base_confidence": 0.95, "match_count": 3},
        {"structure_match_score": 5},
        contradictions=[resolved_conflict],
    )
    assert 0.70 <= outcome["confidence"] < 0.90
    assert outcome["judgment"] == "review"
    assert outcome["needs_review"] is True
def test_manual_judgment():
    """A confidence in [0.50, 0.70) yields the "manual" judgment."""
    # Expected product: 0.95 * 0.90 * 0.90 * 0.7 = 0.53865
    # (single keyword match, one resolved contradiction, structure score 4).
    resolved_conflict = {"type": "type_mismatch", "resolved": True}
    outcome = compute_confidence_v2(
        {"base_confidence": 0.95, "match_count": 1},
        {"structure_match_score": 4},
        contradictions=[resolved_conflict],
    )
    assert 0.50 <= outcome["confidence"] < 0.70
    assert outcome["judgment"] == "manual"
    assert outcome["needs_review"] is True
def test_impossible_judgment():
    """A confidence below 0.50 yields the "impossible" judgment."""
    # Expected product: 0.7 * 0.50 * 1.0 * 0.3 = 0.105
    # (no keyword match and a structure score of 0).
    outcome = compute_confidence_v2(
        {"base_confidence": 0.7, "match_count": 0},
        {"structure_match_score": 0},
    )
    assert outcome["confidence"] < 0.50
    assert outcome["judgment"] == "impossible"
    assert outcome["needs_review"] is True
# ── 因子边界测试 ──
def test_context_factor_match_counts():
    """The keyword match count drives the context factor."""
    # (match_count, expected context_factor). Base, consistency and structure
    # are all held at 1.0, so the overall confidence equals the context factor.
    expectations = (
        (5, 1.0),   # match_count >= 3
        (2, 0.95),  # match_count == 2
        (1, 0.90),  # match_count == 1
        (0, 0.50),  # match_count == 0
    )
    for match_count, expected in expectations:
        r = compute_confidence_v2(
            {"base_confidence": 1.0, "match_count": match_count},
            {"structure_match_score": 5},
        )
        assert r["context_factor"] == expected
        assert r["confidence"] == expected
def test_consistency_factor_contradictions():
    """The set of contradictions drives the consistency factor."""
    expectations = (
        # No contradictions -> 1.0
        ([], 1.0),
        # A single resolved contradiction -> 0.90
        ([{"type": "t1", "resolved": True}], 0.90),
        # A single unresolved contradiction (fewer than three) -> 0.80
        ([{"type": "t1", "resolved": False}], 0.80),
        # Three contradictions, two of them unresolved -> 0.50
        # NOTE(review): the original note read ">=3 unresolved", yet only two
        # entries here are unresolved -- confirm whether the threshold counts
        # all contradictions or only the unresolved ones.
        (
            [
                {"type": "t1", "resolved": False},
                {"type": "t2", "resolved": False},
                {"type": "t3", "resolved": True},
            ],
            0.50,
        ),
    )
    for conflict_list, expected in expectations:
        r = compute_confidence_v2(
            {"base_confidence": 1.0, "match_count": 3},
            {"structure_match_score": 5},
            contradictions=conflict_list,
        )
        assert r["consistency_factor"] == expected
def test_structure_factor_scores():
    """The structure match score drives the structure factor."""
    # (structure_match_score, expected structure_factor)
    expectations = (
        (5, 1.0),  # 5/5
        (3, 0.7),  # 3-4 out of 5
        (1, 0.5),  # 1-2 out of 5
        (0, 0.3),  # 0 / cannot be determined
    )
    for match_score, expected in expectations:
        r = compute_confidence_v2(
            {"base_confidence": 1.0, "match_count": 3},
            {"structure_match_score": match_score},
        )
        assert r["structure_factor"] == expected
def test_base_confidence_default():
    """A keyword_result without base_confidence falls back to 0.7."""
    keyword_only = {"match_count": 3}
    outcome = compute_confidence_v2(keyword_only, {"structure_match_score": 5})
    assert outcome["base"] == 0.7
# ── compute_quality_score 双模式测试 ──
def test_quality_score_no_gcov():
    """Without gcov: branch_rate*0.5 + paragraph_rate*0.5 + confidence*0.4."""
    # 0.80*0.5 + 0.90*0.5 + 0.5*0.4 = 0.40 + 0.45 + 0.20 = 1.05,
    # which is expected to be capped at 1.0.
    static_rates = {"branch_rate": 0.80, "paragraph_rate": 0.90}
    capped = compute_quality_score(
        static_rates, gcov_coverage=None, confidence=0.5
    )
    assert capped == 1.0
def test_quality_score_no_gcov_sub_max():
    """Without gcov, a score below 1.0 is returned as-is (not clamped)."""
    static_cov = {
        "branch_rate": 0.60,
        "paragraph_rate": 0.70,
    }
    score = compute_quality_score(static_cov, gcov_coverage=None, confidence=0.8)
    # 0.60*0.5 + 0.70*0.5 + 0.8*0.4 = 0.30 + 0.35 + 0.32 = 0.97
    # The sum is built from binary floats, so compare with a tolerance
    # rather than relying on a bit-exact 0.97.
    assert score == pytest.approx(0.97)
def test_quality_score_with_gcov():
    """With gcov: static_cov*0.3 + gcov_cov*0.4 + confidence*0.3."""
    static_cov = {
        "branch_rate": 0.80,
        "paragraph_rate": 0.90,
    }
    gcov_cov = {"gcov_cov": 0.75}
    score = compute_quality_score(static_cov, gcov_cov, confidence=0.5)
    # static_cov = 0.80*0.5 + 0.90*0.5 = 0.85
    # score = 0.85*0.3 + 0.75*0.4 + 0.5*0.3 = 0.255 + 0.30 + 0.15 = 0.705
    # Weighted float sums are not guaranteed to be bit-exact, so compare
    # with a tolerance instead of ==.
    assert score == pytest.approx(0.705)
def test_quality_score_with_gcov_zero_confidence():
    """With gcov enabled and a confidence of 0."""
    # static_cov = 1.0
    # score = 1.0*0.3 + 0.5*0.4 + 0.0*0.3 = 0.30 + 0.20 + 0.0 = 0.50
    full_static = {"branch_rate": 1.0, "paragraph_rate": 1.0}
    half_dynamic = {"gcov_cov": 0.5}
    result_score = compute_quality_score(
        full_static, half_dynamic, confidence=0.0
    )
    assert result_score == 0.50
# ── compare_coverage 基本功能测试 ──
def test_compare_coverage_basic():
    """compare_coverage echoes its inputs and reports the static/dynamic gap."""
    static = {
        "branch_rate": 0.90,
        "paragraph_rate": 0.85,
        "total_branches": 20,
        "covered_branches": 18,
    }
    dynamic = {
        "gcov_cov": 0.75,
        "covered_branches": 15,
        "total_branches": 20,
        "misleading_branches": ["BR001", "BR003"],
    }
    result = compare_coverage("TESTPROG", static, dynamic)
    assert result["program"] == "TESTPROG"
    assert result["static"]["branch_rate"] == 0.90
    assert result["static"]["paragraph_rate"] == 0.85
    assert result["dynamic"]["gcov_cov"] == 0.75
    # gap = (0.90*0.5 + 0.85*0.5) - 0.75 = 0.875 - 0.75 = 0.125
    # The gap is a computed float difference, so compare with a tolerance
    # rather than relying on a bit-exact result.
    assert result["gap"] == pytest.approx(0.125)
    assert result["misleading_branches"] == ["BR001", "BR003"]
def test_compare_coverage_no_gap():
    """The gap is 0 when static and dynamic coverage agree exactly."""
    static_side = {
        "branch_rate": 0.80,
        "paragraph_rate": 0.80,
        "total_branches": 10,
        "covered_branches": 8,
    }
    dynamic_side = {
        "gcov_cov": 0.80,
        "covered_branches": 8,
        "total_branches": 10,
        "misleading_branches": [],
    }
    # gap = (0.80*0.5 + 0.80*0.5) - 0.80 = 0.80 - 0.80 = 0.0
    report = compare_coverage("NOGAP", static_side, dynamic_side)
    assert report["gap"] == 0.0
    assert report["misleading_branches"] == []
def test_compare_coverage_no_misleading():
    """An empty misleading-branch list is passed through unchanged."""
    static = {
        "branch_rate": 0.95,
        "paragraph_rate": 1.0,
    }
    dynamic = {
        "gcov_cov": 0.90,
        "misleading_branches": [],
    }
    result = compare_coverage("CLEAN", static, dynamic)
    # gap = (0.95*0.5 + 1.0*0.5) - 0.90 = 0.975 - 0.90 = 0.075
    # In binary floating point 0.975 - 0.90 evaluates to 0.07499999999999996,
    # so an exact == 0.075 only holds if the implementation rounds its
    # result; a tolerance-based comparison is correct either way.
    assert result["gap"] == pytest.approx(0.075)
    assert result["misleading_branches"] == []
# ── gate.check 基本功能测试 ──
def test_gate_check_passed():
    """The quality gate passes with no issues when coverage is complete."""
    full_coverage = {"branch_rate": 1.0, "paragraph_rate": 1.0}
    verdict = gate_check(
        complete_tests=[{"id": 1}],
        hina_result={},
        coverage=full_coverage,
    )
    assert verdict["passed"] is True
    issue_count = len(verdict["issues"])
    assert issue_count == 0
def test_gate_check_failed_branch():
    """Insufficient branch coverage fails the gate with "decision_gaps"."""
    weak_branch_coverage = {
        "branch_rate": 0.50,
        "paragraph_rate": 1.0,
        "uncovered_decision_ids": [1, 2],
    }
    verdict = gate_check(
        complete_tests=[{"id": 1}],
        hina_result={},
        coverage=weak_branch_coverage,
    )
    assert verdict["passed"] is False
    assert "decision_gaps" in verdict["issues"]
def test_gate_check_no_data():
    """An empty test list fails the gate with a "no_data" issue."""
    full_coverage = {"branch_rate": 1.0, "paragraph_rate": 1.0}
    verdict = gate_check(
        complete_tests=[],
        hina_result={},
        coverage=full_coverage,
    )
    assert verdict["passed"] is False
    assert "no_data" in verdict["issues"]