feat: Phase 2 complete — 13 Phases of COBOL type classification and test benchmark

P0.6: gcov infrastructure
P1: extract_structure output expansion (11 new feature fields)
P2: Confusion group rule engine (8 pairs + contradiction + backtrack)
P3: 4-factor confidence calculation + quality gate update
P4: 33+2 COBOL program type test samples (22 files, 7 categories)
P5: parametrized/ test data generation engine
P6: japanese_data.py lookup tables
P7-10: Type-specific test suites (~159 parametrized tests)
P11: Full classification pipeline (classify_program) + orchestrator integration
P12: Documentation (module-interfaces, test-plan v3.0, coverage-matrix)

Architecture decisions:
- classification_pipeline/ merged to hina/pipeline/
- parametrized/ as independent module
- japanese_data.py as root-level file
- hina/__all__ only exports classify_program()

Co-Authored-By: Claude <noreply@anthropic.com>
2026-06-19 23:51:55 +08:00
parent 63b5284715
commit bc1d56d1a4
129 changed files with 19378 additions and 261 deletions
@@ -0,0 +1,354 @@
+"""测试: 确信度 4 因子计算 + 质量门禁评分 + 覆盖率比较"""
+
+import pytest
+from hina.confidence import compute_confidence_v2
+from hina.gate import compute_quality_score, check as gate_check
+from coverage.compare_coverage import compare_coverage
+
+
+# ── compute_confidence_v2 判定阈值测试 ──
+
+
+def test_auto_judgment():
+    """A confidence of at least 0.90 is judged ``auto`` and needs no review."""
+    # Inputs chosen so every factor is expected to be 1.0:
+    # 1.0 × 1.0 × 1.0 × 1.0 = 1.0.
+    outcome = compute_confidence_v2(
+        {"base_confidence": 1.0, "match_count": 3},
+        {"structure_match_score": 5},
+    )
+    assert outcome["confidence"] == 1.0
+    assert outcome["judgment"] == "auto"
+    assert outcome["needs_review"] is False
+
+
+def test_review_judgment():
+    """A confidence in [0.70, 0.90) is judged ``review``."""
+    # The structure factor only takes discrete values, so the review band is
+    # reached through the consistency factor instead: a single resolved
+    # contradiction is expected to yield 0.90, giving
+    # 0.95 × 1.0 × 0.90 × 1.0 = 0.855.
+    resolved_conflict = {"type": "type_mismatch", "resolved": True}
+    outcome = compute_confidence_v2(
+        {"base_confidence": 0.95, "match_count": 3},
+        {"structure_match_score": 5},
+        contradictions=[resolved_conflict],
+    )
+    assert 0.70 <= outcome["confidence"] < 0.90
+    assert outcome["judgment"] == "review"
+    assert outcome["needs_review"] is True
+
+
+def test_manual_judgment():
+    """A confidence in [0.50, 0.70) is judged ``manual``."""
+    # Expected factors: base 0.95, context 0.90 (one keyword match),
+    # consistency 0.90 (one resolved contradiction), structure 0.7 (score 4):
+    # 0.95 × 0.90 × 0.90 × 0.7 = 0.53865.
+    resolved_conflict = {"type": "type_mismatch", "resolved": True}
+    outcome = compute_confidence_v2(
+        {"base_confidence": 0.95, "match_count": 1},
+        {"structure_match_score": 4},
+        contradictions=[resolved_conflict],
+    )
+    assert 0.50 <= outcome["confidence"] < 0.70
+    assert outcome["judgment"] == "manual"
+    assert outcome["needs_review"] is True
+
+
+def test_impossible_judgment():
+    """A confidence below 0.50 is judged ``impossible``."""
+    # Expected factors with no keyword match and a structure score of 0:
+    # 0.7 × 0.50 × 1.0 × 0.3 = 0.105.
+    outcome = compute_confidence_v2(
+        {"base_confidence": 0.7, "match_count": 0},
+        {"structure_match_score": 0},
+    )
+    assert outcome["confidence"] < 0.50
+    assert outcome["judgment"] == "impossible"
+    assert outcome["needs_review"] is True
+
+
+# ── 因子边界测试 ──
+
+
+def test_context_factor_match_counts():
+    """The keyword match count selects the context factor."""
+    # (match_count, expected context factor). Base confidence and structure
+    # score are held at their maximum, so the overall confidence is expected
+    # to equal the context factor itself.
+    expectations = (
+        (5, 1.0),   # three or more matches
+        (2, 0.95),  # exactly two matches
+        (1, 0.90),  # exactly one match
+        (0, 0.50),  # no match at all
+    )
+    for match_count, expected in expectations:
+        r = compute_confidence_v2(
+            {"base_confidence": 1.0, "match_count": match_count},
+            {"structure_match_score": 5},
+        )
+        assert r["context_factor"] == expected
+        assert r["confidence"] == expected
+
+
+def test_consistency_factor_contradictions():
+    """The supplied contradictions select the consistency factor."""
+    # Each entry is (contradictions, expected consistency factor).
+    scenarios = (
+        # No contradictions at all.
+        ([], 1.0),
+        # A single contradiction that has been resolved.
+        ([{"type": "t1", "resolved": True}], 0.90),
+        # A single contradiction that is still unresolved.
+        ([{"type": "t1", "resolved": False}], 0.80),
+        # Three contradictions, two of them unresolved.
+        # NOTE(review): only two entries are unresolved here — confirm whether
+        # the 0.50 tier keys on the total count or on the unresolved count.
+        (
+            [
+                {"type": "t1", "resolved": False},
+                {"type": "t2", "resolved": False},
+                {"type": "t3", "resolved": True},
+            ],
+            0.50,
+        ),
+    )
+    for contradictions, expected in scenarios:
+        r = compute_confidence_v2(
+            {"base_confidence": 1.0, "match_count": 3},
+            {"structure_match_score": 5},
+            contradictions=contradictions,
+        )
+        assert r["consistency_factor"] == expected
+
+
+def test_structure_factor_scores():
+    """The structure match score selects the structure factor."""
+    # (structure_match_score, expected structure factor). Only one
+    # representative score is exercised per tier.
+    tiers = (
+        (5, 1.0),  # 5 of 5
+        (3, 0.7),  # 3-4 of 5
+        (1, 0.5),  # 1-2 of 5
+        (0, 0.3),  # 0 / not determinable
+    )
+    for score, expected in tiers:
+        r = compute_confidence_v2(
+            {"base_confidence": 1.0, "match_count": 3},
+            {"structure_match_score": score},
+        )
+        assert r["structure_factor"] == expected
+
+
+def test_base_confidence_default():
+    """When ``base_confidence`` is omitted, the reported base is 0.7."""
+    keyword_only_count = {"match_count": 3}
+    structure = {"structure_match_score": 5}
+    r = compute_confidence_v2(keyword_only_count, structure)
+    assert r["base"] == 0.7
+
+
+# ── compute_quality_score 双模式测试 ──
+
+
+def test_quality_score_no_gcov():
+    """Without gcov data, a weighted sum above 1.0 is reported as 1.0."""
+    # Weights assumed by this expectation:
+    # branch_rate × 0.5 + paragraph_rate × 0.5 + confidence × 0.4
+    # = 0.40 + 0.45 + 0.20 = 1.05, which is capped at 1.0.
+    coverage = {"branch_rate": 0.80, "paragraph_rate": 0.90}
+    result = compute_quality_score(coverage, gcov_coverage=None, confidence=0.5)
+    assert result == 1.0
+
+
+def test_quality_score_no_gcov_sub_max():
+    """Without gcov data, a weighted sum below 1.0 is returned without clamping."""
+    static_cov = {
+        "branch_rate": 0.60,
+        "paragraph_rate": 0.70,
+    }
+    score = compute_quality_score(static_cov, gcov_coverage=None, confidence=0.8)
+    # 0.60×0.5 + 0.70×0.5 + 0.8×0.4 = 0.30 + 0.35 + 0.32 = 0.97
+    # Compare with a tolerance: the sum is built from binary floats, so exact
+    # equality with the literal 0.97 would depend on how the implementation
+    # orders and rounds its arithmetic.
+    assert score == pytest.approx(0.97)
+
+
+def test_quality_score_with_gcov():
+    """With gcov data: static_cov × 0.3 + gcov_cov × 0.4 + confidence × 0.3."""
+    static_cov = {
+        "branch_rate": 0.80,
+        "paragraph_rate": 0.90,
+    }
+    gcov_cov = {"gcov_cov": 0.75}
+    score = compute_quality_score(static_cov, gcov_cov, confidence=0.5)
+    # static_cov = 0.80×0.5 + 0.90×0.5 = 0.85
+    # score = 0.85×0.3 + 0.75×0.4 + 0.5×0.3 = 0.255 + 0.30 + 0.15 = 0.705
+    # Tolerance-based comparison: the weighted sum is not guaranteed to land
+    # on the exact float nearest to 0.705.
+    assert score == pytest.approx(0.705)
+
+
+def test_quality_score_with_gcov_zero_confidence():
+    """With gcov data and a confidence of 0, only the coverage terms contribute."""
+    static_cov = {
+        "branch_rate": 1.0,
+        "paragraph_rate": 1.0,
+    }
+    gcov_cov = {"gcov_cov": 0.5}
+    score = compute_quality_score(static_cov, gcov_cov, confidence=0.0)
+    # static_cov = 1.0
+    # score = 1.0×0.3 + 0.5×0.4 + 0.0×0.3 = 0.30 + 0.20 + 0.0 = 0.50
+    # Tolerance-based comparison keeps the test independent of the order in
+    # which the implementation performs its float arithmetic.
+    assert score == pytest.approx(0.50)
+
+
+# ── compare_coverage 基本功能测试 ──
+
+
+def test_compare_coverage_basic():
+    """compare_coverage echoes its inputs and reports the static/dynamic gap."""
+    static = {
+        "branch_rate": 0.90,
+        "paragraph_rate": 0.85,
+        "total_branches": 20,
+        "covered_branches": 18,
+    }
+    dynamic = {
+        "gcov_cov": 0.75,
+        "covered_branches": 15,
+        "total_branches": 20,
+        "misleading_branches": ["BR001", "BR003"],
+    }
+    result = compare_coverage("TESTPROG", static, dynamic)
+    assert result["program"] == "TESTPROG"
+    # These values are expected to be passed through unchanged, so exact
+    # equality is appropriate here.
+    assert result["static"]["branch_rate"] == 0.90
+    assert result["static"]["paragraph_rate"] == 0.85
+    assert result["dynamic"]["gcov_cov"] == 0.75
+    # gap = (0.90×0.5 + 0.85×0.5) - 0.75 = 0.875 - 0.75 = 0.125
+    # The gap is computed with float arithmetic, so compare with a tolerance.
+    assert result["gap"] == pytest.approx(0.125)
+    assert result["misleading_branches"] == ["BR001", "BR003"]
+
+
+def test_compare_coverage_no_gap():
+    """When static and dynamic coverage agree, the gap is 0."""
+    static = {
+        "branch_rate": 0.80,
+        "paragraph_rate": 0.80,
+        "total_branches": 10,
+        "covered_branches": 8,
+    }
+    dynamic = {
+        "gcov_cov": 0.80,
+        "covered_branches": 8,
+        "total_branches": 10,
+        "misleading_branches": [],
+    }
+    result = compare_coverage("NOGAP", static, dynamic)
+    # gap = (0.80×0.5 + 0.80×0.5) - 0.80 = 0.80 - 0.80 = 0.0
+    # pytest.approx applies a small absolute tolerance around 0.0, so a tiny
+    # float residue from the subtraction does not fail the test.
+    assert result["gap"] == pytest.approx(0.0)
+    assert result["misleading_branches"] == []
+
+
+def test_compare_coverage_no_misleading():
+    """With no misleading branches, an empty list is returned alongside the gap."""
+    static = {
+        "branch_rate": 0.95,
+        "paragraph_rate": 1.0,
+    }
+    dynamic = {
+        "gcov_cov": 0.90,
+        "misleading_branches": [],
+    }
+    result = compare_coverage("CLEAN", static, dynamic)
+    # gap = (0.95×0.5 + 1.0×0.5) - 0.90 = 0.975 - 0.90 = 0.075
+    # 0.975 - 0.90 is not exactly representable as the float literal 0.075,
+    # so compare with a tolerance instead of exact equality.
+    assert result["gap"] == pytest.approx(0.075)
+    assert result["misleading_branches"] == []
+
+
+# ── gate.check 基本功能测试 ──
+
+
+def test_gate_check_passed():
+    """The gate passes when test data exists and both coverage rates are 1.0."""
+    full_coverage = {"branch_rate": 1.0, "paragraph_rate": 1.0}
+    verdict = gate_check(
+        complete_tests=[{"id": 1}],
+        hina_result={},
+        coverage=full_coverage,
+    )
+    assert verdict["passed"] is True
+    assert len(verdict["issues"]) == 0
+
+
+def test_gate_check_failed_branch():
+    """A branch rate of 0.50 with uncovered decisions fails the gate."""
+    weak_branch_coverage = {
+        "branch_rate": 0.50,
+        "paragraph_rate": 1.0,
+        "uncovered_decision_ids": [1, 2],
+    }
+    verdict = gate_check(
+        complete_tests=[{"id": 1}],
+        hina_result={},
+        coverage=weak_branch_coverage,
+    )
+    assert verdict["passed"] is False
+    # The failure is expected to be reported under the "decision_gaps" issue.
+    assert "decision_gaps" in verdict["issues"]
+
+
+def test_gate_check_no_data():
+    """An empty test list fails the gate even when coverage is full."""
+    full_coverage = {"branch_rate": 1.0, "paragraph_rate": 1.0}
+    verdict = gate_check(
+        complete_tests=[],
+        hina_result={},
+        coverage=full_coverage,
+    )
+    assert verdict["passed"] is False
+    # The failure is expected to be reported under the "no_data" issue.
+    assert "no_data" in verdict["issues"]