# cobol-java-v3/tests/hina/test_confidence.py

"""测试: 确信度 4 因子计算 + 质量门禁评分 + 覆盖率比较"""

import pytest
from hina.confidence import compute_confidence_v2
from hina.gate import compute_quality_score, check as gate_check
from coverage.compare_coverage import compare_coverage


# ── compute_confidence_v2 判定阈值测试 ──


def test_auto_judgment():
    """Best-case inputs land in the ``auto`` band (confidence >= 0.90)."""
    result = compute_confidence_v2(
        {"base_confidence": 1.0, "match_count": 3},
        {"structure_match_score": 5},
    )
    # Expected product of the four factors: 1.0 x 1.0 x 1.0 x 1.0 = 1.0.
    assert result["confidence"] == 1.0
    assert result["judgment"] == "auto"
    # An automatic judgment must not be flagged for human review.
    assert result["needs_review"] is False


def test_review_judgment():
    """A confidence in [0.70, 0.90) is judged ``review`` and flagged for review."""
    # The structure factor is discrete, so lowering it alone drops the result
    # straight into the manual band (e.g. 1.0 x 0.95 x 1.0 x 0.7 = 0.665).
    # The review band is reached instead with a slightly lower base confidence
    # and one resolved contradiction:
    #   0.95 (base) x 1.0 (context) x 0.90 (consistency) x 1.0 (structure) = 0.855
    resolved_contradiction = {"type": "type_mismatch", "resolved": True}
    result = compute_confidence_v2(
        {"base_confidence": 0.95, "match_count": 3},
        {"structure_match_score": 5},
        contradictions=[resolved_contradiction],
    )
    assert 0.70 <= result["confidence"] < 0.90
    assert result["judgment"] == "review"
    assert result["needs_review"] is True


def test_manual_judgment():
    """A confidence in [0.50, 0.70) is judged ``manual`` and flagged for review."""
    # Expected product of the four factors:
    #   0.95 (base) x 0.90 (context, one match) x 0.90 (consistency, one
    #   resolved contradiction) x 0.7 (structure score 4) = 0.53865
    resolved_contradiction = {"type": "type_mismatch", "resolved": True}
    result = compute_confidence_v2(
        {"base_confidence": 0.95, "match_count": 1},
        {"structure_match_score": 4},
        contradictions=[resolved_contradiction],
    )
    assert 0.50 <= result["confidence"] < 0.70
    assert result["judgment"] == "manual"
    assert result["needs_review"] is True


def test_impossible_judgment():
    """A confidence below 0.50 is judged ``impossible`` and flagged for review."""
    # Expected product of the four factors:
    #   0.7 (base) x 0.50 (context, no match) x 1.0 (consistency) x 0.3
    #   (structure score 0) = 0.105
    result = compute_confidence_v2(
        {"base_confidence": 0.7, "match_count": 0},
        {"structure_match_score": 0},
    )
    assert result["confidence"] < 0.50
    assert result["judgment"] == "impossible"
    assert result["needs_review"] is True


# ── 因子边界测试 ──


def test_context_factor_match_counts():
    """Check the context factor produced for each keyword match count."""
    # (match_count, expected context factor). Base confidence and structure
    # score are held at their best values, so the overall confidence is
    # expected to equal the context factor itself.
    expectations = [
        (5, 1.0),   # three or more matches
        (2, 0.95),  # exactly two matches
        (1, 0.90),  # exactly one match
        (0, 0.50),  # no match at all
    ]
    for match_count, expected in expectations:
        r = compute_confidence_v2(
            {"base_confidence": 1.0, "match_count": match_count},
            {"structure_match_score": 5},
        )
        assert r["context_factor"] == expected
        assert r["confidence"] == expected


def test_consistency_factor_contradictions():
    """Check the consistency factor produced for each set of contradictions."""
    # (contradictions, expected consistency factor)
    cases = [
        # No contradictions at all.
        ([], 1.0),
        # A single contradiction that has been resolved.
        ([{"type": "t1", "resolved": True}], 0.90),
        # A single contradiction that is still unresolved.
        ([{"type": "t1", "resolved": False}], 0.80),
        # Three contradictions in total: two unresolved, one resolved.
        # NOTE(review): this case was originally described as ">= 3
        # unresolved", but only two of the three entries are unresolved.
        # Confirm whether the 0.50 threshold counts all contradictions or
        # only the unresolved ones.
        (
            [
                {"type": "t1", "resolved": False},
                {"type": "t2", "resolved": False},
                {"type": "t3", "resolved": True},
            ],
            0.50,
        ),
    ]
    for contradictions, expected in cases:
        r = compute_confidence_v2(
            {"base_confidence": 1.0, "match_count": 3},
            {"structure_match_score": 5},
            contradictions=contradictions,
        )
        assert r["consistency_factor"] == expected


def test_structure_factor_scores():
    """Check the structure factor produced for each structure match score."""
    # (structure_match_score, expected structure factor). One representative
    # score is exercised for each documented band.
    expectations = [
        (5, 1.0),  # 5/5
        (3, 0.7),  # band 3-4/5
        (1, 0.5),  # band 1-2/5
        (0, 0.3),  # 0 / not determinable
    ]
    for score, expected in expectations:
        r = compute_confidence_v2(
            {"base_confidence": 1.0, "match_count": 3},
            {"structure_match_score": score},
        )
        assert r["structure_factor"] == expected


def test_base_confidence_default():
    """The base falls back to 0.7 when ``base_confidence`` is not supplied."""
    keyword_result = {"match_count": 3}  # deliberately no "base_confidence"
    structure_features = {"structure_match_score": 5}
    r = compute_confidence_v2(keyword_result, structure_features)
    assert r["base"] == 0.7


# ── compute_quality_score 双模式测试 ──


def test_quality_score_no_gcov():
    """Without gcov data, a weighted sum above 1.0 is reported as 1.0.

    Formula exercised: branch_rate*0.5 + paragraph_rate*0.5 + confidence*0.4.
    """
    score = compute_quality_score(
        {"branch_rate": 0.80, "paragraph_rate": 0.90},
        gcov_coverage=None,
        confidence=0.5,
    )
    # 0.80*0.5 + 0.90*0.5 + 0.5*0.4 = 0.40 + 0.45 + 0.20 = 1.05,
    # which is expected to be capped: min(1.0, 1.05) = 1.0.
    assert score == 1.0


def test_quality_score_no_gcov_sub_max():
    """Without gcov data, a weighted sum below 1.0 is returned unclamped.

    Formula exercised: branch_rate*0.5 + paragraph_rate*0.5 + confidence*0.4.
    """
    static_cov = {
        "branch_rate": 0.60,
        "paragraph_rate": 0.70,
    }
    score = compute_quality_score(static_cov, gcov_coverage=None, confidence=0.8)
    # 0.60*0.5 + 0.70*0.5 + 0.8*0.4 = 0.30 + 0.35 + 0.32 = 0.97
    # The sum is computed in binary floating point, so compare with a
    # tolerance instead of exact equality.
    assert score == pytest.approx(0.97)


def test_quality_score_with_gcov():
    """With gcov data the score blends static coverage, gcov coverage and confidence.

    Formula exercised: static_cov*0.3 + gcov_cov*0.4 + confidence*0.3, where
    static_cov = branch_rate*0.5 + paragraph_rate*0.5.
    """
    static_cov = {
        "branch_rate": 0.80,
        "paragraph_rate": 0.90,
    }
    gcov_cov = {"gcov_cov": 0.75}
    score = compute_quality_score(static_cov, gcov_cov, confidence=0.5)
    # static_cov = 0.80*0.5 + 0.90*0.5 = 0.85
    # score = 0.85*0.3 + 0.75*0.4 + 0.5*0.3 = 0.255 + 0.30 + 0.15 = 0.705
    # The result is a floating-point sum, so compare with a tolerance
    # instead of exact equality.
    assert score == pytest.approx(0.705)


def test_quality_score_with_gcov_zero_confidence():
    """With gcov data and zero confidence, only the coverage terms contribute.

    Formula exercised: static_cov*0.3 + gcov_cov*0.4 + confidence*0.3.
    """
    static_cov = {
        "branch_rate": 1.0,
        "paragraph_rate": 1.0,
    }
    gcov_cov = {"gcov_cov": 0.5}
    score = compute_quality_score(static_cov, gcov_cov, confidence=0.0)
    # static_cov = 1.0
    # score = 1.0*0.3 + 0.5*0.4 + 0.0*0.3 = 0.30 + 0.20 + 0.0 = 0.50
    # Tolerance-based comparison keeps the test independent of the exact
    # order in which the terms are summed.
    assert score == pytest.approx(0.50)


# ── compare_coverage 基本功能测试 ──


def test_compare_coverage_basic():
    """compare_coverage echoes its inputs and reports the static/dynamic gap."""
    static = {
        "branch_rate": 0.90,
        "paragraph_rate": 0.85,
        "total_branches": 20,
        "covered_branches": 18,
    }
    dynamic = {
        "gcov_cov": 0.75,
        "covered_branches": 15,
        "total_branches": 20,
        "misleading_branches": ["BR001", "BR003"],
    }
    result = compare_coverage("TESTPROG", static, dynamic)
    assert result["program"] == "TESTPROG"
    # Values passed through from the inputs are compared exactly.
    assert result["static"]["branch_rate"] == 0.90
    assert result["static"]["paragraph_rate"] == 0.85
    assert result["dynamic"]["gcov_cov"] == 0.75
    # gap = (0.90*0.5 + 0.85*0.5) - 0.75 = 0.875 - 0.75 = 0.125
    # The gap is a computed float, so compare with a tolerance instead of
    # exact equality.
    assert result["gap"] == pytest.approx(0.125)
    assert result["misleading_branches"] == ["BR001", "BR003"]


def test_compare_coverage_no_gap():
    """The gap is zero when static and dynamic coverage agree."""
    static = {
        "branch_rate": 0.80,
        "paragraph_rate": 0.80,
        "total_branches": 10,
        "covered_branches": 8,
    }
    dynamic = {
        "gcov_cov": 0.80,
        "covered_branches": 8,
        "total_branches": 10,
        "misleading_branches": [],
    }
    result = compare_coverage("NOGAP", static, dynamic)
    # gap = (0.80*0.5 + 0.80*0.5) - 0.80 = 0.80 - 0.80 = 0.0
    # A relative tolerance is meaningless around zero, so an explicit
    # absolute tolerance is used for the computed difference.
    assert result["gap"] == pytest.approx(0.0, abs=1e-9)
    assert result["misleading_branches"] == []


def test_compare_coverage_no_misleading():
    """An empty misleading-branch list is returned as-is alongside the gap."""
    static = {
        "branch_rate": 0.95,
        "paragraph_rate": 1.0,
    }
    dynamic = {
        "gcov_cov": 0.90,
        "misleading_branches": [],
    }
    result = compare_coverage("CLEAN", static, dynamic)
    # gap = (0.95*0.5 + 1.0*0.5) - 0.90 = 0.975 - 0.90 = 0.075
    # 0.975 - 0.90 is not exactly 0.075 in binary floating point, so compare
    # with a tolerance instead of exact equality.
    assert result["gap"] == pytest.approx(0.075)
    assert result["misleading_branches"] == []


# ── gate.check 基本功能测试 ──


def test_gate_check_passed():
    """The gate passes with no issues given one test and full coverage."""
    full_coverage = {"branch_rate": 1.0, "paragraph_rate": 1.0}
    result = gate_check(
        complete_tests=[{"id": 1}],
        hina_result={},
        coverage=full_coverage,
    )
    assert result["passed"] is True
    # A passing gate is expected to report an empty issue collection.
    assert len(result["issues"]) == 0


def test_gate_check_failed_branch():
    """Insufficient branch coverage fails the gate with a ``decision_gaps`` issue."""
    low_branch_coverage = {
        "branch_rate": 0.50,
        "paragraph_rate": 1.0,
        "uncovered_decision_ids": [1, 2],
    }
    result = gate_check(
        complete_tests=[{"id": 1}],
        hina_result={},
        coverage=low_branch_coverage,
    )
    assert result["passed"] is False
    assert "decision_gaps" in result["issues"]


def test_gate_check_no_data():
    """An empty test list fails the gate with a ``no_data`` issue."""
    # Coverage is perfect here, so the failure can only come from the
    # missing test data.
    full_coverage = {"branch_rate": 1.0, "paragraph_rate": 1.0}
    result = gate_check(
        complete_tests=[],
        hina_result={},
        coverage=full_coverage,
    )
    assert result["passed"] is False
    assert "no_data" in result["issues"]