Files
cobol-java-v3/tests/hina/test_confidence.py
hangshuo652 bc1d56d1a4 feat: Phase 2 complete — 13 Phases of COBOL type classification and test benchmark
P0.6: gcov infrastructure
P1: extract_structure output expansion (11 new feature fields)
P2: Confusion group rule engine (8 pairs + contradiction + backtrack)
P3: 4-factor confidence calculation + quality gate update
P4: 33+2 COBOL program type test samples (22 files, 7 categories)
P5: parametrized/ test data generation engine
P6: japanese_data.py lookup tables
P7-10: Type-specific test suites (~159 parametrized tests)
P11: Full classification pipeline (classify_program) + orchestrator integration
P12: Documentation (module-interfaces, test-plan v3.0, coverage-matrix)

Architecture decisions:
- classification_pipeline/ merged to hina/pipeline/
- parametrized/ as independent module
- japanese_data.py as root-level file
- hina/__all__ only exports classify_program()

Co-Authored-By: Claude <noreply@anthropic.com>
2026-06-19 23:51:55 +08:00

355 lines
11 KiB
Python
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""测试: 确信度 4 因子计算 + 质量门禁评分 + 覆盖率比较"""
import pytest
from hina.confidence import compute_confidence_v2
from hina.gate import compute_quality_score, check as gate_check
from coverage.compare_coverage import compare_coverage
# ── compute_confidence_v2 判定阈值测试 ──
def test_auto_judgment():
    """A confidence of at least 0.90 is expected to be judged ``auto``."""
    # All inputs are at their best values; expected product:
    #   1.0 x 1.0 x 1.0 x 1.0 = 1.0
    outcome = compute_confidence_v2(
        {"base_confidence": 1.0, "match_count": 3},
        {"structure_match_score": 5},
    )

    assert outcome["confidence"] == 1.0
    assert outcome["judgment"] == "auto"
    assert outcome["needs_review"] is False
def test_review_judgment():
    """A confidence in [0.70, 0.90) is expected to be judged ``review``."""
    # The structure factor only takes discrete values (a 0.7 structure factor
    # would already push the product below 0.70), so the review band is reached
    # by lowering the consistency factor with one resolved contradiction:
    #   0.95 (base) x 1.0 (context) x 0.90 (consistency) x 1.0 (structure) = 0.855
    resolved_conflicts = [{"type": "type_mismatch", "resolved": True}]
    outcome = compute_confidence_v2(
        {"base_confidence": 0.95, "match_count": 3},
        {"structure_match_score": 5},
        contradictions=resolved_conflicts,
    )

    score = outcome["confidence"]
    assert 0.70 <= score < 0.90
    assert outcome["judgment"] == "review"
    assert outcome["needs_review"] is True
def test_manual_judgment():
    """A confidence in [0.50, 0.70) is expected to be judged ``manual``."""
    # Expected product:
    #   0.95 (base) x 0.90 (context) x 0.90 (consistency) x 0.7 (structure) = 0.53865
    resolved_conflicts = [{"type": "type_mismatch", "resolved": True}]
    outcome = compute_confidence_v2(
        {"base_confidence": 0.95, "match_count": 1},
        {"structure_match_score": 4},
        contradictions=resolved_conflicts,
    )

    score = outcome["confidence"]
    assert 0.50 <= score < 0.70
    assert outcome["judgment"] == "manual"
    assert outcome["needs_review"] is True
def test_impossible_judgment():
    """A confidence below 0.50 is expected to be judged ``impossible``."""
    # Expected product:
    #   0.7 (base) x 0.50 (context) x 1.0 (consistency) x 0.3 (structure) = 0.105
    outcome = compute_confidence_v2(
        {"base_confidence": 0.7, "match_count": 0},
        {"structure_match_score": 0},
    )

    assert outcome["confidence"] < 0.50
    assert outcome["judgment"] == "impossible"
    assert outcome["needs_review"] is True
# ── 因子边界测试 ──
def test_context_factor_match_counts():
    """Check how the keyword match count maps onto the context factor."""
    # (match_count, expected context factor). Base confidence is 1.0 and the
    # structure score is 5 in every case, so the overall confidence is expected
    # to equal the context factor itself.
    expectations = (
        (5, 1.0),   # match_count >= 3
        (2, 0.95),  # match_count == 2
        (1, 0.90),  # match_count == 1
        (0, 0.50),  # match_count == 0
    )
    for hits, expected in expectations:
        r = compute_confidence_v2(
            {"base_confidence": 1.0, "match_count": hits},
            {"structure_match_score": 5},
        )
        assert r["context_factor"] == expected
        assert r["confidence"] == expected
def test_consistency_factor_contradictions():
    """Check how contradictions map onto the consistency factor.

    Expected tiers exercised here:
      * no contradictions           -> 1.0
      * a resolved contradiction    -> 0.90
      * fewer than three unresolved -> 0.80
      * three or more unresolved    -> 0.50
    """

    def factor_for(contradictions):
        # Keep every other input at its best value so that only the
        # contradictions differ between the cases.
        outcome = compute_confidence_v2(
            {"base_confidence": 1.0, "match_count": 3},
            {"structure_match_score": 5},
            contradictions=contradictions,
        )
        return outcome["consistency_factor"]

    assert factor_for([]) == 1.0
    assert factor_for([{"type": "t1", "resolved": True}]) == 0.90
    assert factor_for([{"type": "t1", "resolved": False}]) == 0.80

    # All three entries are unresolved so the fixture really sits on the
    # ">= 3 unresolved" boundary. The previous fixture marked the third entry
    # as resolved, leaving only two unresolved contradictions, which did not
    # match the rule this case is meant to cover.
    assert factor_for(
        [
            {"type": "t1", "resolved": False},
            {"type": "t2", "resolved": False},
            {"type": "t3", "resolved": False},
        ]
    ) == 0.50
def test_structure_factor_scores():
    """Check how the structure match score maps onto the structure factor."""
    # (structure_match_score, expected structure factor)
    expectations = (
        (5, 1.0),  # 5/5
        (3, 0.7),  # 3-4 out of 5
        (1, 0.5),  # 1-2 out of 5
        (0, 0.3),  # 0 / not determinable
    )
    for match_score, expected in expectations:
        r = compute_confidence_v2(
            {"base_confidence": 1.0, "match_count": 3},
            {"structure_match_score": match_score},
        )
        assert r["structure_factor"] == expected
def test_base_confidence_default():
    """A keyword_result without ``base_confidence`` is expected to use 0.7."""
    keyword_result = {"match_count": 3}
    structure_features = {"structure_match_score": 5}

    outcome = compute_confidence_v2(keyword_result, structure_features)

    assert outcome["base"] == 0.7
# ── compute_quality_score 双模式测试 ──
def test_quality_score_no_gcov():
    """Without gcov data the score is expected to be capped at 1.0.

    Expected formula: branch_rate*0.5 + paragraph_rate*0.5 + confidence*0.4.
    """
    # 0.80*0.5 + 0.90*0.5 + 0.5*0.4 = 0.40 + 0.45 + 0.20 = 1.05, capped to 1.0
    coverage = dict(branch_rate=0.80, paragraph_rate=0.90)

    assert compute_quality_score(coverage, gcov_coverage=None, confidence=0.5) == 1.0
def test_quality_score_no_gcov_sub_max():
    """Without gcov data, a score below the 1.0 cap is expected unclamped.

    Expected formula: branch_rate*0.5 + paragraph_rate*0.5 + confidence*0.4.
    """
    static_cov = {
        "branch_rate": 0.60,
        "paragraph_rate": 0.70,
    }

    score = compute_quality_score(static_cov, gcov_coverage=None, confidence=0.8)

    # 0.60*0.5 + 0.70*0.5 + 0.8*0.4 = 0.30 + 0.35 + 0.32 = 0.97
    # The value is a sum of float products, so compare with a tolerance rather
    # than relying on the arithmetic happening to round to exactly 0.97.
    assert score == pytest.approx(0.97)
def test_quality_score_with_gcov():
    """With gcov data the score blends static, dynamic and confidence terms.

    Expected formula: static_cov*0.3 + gcov_cov*0.4 + confidence*0.3, where
    static_cov = branch_rate*0.5 + paragraph_rate*0.5.
    """
    static_cov = {
        "branch_rate": 0.80,
        "paragraph_rate": 0.90,
    }
    gcov_cov = {"gcov_cov": 0.75}

    score = compute_quality_score(static_cov, gcov_cov, confidence=0.5)

    # static_cov = 0.80*0.5 + 0.90*0.5 = 0.85
    # score = 0.85*0.3 + 0.75*0.4 + 0.5*0.3 = 0.255 + 0.30 + 0.15 = 0.705
    # Compared with a tolerance because the result is a sum of float products.
    assert score == pytest.approx(0.705)
def test_quality_score_with_gcov_zero_confidence():
    """With gcov data and zero confidence only the coverage terms contribute.

    Expected formula: static_cov*0.3 + gcov_cov*0.4 + confidence*0.3.
    """
    static_cov = {
        "branch_rate": 1.0,
        "paragraph_rate": 1.0,
    }
    gcov_cov = {"gcov_cov": 0.5}

    score = compute_quality_score(static_cov, gcov_cov, confidence=0.0)

    # static_cov = 1.0
    # score = 1.0*0.3 + 0.5*0.4 + 0.0*0.3 = 0.30 + 0.20 + 0.0 = 0.50
    # Compared with a tolerance because the result is a sum of float products.
    assert score == pytest.approx(0.50)
# ── compare_coverage 基本功能测试 ──
def test_compare_coverage_basic():
    """compare_coverage is expected to echo its inputs and report the gap."""
    static = {
        "branch_rate": 0.90,
        "paragraph_rate": 0.85,
        "total_branches": 20,
        "covered_branches": 18,
    }
    dynamic = {
        "gcov_cov": 0.75,
        "covered_branches": 15,
        "total_branches": 20,
        "misleading_branches": ["BR001", "BR003"],
    }

    result = compare_coverage("TESTPROG", static, dynamic)

    assert result["program"] == "TESTPROG"
    # These values are expected to be echoed back unchanged, so exact
    # comparison is appropriate.
    assert result["static"]["branch_rate"] == 0.90
    assert result["static"]["paragraph_rate"] == 0.85
    assert result["dynamic"]["gcov_cov"] == 0.75
    # gap = (0.90*0.5 + 0.85*0.5) - 0.75 = 0.875 - 0.75 = 0.125
    # The gap is computed with float arithmetic, so compare with a tolerance.
    assert result["gap"] == pytest.approx(0.125)
    assert result["misleading_branches"] == ["BR001", "BR003"]
def test_compare_coverage_no_gap():
    """When static and dynamic coverage agree the gap is expected to be 0."""
    static = {
        "branch_rate": 0.80,
        "paragraph_rate": 0.80,
        "total_branches": 10,
        "covered_branches": 8,
    }
    dynamic = {
        "gcov_cov": 0.80,
        "covered_branches": 8,
        "total_branches": 10,
        "misleading_branches": [],
    }

    result = compare_coverage("NOGAP", static, dynamic)

    # gap = (0.80*0.5 + 0.80*0.5) - 0.80 = 0.80 - 0.80 = 0.0
    # An absolute tolerance is required because a relative tolerance is
    # meaningless when the expected value is zero.
    assert result["gap"] == pytest.approx(0.0, abs=1e-9)
    assert result["misleading_branches"] == []
def test_compare_coverage_no_misleading():
    """With no misleading branches an empty list is expected back."""
    static = {
        "branch_rate": 0.95,
        "paragraph_rate": 1.0,
    }
    dynamic = {
        "gcov_cov": 0.90,
        "misleading_branches": [],
    }

    result = compare_coverage("CLEAN", static, dynamic)

    # gap = (0.95*0.5 + 1.0*0.5) - 0.90 = 0.975 - 0.90 = 0.075
    # In binary floating point 0.975 - 0.90 does not come out as exactly 0.075,
    # so an exact comparison only passes if the implementation happens to
    # round; compare with a tolerance instead.
    assert result["gap"] == pytest.approx(0.075)
    assert result["misleading_branches"] == []
# ── gate.check 基本功能测试 ──
def test_gate_check_passed():
    """The gate is expected to pass with test data and full coverage."""
    full_coverage = {"branch_rate": 1.0, "paragraph_rate": 1.0}

    verdict = gate_check(
        complete_tests=[{"id": 1}],
        hina_result={},
        coverage=full_coverage,
    )

    assert verdict["passed"] is True
    issue_count = len(verdict["issues"])
    assert issue_count == 0
def test_gate_check_failed_branch():
    """Insufficient branch coverage is expected to fail the gate."""
    low_branch_coverage = {
        "branch_rate": 0.50,
        "paragraph_rate": 1.0,
        "uncovered_decision_ids": [1, 2],
    }

    verdict = gate_check(
        complete_tests=[{"id": 1}],
        hina_result={},
        coverage=low_branch_coverage,
    )

    assert verdict["passed"] is False
    assert "decision_gaps" in verdict["issues"]
def test_gate_check_no_data():
    """An empty test list is expected to fail the gate with ``no_data``."""
    full_coverage = {"branch_rate": 1.0, "paragraph_rate": 1.0}

    verdict = gate_check(
        complete_tests=[],
        hina_result={},
        coverage=full_coverage,
    )

    assert verdict["passed"] is False
    assert "no_data" in verdict["issues"]