bc1d56d1a4
P0.6: gcov infrastructure P1: extract_structure output expansion (11 new feature fields) P2: Confusion group rule engine (8 pairs + contradiction + backtrack) P3: 4-factor confidence calculation + quality gate update P4: 33+2 COBOL program type test samples (22 files, 7 categories) P5: parametrized/ test data generation engine P6: japanese_data.py lookup tables P7-10: Type-specific test suites (~159 parametrized tests) P11: Full classification pipeline (classify_program) + orchestrator integration P12: Documentation (module-interfaces, test-plan v3.0, coverage-matrix) Architecture decisions: - classification_pipeline/ merged to hina/pipeline/ - parametrized/ as independent module - japanese_data.py as root-level file - hina/__all__ only exports classify_program() Co-Authored-By: Claude <noreply@anthropic.com>
355 lines
11 KiB
Python
355 lines
11 KiB
Python
"""测试: 确信度 4 因子计算 + 质量门禁评分 + 覆盖率比较"""
|
||
|
||
import pytest
|
||
from hina.confidence import compute_confidence_v2
|
||
from hina.gate import compute_quality_score, check as gate_check
|
||
from coverage.compare_coverage import compare_coverage
|
||
|
||
|
||
# ── compute_confidence_v2 判定阈值测试 ──
|
||
|
||
|
||
def test_auto_judgment():
    """A confidence of at least 0.90 is expected to be judged "auto"."""
    # Every factor is expected to be 1.0 for these inputs:
    # 1.0 x 1.0 x 1.0 x 1.0 = 1.0
    outcome = compute_confidence_v2(
        {"base_confidence": 1.0, "match_count": 3},
        {"structure_match_score": 5},
    )
    assert outcome["confidence"] == 1.0
    assert outcome["judgment"] == "auto"
    assert outcome["needs_review"] is False
|
||
|
||
|
||
def test_review_judgment():
    """A confidence in the 0.70-0.89 band is expected to be judged "review"."""
    # Inputs are picked so the expected product lands inside the band:
    #   base 0.95 x context 1.0 (match_count 3) x consistency 0.90
    #   (one resolved contradiction) x structure 1.0 (score 5) = 0.855
    # The structure factor is discrete, so dropping it to 0.7 would push the
    # product (e.g. 0.95 x 1.0 x 1.0 x 0.7 = 0.665) below the review band.
    resolved_conflict = {"type": "type_mismatch", "resolved": True}
    outcome = compute_confidence_v2(
        {"base_confidence": 0.95, "match_count": 3},
        {"structure_match_score": 5},
        contradictions=[resolved_conflict],
    )
    assert 0.70 <= outcome["confidence"] < 0.90
    assert outcome["judgment"] == "review"
    assert outcome["needs_review"] is True
|
||
|
||
|
||
def test_manual_judgment():
    """A confidence in the 0.50-0.69 band is expected to be judged "manual"."""
    # Expected product: 0.95 x 0.90 x 0.90 x 0.7 = 0.53865
    resolved_conflict = {"type": "type_mismatch", "resolved": True}
    outcome = compute_confidence_v2(
        {"base_confidence": 0.95, "match_count": 1},
        {"structure_match_score": 4},
        contradictions=[resolved_conflict],
    )
    assert 0.50 <= outcome["confidence"] < 0.70
    assert outcome["judgment"] == "manual"
    assert outcome["needs_review"] is True
|
||
|
||
|
||
def test_impossible_judgment():
    """A confidence below 0.50 is expected to be judged "impossible"."""
    # Expected product: 0.7 x 0.50 x 1.0 x 0.3 = 0.105
    outcome = compute_confidence_v2(
        {"base_confidence": 0.7, "match_count": 0},
        {"structure_match_score": 0},
    )
    assert outcome["confidence"] < 0.50
    assert outcome["judgment"] == "impossible"
    assert outcome["needs_review"] is True
|
||
|
||
|
||
# ── 因子边界测试 ──
|
||
|
||
|
||
def test_context_factor_match_counts():
    """Check how the keyword match count drives the context factor."""
    # (match_count, expected context factor). All other inputs are held at
    # their best values, so the overall confidence is expected to equal the
    # context factor itself.
    expectations = (
        (5, 1.0),   # match_count >= 3
        (2, 0.95),  # match_count == 2
        (1, 0.90),  # match_count == 1
        (0, 0.50),  # match_count == 0
    )
    for hits, expected in expectations:
        r = compute_confidence_v2(
            {"base_confidence": 1.0, "match_count": hits},
            {"structure_match_score": 5},
        )
        assert r["context_factor"] == expected
        assert r["confidence"] == expected
|
||
|
||
|
||
def test_consistency_factor_contradictions():
    """Check how the supplied contradictions drive the consistency factor."""
    scenarios = (
        # No contradictions.
        ([], 1.0),
        # One contradiction, already resolved.
        ([{"type": "t1", "resolved": True}], 0.90),
        # One contradiction, still unresolved.
        ([{"type": "t1", "resolved": False}], 0.80),
        # Three contradictions, two of them unresolved.
        # NOTE(review): the original comment labelled this case
        # ">= 3 unresolved", yet only two entries are unresolved. Confirm
        # whether the rule counts all contradictions or only unresolved ones.
        (
            [
                {"type": "t1", "resolved": False},
                {"type": "t2", "resolved": False},
                {"type": "t3", "resolved": True},
            ],
            0.50,
        ),
    )
    for conflicts, expected in scenarios:
        r = compute_confidence_v2(
            {"base_confidence": 1.0, "match_count": 3},
            {"structure_match_score": 5},
            contradictions=conflicts,
        )
        assert r["consistency_factor"] == expected
|
||
|
||
|
||
def test_structure_factor_scores():
    """Check how the structure match score drives the structure factor."""
    # (structure_match_score, expected structure factor)
    expectations = (
        (5, 1.0),  # 5/5
        (3, 0.7),  # 3-4/5 band (only 3 is exercised here)
        (1, 0.5),  # 1-2/5 band (only 1 is exercised here)
        (0, 0.3),  # 0 / not determinable
    )
    for score, expected in expectations:
        r = compute_confidence_v2(
            {"base_confidence": 1.0, "match_count": 3},
            {"structure_match_score": score},
        )
        assert r["structure_factor"] == expected
|
||
|
||
|
||
def test_base_confidence_default():
    """When keyword_result omits base_confidence, "base" is expected to be 0.7."""
    keyword_only = {"match_count": 3}
    structure = {"structure_match_score": 5}
    r = compute_confidence_v2(keyword_only, structure)
    assert r["base"] == 0.7
|
||
|
||
|
||
# ── compute_quality_score 双模式测试 ──
|
||
|
||
|
||
def test_quality_score_no_gcov():
    """Without gcov data, a weighted sum above 1.0 is expected to clamp to 1.0.

    Expected formula in this mode (per the original note):
    branch_rate x 0.5 + paragraph_rate x 0.5 + confidence x 0.4.
    """
    coverage = {"branch_rate": 0.80, "paragraph_rate": 0.90}
    # 0.80 x 0.5 + 0.90 x 0.5 + 0.5 x 0.4 = 1.05 -> min(1.0, 1.05) = 1.0
    result = compute_quality_score(coverage, gcov_coverage=None, confidence=0.5)
    assert result == 1.0
|
||
|
||
|
||
def test_quality_score_no_gcov_sub_max():
    """Without gcov data, a weighted sum below 1.0 is expected to pass through.

    This is the counterpart of the clamping case: the expected sum stays
    under 1.0, so no clamping should be visible in the result.
    """
    static_cov = {
        "branch_rate": 0.60,
        "paragraph_rate": 0.70,
    }
    score = compute_quality_score(static_cov, gcov_coverage=None, confidence=0.8)
    # 0.60 x 0.5 + 0.70 x 0.5 + 0.8 x 0.4 = 0.30 + 0.35 + 0.32 = 0.97
    # Compared approximately: the value is a sum of float products, so exact
    # equality would depend on evaluation order and rounding.
    assert score == pytest.approx(0.97)
|
||
|
||
|
||
def test_quality_score_with_gcov():
    """With gcov data, check the three-way weighted quality score.

    Expected formula in this mode (per the original note):
    static_cov x 0.3 + gcov_cov x 0.4 + confidence x 0.3.
    """
    static_cov = {
        "branch_rate": 0.80,
        "paragraph_rate": 0.90,
    }
    gcov_cov = {"gcov_cov": 0.75}
    score = compute_quality_score(static_cov, gcov_cov, confidence=0.5)
    # static_cov = 0.80 x 0.5 + 0.90 x 0.5 = 0.85
    # score = 0.85 x 0.3 + 0.75 x 0.4 + 0.5 x 0.3 = 0.255 + 0.30 + 0.15 = 0.705
    # Compared approximately: these products are not exactly representable
    # in binary floating point, so exact equality is fragile.
    assert score == pytest.approx(0.705)
|
||
|
||
|
||
def test_quality_score_with_gcov_zero_confidence():
    """With gcov data and a confidence of 0, only coverage should contribute."""
    static_cov = {
        "branch_rate": 1.0,
        "paragraph_rate": 1.0,
    }
    gcov_cov = {"gcov_cov": 0.5}
    score = compute_quality_score(static_cov, gcov_cov, confidence=0.0)
    # static_cov = 1.0
    # score = 1.0 x 0.3 + 0.5 x 0.4 + 0.0 x 0.3 = 0.30 + 0.20 + 0.0 = 0.50
    # Compared approximately so the check does not hinge on float rounding.
    assert score == pytest.approx(0.50)
|
||
|
||
|
||
# ── compare_coverage 基本功能测试 ──
|
||
|
||
|
||
def test_compare_coverage_basic():
    """Check the basic shape and values of a compare_coverage result."""
    static = {
        "branch_rate": 0.90,
        "paragraph_rate": 0.85,
        "total_branches": 20,
        "covered_branches": 18,
    }
    dynamic = {
        "gcov_cov": 0.75,
        "covered_branches": 15,
        "total_branches": 20,
        "misleading_branches": ["BR001", "BR003"],
    }
    result = compare_coverage("TESTPROG", static, dynamic)

    # Values expected to be echoed back unchanged are compared exactly.
    assert result["program"] == "TESTPROG"
    assert result["static"]["branch_rate"] == 0.90
    assert result["static"]["paragraph_rate"] == 0.85
    assert result["dynamic"]["gcov_cov"] == 0.75
    # gap = (0.90 x 0.5 + 0.85 x 0.5) - 0.75 = 0.875 - 0.75 = 0.125
    # The gap is computed from float arithmetic, so compare approximately.
    assert result["gap"] == pytest.approx(0.125)
    assert result["misleading_branches"] == ["BR001", "BR003"]
|
||
|
||
|
||
def test_compare_coverage_no_gap():
    """When static and dynamic coverage agree, the gap is expected to be 0."""
    static = {
        "branch_rate": 0.80,
        "paragraph_rate": 0.80,
        "total_branches": 10,
        "covered_branches": 8,
    }
    dynamic = {
        "gcov_cov": 0.80,
        "covered_branches": 8,
        "total_branches": 10,
        "misleading_branches": [],
    }
    result = compare_coverage("NOGAP", static, dynamic)
    # gap = (0.80 x 0.5 + 0.80 x 0.5) - 0.80 = 0.80 - 0.80 = 0.0
    # A float difference may leave a tiny residue, so allow an absolute
    # tolerance instead of demanding an exact zero.
    assert result["gap"] == pytest.approx(0.0, abs=1e-9)
    assert result["misleading_branches"] == []
|
||
|
||
|
||
def test_compare_coverage_no_misleading():
    """Check the result when the dynamic data lists no misleading branches."""
    static = {
        "branch_rate": 0.95,
        "paragraph_rate": 1.0,
    }
    dynamic = {
        "gcov_cov": 0.90,
        "misleading_branches": [],
    }
    result = compare_coverage("CLEAN", static, dynamic)
    # gap = (0.95 x 0.5 + 1.0 x 0.5) - 0.90 = 0.975 - 0.90 = 0.075
    # 0.975 - 0.90 is not exactly 0.075 in binary floating point, so the
    # comparison is approximate.
    assert result["gap"] == pytest.approx(0.075)
    assert result["misleading_branches"] == []
|
||
|
||
|
||
# ── gate.check 基本功能测试 ──
|
||
|
||
|
||
def test_gate_check_passed():
    """The quality gate is expected to pass with test data and full coverage."""
    full_coverage = {"branch_rate": 1.0, "paragraph_rate": 1.0}
    verdict = gate_check(
        complete_tests=[{"id": 1}],
        hina_result={},
        coverage=full_coverage,
    )
    assert verdict["passed"] is True
    # A passing gate should report no issues at all.
    assert len(verdict["issues"]) == 0
|
||
|
||
|
||
def test_gate_check_failed_branch():
    """Insufficient branch coverage is expected to fail the gate."""
    weak_coverage = {
        "branch_rate": 0.50,
        "paragraph_rate": 1.0,
        "uncovered_decision_ids": [1, 2],
    }
    verdict = gate_check(
        complete_tests=[{"id": 1}],
        hina_result={},
        coverage=weak_coverage,
    )
    assert verdict["passed"] is False
    # The failure is expected to be reported under the "decision_gaps" issue.
    assert "decision_gaps" in verdict["issues"]
|
||
|
||
|
||
def test_gate_check_no_data():
    """An empty test list is expected to fail the gate with "no_data"."""
    full_coverage = {"branch_rate": 1.0, "paragraph_rate": 1.0}
    verdict = gate_check(
        complete_tests=[],
        hina_result={},
        coverage=full_coverage,
    )
    assert verdict["passed"] is False
    assert "no_data" in verdict["issues"]
|