feat: Phase 2 complete — 13 Phases of COBOL type classification and test benchmark

P0.6: gcov infrastructure
P1: extract_structure output expansion (11 new feature fields)
P2: Confusion group rule engine (8 pairs + contradiction + backtrack)
P3: 4-factor confidence calculation + quality gate update
P4: 33+2 COBOL program type test samples (22 files, 7 categories)
P5: parametrized/ test data generation engine
P6: japanese_data.py lookup tables
P7-10: Type-specific test suites (~159 parametrized tests)
P11: Full classification pipeline (classify_program) + orchestrator integration
P12: Documentation (module-interfaces, test-plan v3.0, coverage-matrix)

Architecture decisions:
- classification_pipeline/ merged to hina/pipeline/
- parametrized/ as independent module
- japanese_data.py as root-level file
- hina/__all__ only exports classify_program()

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
hangshuo652
2026-06-19 23:51:55 +08:00
parent 63b5284715
commit bc1d56d1a4
129 changed files with 19378 additions and 261 deletions
+96
View File
@@ -0,0 +1,96 @@
"""回溯机制 — 多轮判定,必要时重新提取特征以化解矛盾。
BacktrackResolver 封装了多轮判定的核心逻辑:
1. 用当前 features 检测矛盾。
2. 对有矛盾的对调用 resolve_contradiction。
3. 如果仍然存在矛盾,重新提取特征再判定。
4. 超过 max_rounds 轮或 30s 超时后降级。
"""
from __future__ import annotations
import time
from typing import Any, Callable
from .contradiction import detect_contradictions, resolve_contradiction
class BacktrackResolver:
    """Multi-round backtracking resolver for contradictory features.

    Each round detects contradictions in the current feature dict, records a
    resolution for every contradiction found, and then re-extracts the
    features from the COBOL source before the next round.  The loop ends when
    no contradiction is detected, when the time budget is exceeded, when
    re-extraction fails, or when ``max_rounds`` rounds have been used up.

    Args:
        structure_extractor: Callable that takes the COBOL source text and
            returns a features dict.
        max_rounds: Maximum number of detect/resolve rounds (default 3).
        timeout_seconds: Time budget for one ``resolve`` call, checked at the
            start of every round (default 30 seconds).
    """

    # Class-level defaults keep instances usable even when a subclass (or a
    # test) bypasses __init__.
    max_rounds: int = 3
    timeout_seconds: float = 30.0

    # Status flags written by resolve(); cleared on entry so values left over
    # from an earlier run cannot leak into the new result.
    _STATUS_KEYS = (
        "backtrack_timeout",
        "backtrack_resolved",
        "backtrack_degraded",
        "backtrack_extract_error",
    )

    def __init__(
        self,
        structure_extractor: Callable[[str], dict[str, Any]],
        *,
        max_rounds: int = 3,
        timeout_seconds: float = 30.0,
    ) -> None:
        self.extract = structure_extractor
        self.max_rounds = max_rounds
        self.timeout_seconds = timeout_seconds

    def _needs_backtrack(self, contradictions: list[dict]) -> bool:
        """Return True when the features should be re-extracted.

        Any detected contradiction (a non-empty list) triggers a backtrack.
        """
        return len(contradictions) > 0

    def resolve(self, cobol_source: str, initial_features: dict) -> dict[str, Any]:
        """Run the multi-round resolution and return the final features.

        Args:
            cobol_source: COBOL program source text, passed to the extractor
                when features are re-extracted.
            initial_features: Features from the initial extraction.  The
                mapping itself is not modified; a copy is worked on.

        Returns:
            The final features dict.  It always carries ``backtrack_rounds``,
            ``backtrack_timeout``, ``backtrack_resolved``,
            ``backtrack_degraded`` and ``backtrack_elapsed`` (seconds, rounded
            to 3 decimals).  ``resolved_types`` is present once at least one
            contradiction was processed, and ``backtrack_extract_error`` is
            set to True when re-extraction raised.
        """
        # Monotonic clock: immune to wall-clock adjustments during the run.
        start = time.monotonic()

        features: dict[str, Any] = dict(initial_features)
        for key in self._STATUS_KEYS:
            features.pop(key, None)
        # dict() above is a shallow copy; detach the nested mapping so the
        # caller's input is never mutated.
        if isinstance(features.get("resolved_types"), dict):
            features["resolved_types"] = dict(features["resolved_types"])
        features["backtrack_rounds"] = 0

        for round_num in range(1, self.max_rounds + 1):
            # Time budget check, once per round.
            if time.monotonic() - start > self.timeout_seconds:
                features["backtrack_timeout"] = True
                break

            contradictions = detect_contradictions(features)
            if not contradictions:
                # Nothing left to resolve: the verdict is final.
                features["backtrack_resolved"] = True
                break

            # Record a resolution for every contradiction of this round.
            for c in contradictions:
                resolution = resolve_contradiction(features, c)
                resolved_types = features.setdefault("resolved_types", {})
                resolved_types[f"resolved_{c['name']}"] = resolution
            features["backtrack_rounds"] = round_num

            if self._needs_backtrack(contradictions):
                # Only the extractor call (and its conversion to a dict) is
                # guarded, so a failure never leaves features half-merged.
                try:
                    new_features = dict(self.extract(cobol_source))
                except Exception:
                    features["backtrack_extract_error"] = True
                    break
                # Fresh features win, except for the backtracking state and
                # the resolutions collected so far.
                preserved_keys = ("backtrack_rounds", "backtrack_timeout", "resolved_types")
                preserved = {k: features[k] for k in preserved_keys if k in features}
                features.update(new_features)
                features.update(preserved)
        else:
            # Loop finished without a break: max_rounds exhausted, degrade.
            features["backtrack_degraded"] = True

        # Make sure every status field exists in the result.
        elapsed = time.monotonic() - start
        features.setdefault("backtrack_timeout", False)
        features.setdefault("backtrack_resolved", False)
        features.setdefault("backtrack_degraded", False)
        features["backtrack_elapsed"] = round(elapsed, 3)
        return features