feat: Phase 2 complete — 13 Phases of COBOL type classification and test benchmark

P0.6: gcov infrastructure P1: extract_structure output expansion (11 new feature fields) P2: Confusion group rule engine (8 pairs + contradiction + backtrack) P3: 4-factor confidence calculation + quality gate update P4: 33+2 COBOL program type test samples (22 files, 7 categories) P5: parametrized/ test data generation engine P6: japanese_data.py lookup tables P7-10: Type-specific test suites (~159 parametrized tests) P11: Full classification pipeline (classify_program) + orchestrator integration P12: Documentation (module-interfaces, test-plan v3.0, coverage-matrix) Architecture decisions: - classification_pipeline/ merged to hina/pipeline/ - parametrized/ as independent module - japanese_data.py as root-level file - hina/__all__ only exports classify_program() Co-Authored-By: Claude <noreply@anthropic.com>
2026-06-19 23:51:55 +08:00
parent 63b5284715
commit bc1d56d1a4
129 changed files with 19378 additions and 261 deletions
@@ -0,0 +1,47 @@
+"""HINA 混淆组判定规则引擎
+
+公开 API:
+  resolve_confusion_pair()          — 根据 pair_name 调度对应函数
+  detect_contradictions()           — 检测可能矛盾的类型对
+  resolve_contradiction()           — 解决矛盾，返回胜出的类型名
+  BacktrackResolver                 — 多轮回溯判定
+"""
+
+from __future__ import annotations
+
+from .confusion_groups import (
+    resolve_confusion_pair,
+    resolve_matching_vs_keybreak,
+    resolve_dedup_vs_nodedup,
+    resolve_validation_vs_keybreak,
+    resolve_csv_merge_vs_split,
+    resolve_simple_vs_two_stage,
+    resolve_pure_vs_mixed,
+    resolve_division_50_25_100,
+    resolve_mn_output_mode,
+)
+from .contradiction import (
+    CONTRADICTION_PAIRS,
+    detect_contradictions,
+    resolve_contradiction,
+)
+from .backtrack import BacktrackResolver
+
+__all__ = [
+    # 混淆组判定
+    "resolve_confusion_pair",
+    "resolve_matching_vs_keybreak",
+    "resolve_dedup_vs_nodedup",
+    "resolve_validation_vs_keybreak",
+    "resolve_csv_merge_vs_split",
+    "resolve_simple_vs_two_stage",
+    "resolve_pure_vs_mixed",
+    "resolve_division_50_25_100",
+    "resolve_mn_output_mode",
+    # 矛盾检测与解决
+    "CONTRADICTION_PAIRS",
+    "detect_contradictions",
+    "resolve_contradiction",
+    # 回溯
+    "BacktrackResolver",
+]
@@ -0,0 +1,96 @@
+"""回溯机制 — 多轮判定，必要时重新提取特征以化解矛盾。
+
+BacktrackResolver 封装了多轮判定的核心逻辑:
+  1. 用当前 features 检测矛盾。
+  2. 对有矛盾的对调用 resolve_contradiction。
+  3. 如果仍然存在矛盾，重新提取特征再判定。
+  4. 超过 max_rounds 轮或 30s 超时后降级。
+"""
+
+from __future__ import annotations
+
+import time
+from typing import Any, Callable
+
+from .contradiction import detect_contradictions, resolve_contradiction
+
+
+class BacktrackResolver:
+    """多轮回溯判定器。
+
+    Args:
+        structure_extractor: 接受 COBOL 源码字符串，返回 features dict 的可调用对象。
+    """
+
+    def __init__(self, structure_extractor: Callable[[str], dict[str, Any]]) -> None:
+        self.extract = structure_extractor
+        self.max_rounds = 3
+
+    def _needs_backtrack(self, contradictions: list[dict]) -> bool:
+        """判断是否需要回溯重提取。
+
+        只要检测到矛盾（列表非空），就需要回溯。
+        """
+        return len(contradictions) > 0
+
+    def resolve(self, cobol_source: str, initial_features: dict) -> dict[str, Any]:
+        """多轮判定，30s 超时降级。
+
+        Args:
+            cobol_source: COBOL 程序源码。
+            initial_features: 初始提取的特征字典。
+
+        Returns:
+            最终的特征字典，可能包含 backtrack_rounds 和 backtrack_timeout 信息。
+        """
+        start = time.time()
+        features: dict[str, Any] = dict(initial_features)
+        features["backtrack_rounds"] = 0
+
+        for round_num in range(1, self.max_rounds + 1):
+            # 超时检查
+            if time.time() - start > 30:
+                features["backtrack_timeout"] = True
+                break
+
+            # 检测矛盾
+            contradictions = detect_contradictions(features)
+            if not contradictions:
+                # 无矛盾，判定完成
+                features["backtrack_resolved"] = True
+                break
+
+            # 解决矛盾
+            for c in contradictions:
+                resolution = resolve_contradiction(features, c)
+                # 将解决结果写入 features
+                resolved_types = features.setdefault("resolved_types", {})
+                resolved_types[f"resolved_{c['name']}"] = resolution
+
+            features["backtrack_rounds"] = round_num
+
+            # 判断是否需要重新提取
+            if self._needs_backtrack(contradictions):
+                # 重新提取特征
+                try:
+                    new_features = self.extract(cobol_source)
+                    # 合并新特征，保留旧特征中的回溯状态和已解决的矛盾
+                    preserved_keys = ("backtrack_rounds", "backtrack_timeout", "resolved_types")
+                    preserved = {k: features[k] for k in preserved_keys if k in features}
+                    features.update(new_features)
+                    features.update(preserved)
+                except Exception:
+                    features["backtrack_extract_error"] = True
+                    break
+        else:
+            # max_rounds 耗尽，标记降级
+            features["backtrack_degraded"] = True
+
+        # 确保时间字段存在
+        elapsed = time.time() - start
+        features.setdefault("backtrack_timeout", False)
+        features.setdefault("backtrack_resolved", False)
+        features.setdefault("backtrack_degraded", False)
+        features["backtrack_elapsed"] = round(elapsed, 3)
+
+        return features
@@ -0,0 +1,235 @@
+"""混淆组判定规则引擎 — 8 个混淆对的化解函数。
+
+每个函数接收 features dict，返回:
+  {
+    "resolved_type": str,
+    "confidence": float,
+    "evidence": list[str],
+  }
+"""
+
+from __future__ import annotations
+
+
+def resolve_matching_vs_keybreak(features: dict) -> dict:
+    """区分「マッチング」与「キーブレイク」。
+
+    规则:
+      - IF 三路分支 (comparison ≥ 2) + SELECT 文件数 ≥ 2 → マッチング
+      - IF 双路分支 (equality 为主) + WS-PREV-KEY 存在 + 累加器存在 → キーブレイク
+    """
+    if_types = features.get("if_types", {})
+    total_ifs = if_types.get("total", 0)
+    comparison_ifs = if_types.get("comparison", 0)
+    equality_ifs = if_types.get("equality", 0)
+
+    select_files = features.get("select_files", {})
+    file_count = len(select_files) if isinstance(select_files, dict) else features.get("file_count", 0)
+
+    variable_patterns = features.get("variable_patterns", {})
+    has_prev_key = variable_patterns.get("has_prev_key", False)
+    has_accumulator = variable_patterns.get("has_accumulator", False)
+
+    evidence: list[str] = []
+
+    # 规则 1: 三路分支 + 多文件 → マッチング
+    if comparison_ifs >= 2 and file_count >= 2:
+        evidence.append(f"三路 IF 分支 (comparison={comparison_ifs}) + SELECT 文件数 >=2 ({file_count}) → マッチング")
+        return {"resolved_type": "マッチング", "confidence": 0.90, "evidence": evidence}
+
+    # 规则 2: 双路 + WS-PREV-KEY + 累加器 → キーブレイク
+    if total_ifs >= 1 and has_prev_key and has_accumulator:
+        evidence.append(f"WS-PREV-KEY 存在 + 累加器存在 + IF 分支 → キーブレイク")
+        return {"resolved_type": "キーブレイク", "confidence": 0.85, "evidence": evidence}
+
+    # 补充规则: SELECT 文件数 >= 2 且 comparison 至少 1 → 倾向マッチング
+    if file_count >= 2 and comparison_ifs >= 1:
+        evidence.append(f"SELECT 文件数 >=2 + comparison IF >=1 → マッチング")
+        return {"resolved_type": "マッチング", "confidence": 0.75, "evidence": evidence}
+
+    # 回退: 无法明确判定
+    evidence.append(f"特征不足: total_ifs={total_ifs}, comparison={comparison_ifs}, "
+                     f"file_count={file_count}, has_prev_key={has_prev_key}, "
+                     f"has_accumulator={has_accumulator}")
+    return {"resolved_type": "unknown", "confidence": 0.0, "evidence": evidence}
+
+
+def resolve_dedup_vs_nodedup(features: dict) -> dict:
+    """区分「項目チェック(重複含む)」与「項目チェック(重複含まず)」。
+
+    规则:
+      - WS-PREV-KEY 存在 → 含重复
+      - 无 WS-PREV-KEY → 不含重复
+    """
+    variable_patterns = features.get("variable_patterns", {})
+    has_prev_key = variable_patterns.get("has_prev_key", False)
+    evidence: list[str] = []
+
+    if has_prev_key:
+        evidence.append("WS-PREV-KEY 存在 → 含重复")
+        return {"resolved_type": "項目チェック(重複含む)", "confidence": 0.90, "evidence": evidence}
+    else:
+        evidence.append("未检测到 WS-PREV-KEY → 不含重复")
+        return {"resolved_type": "項目チェック(重複含まず)", "confidence": 0.85, "evidence": evidence}
+
+
+def resolve_validation_vs_keybreak(features: dict) -> dict:
+    """区分「編集処理(校验)」与「キーブレイク」。
+
+    规则:
+      - WS-ERR* 相关字段存在 → 校验 (validation)
+      - WS-*CNT 累加计数器存在 → キーブレイク (key break)
+    """
+    variable_patterns = features.get("variable_patterns", {})
+    has_error_flag = variable_patterns.get("has_error_flag", False)
+    has_counter = variable_patterns.get("has_counter", False)
+    evidence: list[str] = []
+
+    if has_error_flag:
+        evidence.append("WS-ERR* 错误字段存在 → 校验")
+        return {"resolved_type": "編集処理(校验)", "confidence": 0.85, "evidence": evidence}
+
+    if has_counter:
+        evidence.append("WS-*CNT 计数器存在 → キーブレイク")
+        return {"resolved_type": "キーブレイク", "confidence": 0.80, "evidence": evidence}
+
+    evidence.append("既无错误字段也无计数器，无法判定")
+    return {"resolved_type": "unknown", "confidence": 0.0, "evidence": evidence}
+
+
+def resolve_csv_merge_vs_split(features: dict) -> dict:
+    """区分 CSV 合并与拆分。
+
+    规则:
+      - STRING 语句存在 → 无换行 (合并, merge)
+      - INSPECT REPLACING 存在 → 有换行 (拆分, split)
+    """
+    has_string = features.get("has_string", False)
+    has_inspect = features.get("has_inspect", False)
+    evidence: list[str] = []
+
+    if has_string:
+        evidence.append("STRING 语句存在 → CSV 合并 (无换行)")
+        return {"resolved_type": "CSV合并", "confidence": 0.85, "evidence": evidence}
+
+    if has_inspect:
+        evidence.append("INSPECT REPLACING 存在 → CSV 拆分 (有换行)")
+        return {"resolved_type": "CSV拆分", "confidence": 0.85, "evidence": evidence}
+
+    evidence.append("既无 STRING 也无 INSPECT REPLACING")
+    return {"resolved_type": "unknown", "confidence": 0.0, "evidence": evidence}
+
+
+def resolve_simple_vs_two_stage(features: dict) -> dict:
+    """区分「単純マッチング」与「二段階マッチング」。
+
+    规则:
+      - OPEN → CLOSE → 再 OPEN 模式 → 二级匹配
+      - 其他顺序 → 简单匹配
+    """
+    open_pattern = features.get("open_pattern", "")
+    evidence: list[str] = []
+
+    if open_pattern == "open-close-open":
+        evidence.append("OPEN→CLOSE→再OPEN 模式 → 二级匹配")
+        return {"resolved_type": "二段階マッチング", "confidence": 0.90, "evidence": evidence}
+    else:
+        evidence.append(f"OPEN 模式为 '{open_pattern}' → 简单匹配")
+        return {"resolved_type": "単純マッチング", "confidence": 0.80, "evidence": evidence}
+
+
+def resolve_pure_vs_mixed(features: dict) -> dict:
+    """区分「純粋マッチング」与「混合マッチング」。
+
+    规则:
+      - variable_patterns 中 has_switch 且 has_counter → 混合（隐含额外键比较）
+      - 有 PERFORM 且 多文件 → 可能混合
+      - 否则 → 纯粹匹配（低确信度，因无法静态确定有无额外键比较）
+    """
+    variable_patterns = features.get("variable_patterns", {})
+    if_types = features.get("if_types", {})
+    evidence: list[str] = []
+
+    has_switch = variable_patterns.get("has_switch", False)
+    has_counter = variable_patterns.get("has_counter", False)
+    if_count = if_types.get("total", 0)
+
+    if has_switch and has_counter and if_count >= 3:
+        evidence.append("多个变量模式和 IF 分支 → 可能混合匹配")
+        return {"resolved_type": "混合マッチング", "confidence": 0.70, "evidence": evidence}
+
+    evidence.append("无明确混合特征 → 纯粹匹配（需数据验证）")
+    return {"resolved_type": "unknown", "confidence": 0.0, "evidence": evidence}
+
+
+def resolve_division_50_25_100(features: dict) -> dict:
+    """区分 DIVIDE 被除数常量 50/25/100。
+
+    从 features["divide_constants"] 列表中匹配已知常量。
+    """
+    divide_constants = features.get("divide_constants", [])
+    evidence: list[str] = []
+
+    if not isinstance(divide_constants, (list, tuple)):
+        evidence.append("divide_constants 格式无效")
+        return {"resolved_type": "unknown", "confidence": 0.0, "evidence": evidence}
+
+    for c in divide_constants:
+        if c in (50, 25, 100):
+            evidence.append(f"DIVIDE 被除数 = {c}")
+            return {"resolved_type": f"DIVIDE_{c}", "confidence": 0.95, "evidence": evidence}
+
+    evidence.append(f"未匹配已知常量 (50/25/100)，当前值: {divide_constants}")
+    return {"resolved_type": "unknown", "confidence": 0.0, "evidence": evidence}
+
+
+def resolve_mn_output_mode(features: dict) -> dict:
+    """判断 M:N 输出模式。
+
+    规则:
+      - 根据文件或记录数判断 M:N 关系
+      - 返回 unknown 注明需数据验证
+    """
+    select_files = features.get("select_files", {})
+    file_count = len(select_files) if isinstance(select_files, dict) else features.get("file_count", 0)
+    evidence: list[str] = []
+
+    # 尝试判断 M:N（从现有特征推断）
+    select_count = len(select_files)
+    total_branches = features.get("total_branches", 0)
+    if select_count >= 2 and total_branches >= 3:
+        evidence.append(f"SELECT={select_count}, 分支={total_branches} → 可能 M:N")
+        return {"resolved_type": "M:N", "confidence": 0.65, "evidence": evidence}
+
+    if file_count >= 3:
+        evidence.append(f"文件数 {file_count} >= 3, 可能为 M:N 关系")
+        return {"resolved_type": "M:N", "confidence": 0.60, "evidence": evidence}
+
+    evidence.append("需数据验证确定 M:N 输出模式")
+    return {"resolved_type": "unknown", "confidence": 0.0, "evidence": evidence}
+
+
+# ── 调度表 ──────────────────────────────────────────────────────────────────
+
+_RESOLVER_MAP = {
+    "matching_vs_keybreak": resolve_matching_vs_keybreak,
+    "dedup_vs_nodedup": resolve_dedup_vs_nodedup,
+    "validation_vs_keybreak": resolve_validation_vs_keybreak,
+    "csv_merge_vs_split": resolve_csv_merge_vs_split,
+    "simple_vs_two_stage": resolve_simple_vs_two_stage,
+    "pure_vs_mixed": resolve_pure_vs_mixed,
+    "division_50_25_100": resolve_division_50_25_100,
+    "mn_output_mode": resolve_mn_output_mode,
+}
+
+
+def resolve_confusion_pair(features: dict, pair_name: str) -> dict:
+    """Dispatch to the correct function by pair_name."""
+    resolver = _RESOLVER_MAP.get(pair_name)
+    if resolver is None:
+        return {
+            "resolved_type": "unknown",
+            "confidence": 0.0,
+            "evidence": [f"未知混淆对名称: {pair_name}"],
+        }
+    return resolver(features)
@@ -0,0 +1,153 @@
+"""矛盾检测与解决 — 检测来自不同混淆组的类型冲突。
+
+CONTRADICTION_PAIRS 定义了可能会矛盾的分类类型对。
+"""
+
+from __future__ import annotations
+
+from typing import Any
+
+# ── 矛盾对定义 ──────────────────────────────────────────────────────────────
+
+CONTRADICTION_PAIRS: list[dict[str, str]] = [
+    {
+        "name": "matching_vs_keybreak",
+        "type_a": "マッチング",
+        "type_b": "キーブレイク",
+    },
+    {
+        "name": "dedup_vs_nodedup",
+        "type_a": "項目チェック(重複含む)",
+        "type_b": "項目チェック(重複含まず)",
+    },
+    {
+        "name": "validation_vs_keybreak",
+        "type_a": "編集処理(校验)",
+        "type_b": "キーブレイク",
+    },
+    {
+        "name": "csv_merge_vs_split",
+        "type_a": "CSV合并",
+        "type_b": "CSV拆分",
+    },
+    {
+        "name": "simple_vs_two_stage",
+        "type_a": "単純マッチング",
+        "type_b": "二段階マッチング",
+    },
+    {
+        "name": "pure_vs_mixed",
+        "type_a": "純粋マッチング",
+        "type_b": "混合マッチング",
+    },
+    {
+        "name": "division_50_25_100",
+        "type_a": "DIVIDE_50",
+        "type_b": "DIVIDE_100",
+    },
+    {
+        "name": "mn_output_mode",
+        "type_a": "M:N",
+        "type_b": "1:1",
+    },
+]
+
+# ── 冲突优先级: 当同一种类型被多个混淆组判定时，优先级高者胜出 ──────────
+
+TYPE_PRIORITY: dict[str, int] = {
+    "マッチング": 10,
+    "キーブレイク": 9,
+    "項目チェック(重複含む)": 8,
+    "項目チェック(重複含まず)": 8,
+    "編集処理(校验)": 7,
+    "CSV合并": 6,
+    "CSV拆分": 6,
+    "単純マッチング": 5,
+    "二段階マッチング": 5,
+    "純粋マッチング": 4,
+    "混合マッチング": 4,
+    "DIVIDE_50": 3,
+    "DIVIDE_100": 3,
+    "DIVIDE_25": 3,
+    "M:N": 2,
+    "1:1": 2,
+}
+
+
+def detect_contradictions(features: dict) -> list[dict]:
+    """检测可能矛盾的类型对，返回矛盾列表。
+
+    检查 features["resolved_types"] 中已判定的类型，
+    如果同一混淆组内两个类型同时存在，或不同组的类型存在冲突，则记录。
+
+    Args:
+        features: 包含所有已判定的 resolved_types 字典。
+
+    Returns:
+        矛盾列表。每个元素格式: {"name": str, "type_a": str, "type_b": str}
+    """
+    resolved_types: dict[str, str] = features.get("resolved_types", {})
+    if not resolved_types:
+        return []
+
+    contradictions: list[dict] = []
+
+    for pair in CONTRADICTION_PAIRS:
+        name = pair["name"]
+        type_a = pair["type_a"]
+        type_b = pair["type_b"]
+
+        # 检查该混淆组的判定结果中是否同时包含两个类型
+        for key, resolved_type in resolved_types.items():
+            if resolved_type == type_a:
+                for other_key, other_type in resolved_types.items():
+                    if other_key != key and other_type == type_b:
+                        contradictions.append({
+                            "name": name,
+                            "type_a": type_a,
+                            "type_b": type_b,
+                            "source_a": key,
+                            "source_b": other_key,
+                        })
+                        break
+                break
+
+    return contradictions
+
+
+def resolve_contradiction(features: dict, contradiction: dict) -> str:
+    """解决矛盾，返回胜出的类型名。
+
+    策略:
+      1. 根据 TYPE_PRIORITY 取优先级高的类型。
+      2. 若优先级相同，根据 features 中的额外证据选择。
+
+    Args:
+        features: 完整特征字典。
+        contradiction: detect_contradictions 返回的单个矛盾。
+
+    Returns:
+        胜出的类型名称。
+    """
+    type_a = contradiction["type_a"]
+    type_b = contradiction["type_b"]
+
+    priority_a = TYPE_PRIORITY.get(type_a, 0)
+    priority_b = TYPE_PRIORITY.get(type_b, 0)
+
+    if priority_a > priority_b:
+        return type_a
+    elif priority_b > priority_a:
+        return type_b
+
+    # 优先级相同，尝试使用 confusion_groups 重判定
+    from .confusion_groups import resolve_confusion_pair
+
+    pair_name = contradiction.get("name", "")
+    if pair_name:
+        result = resolve_confusion_pair(features, pair_name)
+        if result.get("confidence", 0) >= 0.80:
+            return result["resolved_type"]
+
+    # 最终回退: 取 type_a
+    return type_a