feat: Phase 2 complete — 13 Phases of COBOL type classification and test benchmark

P0.6: gcov infrastructure P1: extract_structure output expansion (11 new feature fields) P2: Confusion group rule engine (8 pairs + contradiction + backtrack) P3: 4-factor confidence calculation + quality gate update P4: 33+2 COBOL program type test samples (22 files, 7 categories) P5: parametrized/ test data generation engine P6: japanese_data.py lookup tables P7-10: Type-specific test suites (~159 parametrized tests) P11: Full classification pipeline (classify_program) + orchestrator integration P12: Documentation (module-interfaces, test-plan v3.0, coverage-matrix) Architecture decisions: - classification_pipeline/ merged to hina/pipeline/ - parametrized/ as independent module - japanese_data.py as root-level file - hina/__all__ only exports classify_program() Co-Authored-By: Claude <noreply@anthropic.com>
2026-06-19 23:51:55 +08:00
parent 63b5284715
commit bc1d56d1a4
129 changed files with 19378 additions and 261 deletions
@@ -0,0 +1,153 @@
+"""矛盾检测与解决 — 检测来自不同混淆组的类型冲突。
+
+CONTRADICTION_PAIRS 定义了可能会矛盾的分类类型对。
+"""
+
+from __future__ import annotations
+
+from typing import Any
+
+# ── Contradiction pair definitions ──────────────────────────────────────────
+#
+# Each entry describes one pair of classification types that
+# detect_contradictions() reports when both appear as values of
+# features["resolved_types"] under different keys:
+#   "name"   - label of the pair; resolve_contradiction() passes it to
+#              resolve_confusion_pair() when the two types tie on priority.
+#   "type_a" - first type of the pair (also the final fallback winner in
+#              resolve_contradiction()).
+#   "type_b" - second type of the pair.
+
+CONTRADICTION_PAIRS: list[dict[str, str]] = [
+    {
+        "name": "matching_vs_keybreak",
+        "type_a": "マッチング",
+        "type_b": "キーブレイク",
+    },
+    {
+        "name": "dedup_vs_nodedup",
+        "type_a": "項目チェック(重複含む)",
+        "type_b": "項目チェック(重複含まず)",
+    },
+    {
+        "name": "validation_vs_keybreak",
+        "type_a": "編集処理(校验)",
+        "type_b": "キーブレイク",
+    },
+    {
+        "name": "csv_merge_vs_split",
+        "type_a": "CSV合并",
+        "type_b": "CSV拆分",
+    },
+    {
+        "name": "simple_vs_two_stage",
+        "type_a": "単純マッチング",
+        "type_b": "二段階マッチング",
+    },
+    {
+        "name": "pure_vs_mixed",
+        "type_a": "純粋マッチング",
+        "type_b": "混合マッチング",
+    },
+    # NOTE(review): the name mentions 25, but only DIVIDE_50 vs DIVIDE_100 is
+    # listed here; DIVIDE_25 appears in TYPE_PRIORITY only. Confirm whether
+    # pairs involving DIVIDE_25 are intentionally absent.
+    {
+        "name": "division_50_25_100",
+        "type_a": "DIVIDE_50",
+        "type_b": "DIVIDE_100",
+    },
+    {
+        "name": "mn_output_mode",
+        "type_a": "M:N",
+        "type_b": "1:1",
+    },
+]
+
+# ── Conflict priority: when the same type is decided by several confusion
+# groups, the type with the higher priority wins ────────────────────────────
+#
+# Consulted by resolve_contradiction(): of the two types in a contradiction
+# the one with the larger value is returned, and a type missing from this
+# table is treated as priority 0.  Most CONTRADICTION_PAIRS members share the
+# same value (8/8, 6/6, 5/5, 4/4, 3/3, 2/2), so for those pairs the priority
+# comparison ties and resolve_contradiction() falls through to its
+# tie-break path.
+
+TYPE_PRIORITY: dict[str, int] = {
+    "マッチング": 10,
+    "キーブレイク": 9,
+    "項目チェック(重複含む)": 8,
+    "項目チェック(重複含まず)": 8,
+    "編集処理(校验)": 7,
+    "CSV合并": 6,
+    "CSV拆分": 6,
+    "単純マッチング": 5,
+    "二段階マッチング": 5,
+    "純粋マッチング": 4,
+    "混合マッチング": 4,
+    "DIVIDE_50": 3,
+    "DIVIDE_100": 3,
+    "DIVIDE_25": 3,
+    "M:N": 2,
+    "1:1": 2,
+}
+
+
+def detect_contradictions(features: dict) -> list[dict]:
+    """检测可能矛盾的类型对，返回矛盾列表。
+
+    检查 features["resolved_types"] 中已判定的类型，
+    如果同一混淆组内两个类型同时存在，或不同组的类型存在冲突，则记录。
+
+    Args:
+        features: 包含所有已判定的 resolved_types 字典。
+
+    Returns:
+        矛盾列表。每个元素格式: {"name": str, "type_a": str, "type_b": str}
+    """
+    resolved_types: dict[str, str] = features.get("resolved_types", {})
+    if not resolved_types:
+        return []
+
+    contradictions: list[dict] = []
+
+    for pair in CONTRADICTION_PAIRS:
+        name = pair["name"]
+        type_a = pair["type_a"]
+        type_b = pair["type_b"]
+
+        # 检查该混淆组的判定结果中是否同时包含两个类型
+        for key, resolved_type in resolved_types.items():
+            if resolved_type == type_a:
+                for other_key, other_type in resolved_types.items():
+                    if other_key != key and other_type == type_b:
+                        contradictions.append({
+                            "name": name,
+                            "type_a": type_a,
+                            "type_b": type_b,
+                            "source_a": key,
+                            "source_b": other_key,
+                        })
+                        break
+                break
+
+    return contradictions
+
+
+def resolve_contradiction(features: dict, contradiction: dict) -> str:
+    """解决矛盾，返回胜出的类型名。
+
+    策略:
+      1. 根据 TYPE_PRIORITY 取优先级高的类型。
+      2. 若优先级相同，根据 features 中的额外证据选择。
+
+    Args:
+        features: 完整特征字典。
+        contradiction: detect_contradictions 返回的单个矛盾。
+
+    Returns:
+        胜出的类型名称。
+    """
+    type_a = contradiction["type_a"]
+    type_b = contradiction["type_b"]
+
+    priority_a = TYPE_PRIORITY.get(type_a, 0)
+    priority_b = TYPE_PRIORITY.get(type_b, 0)
+
+    if priority_a > priority_b:
+        return type_a
+    elif priority_b > priority_a:
+        return type_b
+
+    # 优先级相同，尝试使用 confusion_groups 重判定
+    from .confusion_groups import resolve_confusion_pair
+
+    pair_name = contradiction.get("name", "")
+    if pair_name:
+        result = resolve_confusion_pair(features, pair_name)
+        if result.get("confidence", 0) >= 0.80:
+            return result["resolved_type"]
+
+    # 最终回退: 取 type_a
+    return type_a