# cobol-java-v3/hina/rule_engine/contradiction.py

"""矛盾检测与解决 — 检测来自不同混淆组的类型冲突。

CONTRADICTION_PAIRS 定义了可能会矛盾的分类类型对。
"""

from __future__ import annotations

from typing import Any

# ── Contradiction pair definitions ──────────────────────────────────────────
#
# Each entry is {"name", "type_a", "type_b"}: two classification types that are
# reported as a contradiction when both appear among the resolved types.
# "name" is the label later passed to resolve_confusion_pair; the three
# DIVIDE_* entries deliberately share a single label.

CONTRADICTION_PAIRS: list[dict[str, str]] = [
    {"name": label, "type_a": left, "type_b": right}
    for label, left, right in (
        ("matching_vs_keybreak", "マッチング", "キーブレイク"),
        ("dedup_vs_nodedup", "項目チェック(重複含む)", "項目チェック(重複含まず)"),
        ("validation_vs_keybreak", "編集処理(校验)", "キーブレイク"),
        ("csv_merge_vs_split", "CSV合并", "CSV拆分"),
        ("simple_vs_two_stage", "単純マッチング", "二段階マッチング"),
        ("pure_vs_mixed", "純粋マッチング", "混合マッチング"),
        ("division_50_25_100", "DIVIDE_50", "DIVIDE_100"),
        ("division_50_25_100", "DIVIDE_50", "DIVIDE_25"),
        ("division_50_25_100", "DIVIDE_100", "DIVIDE_25"),
        ("mn_output_mode", "M:N", "1:1"),
    )
]

# ── Conflict priority ───────────────────────────────────────────────────────
#
# When two contradictory types are both resolved, the type with the larger
# value wins. Types listed in the same tier have equal priority, so priority
# alone cannot separate them.

TYPE_PRIORITY: dict[str, int] = {
    type_name: rank
    for rank, tier in (
        (10, ("マッチング",)),
        (9, ("キーブレイク",)),
        (8, ("項目チェック(重複含む)", "項目チェック(重複含まず)")),
        (7, ("編集処理(校验)",)),
        (6, ("CSV合并", "CSV拆分")),
        (5, ("単純マッチング", "二段階マッチング")),
        (4, ("純粋マッチング", "混合マッチング")),
        (3, ("DIVIDE_50", "DIVIDE_100", "DIVIDE_25")),
        (2, ("M:N", "1:1")),
    )
    for type_name in tier
}


def detect_contradictions(features: dict) -> list[dict]:
    """Report every contradiction pair whose two types were both resolved.

    Reads ``features["resolved_types"]`` and, for each entry of
    ``CONTRADICTION_PAIRS``, checks whether one source resolved to the entry's
    ``type_a`` while a different source resolved to its ``type_b``. At most one
    contradiction is reported per entry, using the first matching source on
    each side in the mapping's iteration order.

    Args:
        features: Feature dictionary. Only its ``"resolved_types"`` entry is
            read here; a missing or empty entry yields no contradictions.

    Returns:
        Contradictions in ``CONTRADICTION_PAIRS`` order. Each item has the
        form ``{"name": str, "type_a": str, "type_b": str, "source_a": ...,
        "source_b": ...}``, where ``source_a`` / ``source_b`` are the
        ``resolved_types`` keys that hold ``type_a`` / ``type_b``.
    """
    resolved = features.get("resolved_types", {})
    if not resolved:
        return []

    # Private sentinel so that any key value, including None, can be a source.
    absent = object()
    found: list[dict] = []

    for pair in CONTRADICTION_PAIRS:
        label, first_type, second_type = pair["name"], pair["type_a"], pair["type_b"]

        # First source that resolved to type_a; without one there is no conflict.
        holder_a = next(
            (src for src, kind in resolved.items() if kind == first_type),
            absent,
        )
        if holder_a is absent:
            continue

        # First *other* source that resolved to type_b.
        holder_b = next(
            (
                src
                for src, kind in resolved.items()
                if src != holder_a and kind == second_type
            ),
            absent,
        )
        if holder_b is absent:
            continue

        found.append({
            "name": label,
            "type_a": first_type,
            "type_b": second_type,
            "source_a": holder_a,
            "source_b": holder_b,
        })

    return found


def resolve_contradiction(features: dict, contradiction: dict) -> str:
    """Pick the winning type for a single contradiction.

    Resolution order:
      1. The type with the strictly higher ``TYPE_PRIORITY`` value wins; a
         type missing from the table counts as priority 0.
      2. On a tie, ``resolve_confusion_pair`` is called with the
         contradiction's ``"name"``, and its ``"resolved_type"`` is returned
         when its ``"confidence"`` is at least 0.80.
      3. Otherwise ``type_a`` is returned.

    Args:
        features: Full feature dictionary, forwarded unchanged to
            ``resolve_confusion_pair`` on the tie path.
        contradiction: One item returned by ``detect_contradictions``; must
            contain ``"type_a"`` and ``"type_b"``, and may contain ``"name"``.

    Returns:
        The name of the winning type.
    """
    type_a, type_b = contradiction["type_a"], contradiction["type_b"]
    rank_a = TYPE_PRIORITY.get(type_a, 0)
    rank_b = TYPE_PRIORITY.get(type_b, 0)

    # A strict priority difference settles the matter immediately.
    if rank_a > rank_b:
        return type_a
    if rank_b > rank_a:
        return type_b

    # Tie: imported here so the module is loaded only when a tie-break is needed.
    from .confusion_groups import resolve_confusion_pair

    pair_name = contradiction.get("name", "")
    if pair_name:
        verdict = resolve_confusion_pair(features, pair_name)
        is_confident = verdict.get("confidence", 0) >= 0.80
        if is_confident:
            return verdict["resolved_type"]

    # Final fallback: no name to re-evaluate with, or the verdict was too weak.
    return type_a