Files
cobol-java-v3/hina/rule_engine/contradiction.py
NB-076 a6c454692a fix: resolve 3 MEDIUM code review findings
M1: Cache confusion-pair confidences in Path B (eliminate redundant
    resolve_confusion_pair re-calls in _path_rule_engine)
M2: Resolve contradictions in Path C instead of hardcoding
    resolved_count=0 in _path_llm_assisted
M4: Add DIVIDE_25 to contradiction pair coverage (50-25, 100-25)
    and update test_contradiction_pairs_defined to verify all 3 variants
2026-06-21 11:25:59 +08:00

164 lines
4.8 KiB
Python

"""矛盾检测与解决 — 检测来自不同混淆组的类型冲突。
CONTRADICTION_PAIRS 定义了可能会矛盾的分类类型对。
"""
from __future__ import annotations
from typing import Any
# ── Contradiction pair definitions ──────────────────────────────────────────
# Each entry names a pair of classification types that must not both appear
# among the resolved types. Several entries may share one "name" (the three
# DIVIDE_* combinations all use "division_50_25_100").
CONTRADICTION_PAIRS: list[dict[str, str]] = [
    {"name": pair_name, "type_a": first_type, "type_b": second_type}
    for pair_name, first_type, second_type in (
        ("matching_vs_keybreak", "マッチング", "キーブレイク"),
        ("dedup_vs_nodedup", "項目チェック(重複含む)", "項目チェック(重複含まず)"),
        ("validation_vs_keybreak", "編集処理(校验)", "キーブレイク"),
        ("csv_merge_vs_split", "CSV合并", "CSV拆分"),
        ("simple_vs_two_stage", "単純マッチング", "二段階マッチング"),
        ("pure_vs_mixed", "純粋マッチング", "混合マッチング"),
        ("division_50_25_100", "DIVIDE_50", "DIVIDE_100"),
        ("division_50_25_100", "DIVIDE_50", "DIVIDE_25"),
        ("division_50_25_100", "DIVIDE_100", "DIVIDE_25"),
        ("mn_output_mode", "M:N", "1:1"),
    )
]


# ── Conflict priority ───────────────────────────────────────────────────────
# When two contradicting types are compared, the one with the larger value
# wins; types sharing a value are tied. Entries are grouped by rank below.
TYPE_PRIORITY: dict[str, int] = {
    type_name: rank
    for rank, type_names in (
        (10, ("マッチング",)),
        (9, ("キーブレイク",)),
        (8, ("項目チェック(重複含む)", "項目チェック(重複含まず)")),
        (7, ("編集処理(校验)",)),
        (6, ("CSV合并", "CSV拆分")),
        (5, ("単純マッチング", "二段階マッチング")),
        (4, ("純粋マッチング", "混合マッチング")),
        (3, ("DIVIDE_50", "DIVIDE_100", "DIVIDE_25")),
        (2, ("M:N", "1:1")),
    )
    for type_name in type_names
}


def detect_contradictions(features: dict) -> list[dict]:
    """Report every contradiction pair whose two types were both resolved.

    Reads ``features["resolved_types"]`` and, for each entry of
    ``CONTRADICTION_PAIRS``, checks whether one key resolved to the entry's
    ``type_a`` while a different key resolved to its ``type_b``. At most one
    contradiction is reported per pair entry, built from the first matching
    key (in mapping order) on each side.

    Args:
        features: Feature dictionary; only its ``"resolved_types"`` mapping
            (key -> resolved type name) is consulted.

    Returns:
        A list of contradiction records, each of the form
        ``{"name", "type_a", "type_b", "source_a", "source_b"}`` where
        ``source_a`` / ``source_b`` are the ``resolved_types`` keys that
        produced ``type_a`` / ``type_b``. The list is empty when
        ``resolved_types`` is missing or empty, or when nothing conflicts.
    """
    resolved_types: dict[str, str] = features.get("resolved_types", {})
    if not resolved_types:
        return []

    # Private sentinel so that any key value (even None) can be a valid hit.
    not_found = object()
    found: list[dict] = []

    for pair in CONTRADICTION_PAIRS:
        group_name = pair["name"]
        first_type = pair["type_a"]
        second_type = pair["type_b"]

        # Only the first key that resolved to type_a is considered.
        source_a = next(
            (key for key, kind in resolved_types.items() if kind == first_type),
            not_found,
        )
        if source_a is not_found:
            continue

        # Look for a *different* key that resolved to the opposing type.
        source_b = next(
            (
                key
                for key, kind in resolved_types.items()
                if key != source_a and kind == second_type
            ),
            not_found,
        )
        if source_b is not_found:
            continue

        found.append({
            "name": group_name,
            "type_a": first_type,
            "type_b": second_type,
            "source_a": source_a,
            "source_b": source_b,
        })

    return found


def resolve_contradiction(features: dict, contradiction: dict) -> str:
    """Pick the winning type for a single contradiction.

    Resolution order:
      1. The type with the strictly higher ``TYPE_PRIORITY`` value wins
         (types missing from the table count as priority 0).
      2. On a tie, ``resolve_confusion_pair`` is asked to re-judge using the
         contradiction's ``"name"``; its ``"resolved_type"`` is returned when
         its ``"confidence"`` is at least 0.80.
      3. Otherwise ``type_a`` is returned as the final fallback.

    Args:
        features: Full feature dictionary, forwarded to
            ``resolve_confusion_pair`` on a priority tie.
        contradiction: One record as returned by ``detect_contradictions``;
            must contain ``"type_a"`` and ``"type_b"``, may contain ``"name"``.

    Returns:
        The name of the winning type.
    """
    first_type = contradiction["type_a"]
    second_type = contradiction["type_b"]

    rank_first = TYPE_PRIORITY.get(first_type, 0)
    rank_second = TYPE_PRIORITY.get(second_type, 0)
    if rank_first != rank_second:
        return first_type if rank_first > rank_second else second_type

    # Tie on priority: fall back to the confusion-group resolver.
    # NOTE(review): import is kept function-local as in the original —
    # presumably to avoid an import cycle; confirm before hoisting.
    from .confusion_groups import resolve_confusion_pair

    group_name = contradiction.get("name", "")
    if group_name:
        verdict = resolve_confusion_pair(features, group_name)
        if verdict.get("confidence", 0) >= 0.80:
            return verdict["resolved_type"]

    # Last resort: no confident re-judgement available, keep type_a.
    return first_type