a6c454692a
M1: Cache confusion-pair confidences in Path B (eliminate redundant
resolve_confusion_pair re-calls in _path_rule_engine)
M2: Resolve contradictions in Path C instead of hardcoding
resolved_count=0 in _path_llm_assisted
M4: Add DIVIDE_25 to contradiction pair coverage (50-25, 100-25)
and update test_contradiction_pairs_defined to verify all 3 variants
164 lines
4.8 KiB
Python
164 lines
4.8 KiB
Python
"""矛盾检测与解决 — 检测来自不同混淆组的类型冲突。
|
|
|
|
CONTRADICTION_PAIRS 定义了可能会矛盾的分类类型对。
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
from typing import Any
|
|
|
|
# ── Contradiction pair definitions ──────────────────────────────────────────
# Each entry is a dict with a "name" plus the two type names ("type_a",
# "type_b") that are considered contradictory when both have been resolved.
# The three DIVIDE_* combinations deliberately share one name.

CONTRADICTION_PAIRS: list[dict[str, str]] = [
    {"name": pair_name, "type_a": first_type, "type_b": second_type}
    for pair_name, first_type, second_type in (
        ("matching_vs_keybreak", "マッチング", "キーブレイク"),
        ("dedup_vs_nodedup", "項目チェック(重複含む)", "項目チェック(重複含まず)"),
        ("validation_vs_keybreak", "編集処理(校验)", "キーブレイク"),
        ("csv_merge_vs_split", "CSV合并", "CSV拆分"),
        ("simple_vs_two_stage", "単純マッチング", "二段階マッチング"),
        ("pure_vs_mixed", "純粋マッチング", "混合マッチング"),
        ("division_50_25_100", "DIVIDE_50", "DIVIDE_100"),
        ("division_50_25_100", "DIVIDE_50", "DIVIDE_25"),
        ("division_50_25_100", "DIVIDE_100", "DIVIDE_25"),
        ("mn_output_mode", "M:N", "1:1"),
    )
]
|
|
|
|
# ── Conflict priority: when two contradictory types compete, the one with
# the higher value wins (see resolve_contradiction) ─────────────────────────

TYPE_PRIORITY: dict[str, int] = {
    type_name: rank
    for rank, type_names in (
        (10, ("マッチング",)),
        (9, ("キーブレイク",)),
        (8, ("項目チェック(重複含む)", "項目チェック(重複含まず)")),
        (7, ("編集処理(校验)",)),
        (6, ("CSV合并", "CSV拆分")),
        (5, ("単純マッチング", "二段階マッチング")),
        (4, ("純粋マッチング", "混合マッチング")),
        (3, ("DIVIDE_50", "DIVIDE_100", "DIVIDE_25")),
        (2, ("M:N", "1:1")),
    )
    for type_name in type_names
}
|
|
|
|
|
|
def detect_contradictions(features: dict) -> list[dict]:
    """Return the contradictory type pairs present in the resolved types.

    Reads ``features["resolved_types"]`` (a mapping of source key to resolved
    type name) and, for every entry of CONTRADICTION_PAIRS, checks whether one
    key resolved to ``type_a`` while a different key resolved to ``type_b``.
    At most one contradiction is reported per pair definition: the first key
    holding ``type_a`` combined with the first other key holding ``type_b``.

    Args:
        features: Feature dictionary; only its "resolved_types" entry is read.

    Returns:
        A list of contradictions, each shaped as
        ``{"name", "type_a", "type_b", "source_a", "source_b"}`` where the two
        ``source_*`` values are the keys that produced the conflicting types.
        Empty when "resolved_types" is missing or empty.
    """
    resolved_types: dict[str, str] = features.get("resolved_types", {})
    if not resolved_types:
        return []

    # Sentinel distinguishes "no match" from any legitimate key value.
    absent = object()
    found: list[dict] = []

    for pair in CONTRADICTION_PAIRS:
        label = pair["name"]
        first_type = pair["type_a"]
        second_type = pair["type_b"]

        # First source whose resolved type is type_a.
        holder_a = next(
            (src for src, resolved in resolved_types.items() if resolved == first_type),
            absent,
        )
        if holder_a is absent:
            continue

        # First *other* source whose resolved type is type_b.
        holder_b = next(
            (
                src
                for src, resolved in resolved_types.items()
                if src != holder_a and resolved == second_type
            ),
            absent,
        )
        if holder_b is absent:
            continue

        found.append({
            "name": label,
            "type_a": first_type,
            "type_b": second_type,
            "source_a": holder_a,
            "source_b": holder_b,
        })

    return found
|
|
|
|
|
|
def resolve_contradiction(features: dict, contradiction: dict) -> str:
    """Resolve a contradiction and return the name of the winning type.

    Strategy:
        1. The type with the strictly higher TYPE_PRIORITY value wins; a type
           missing from TYPE_PRIORITY counts as priority 0.
        2. On a tie, resolve_confusion_pair is asked to re-judge the pair
           named by the contradiction. Its answer is used only when it
           provides a resolved type with a confidence of at least 0.80.
        3. Otherwise ``type_a`` is returned.

    Args:
        features: Full feature dictionary; passed unchanged to
            resolve_confusion_pair on the tie-break path.
        contradiction: A single entry as returned by detect_contradictions.
            Must contain "type_a" and "type_b"; "name" is optional.

    Returns:
        The name of the winning type.

    Raises:
        KeyError: If ``contradiction`` lacks "type_a" or "type_b".
    """
    type_a = contradiction["type_a"]
    type_b = contradiction["type_b"]

    priority_a = TYPE_PRIORITY.get(type_a, 0)
    priority_b = TYPE_PRIORITY.get(type_b, 0)

    if priority_a > priority_b:
        return type_a
    if priority_b > priority_a:
        return type_b

    # Equal priority: try re-judging through the confusion groups.
    # Imported lazily, only on this path (presumably to avoid a circular
    # import between the two modules — confirm before hoisting).
    from .confusion_groups import resolve_confusion_pair

    pair_name = contradiction.get("name", "")
    if pair_name:
        result = resolve_confusion_pair(features, pair_name)
        # Read both fields defensively: a partial result (no resolved type,
        # or a missing/None confidence) falls through to the final fallback
        # instead of raising.
        winner = result.get("resolved_type")
        confidence = result.get("confidence") or 0
        if winner is not None and confidence >= 0.80:
            return winner

    # Final fallback: take type_a.
    return type_a
|