feat: Phase 2 complete — 13 Phases of COBOL type classification and test benchmark
P0.6: gcov infrastructure
P1: extract_structure output expansion (11 new feature fields)
P2: Confusion group rule engine (8 pairs + contradiction + backtrack)
P3: 4-factor confidence calculation + quality gate update
P4: 33+2 COBOL program type test samples (22 files, 7 categories)
P5: parametrized/ test data generation engine
P6: japanese_data.py lookup tables
P7-10: Type-specific test suites (~159 parametrized tests)
P11: Full classification pipeline (classify_program) + orchestrator integration
P12: Documentation (module-interfaces, test-plan v3.0, coverage-matrix)

Architecture decisions:
- classification_pipeline/ merged into hina/pipeline/
- parametrized/ as independent module
- japanese_data.py as root-level file
- hina/__all__ only exports classify_program()

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,153 @@
|
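"""Contradiction detection and resolution.

Detects type conflicts between the verdicts of different confusion groups.

CONTRADICTION_PAIRS defines the pairs of classification types that may
contradict each other.
"""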
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Any
|
||||
|
||||
# ── Contradiction pair definitions ──────────────────────────────────────────
#
# Each entry names two classification types that are treated as contradictory
# when both appear among the resolved types. The "name" value is also what
# resolve_contradiction() passes to resolve_confusion_pair() when the two
# types have equal priority.
#
# NOTE(review): "division_50_25_100" mentions three variants but only
# DIVIDE_50 vs DIVIDE_100 is listed here; DIVIDE_25 has a TYPE_PRIORITY entry
# but takes part in no pair — confirm whether that is intentional.

CONTRADICTION_PAIRS: list[dict[str, str]] = [
    {
        "name": "matching_vs_keybreak",
        "type_a": "マッチング",
        "type_b": "キーブレイク",
    },
    {
        "name": "dedup_vs_nodedup",
        "type_a": "項目チェック(重複含む)",
        "type_b": "項目チェック(重複含まず)",
    },
    {
        "name": "validation_vs_keybreak",
        "type_a": "編集処理(校验)",
        "type_b": "キーブレイク",
    },
    {
        "name": "csv_merge_vs_split",
        "type_a": "CSV合并",
        "type_b": "CSV拆分",
    },
    {
        "name": "simple_vs_two_stage",
        "type_a": "単純マッチング",
        "type_b": "二段階マッチング",
    },
    {
        "name": "pure_vs_mixed",
        "type_a": "純粋マッチング",
        "type_b": "混合マッチング",
    },
    {
        "name": "division_50_25_100",
        "type_a": "DIVIDE_50",
        "type_b": "DIVIDE_100",
    },
    {
        "name": "mn_output_mode",
        "type_a": "M:N",
        "type_b": "1:1",
    },
]
|
||||
|
||||
# ── Conflict priority ───────────────────────────────────────────────────────
#
# When two contradictory types are both present, the one with the higher
# priority wins. Types missing from this table are looked up with a default
# of 0 by resolve_contradiction(). Equal priorities are not decided here;
# resolve_contradiction() falls back to re-adjudication for those.

TYPE_PRIORITY: dict[str, int] = {
    "マッチング": 10,
    "キーブレイク": 9,
    "項目チェック(重複含む)": 8,
    "項目チェック(重複含まず)": 8,
    "編集処理(校验)": 7,
    "CSV合并": 6,
    "CSV拆分": 6,
    "単純マッチング": 5,
    "二段階マッチング": 5,
    "純粋マッチング": 4,
    "混合マッチング": 4,
    "DIVIDE_50": 3,
    "DIVIDE_100": 3,
    "DIVIDE_25": 3,
    "M:N": 2,
    "1:1": 2,
}
|
||||
|
||||
|
||||
def detect_contradictions(
    features: dict,
    *,
    pairs: list[dict[str, str]] | None = None,
) -> list[dict]:
    """Detect contradictory type verdicts among the resolved types.

    Looks at ``features["resolved_types"]`` (a mapping of source key to the
    type name resolved for it) and, for every contradiction pair, checks
    whether one source resolved to ``type_a`` while a different source
    resolved to ``type_b``.

    At most one contradiction is reported per pair: the first source (in
    mapping order) holding ``type_a`` together with the first other source
    holding ``type_b``.

    Args:
        features: Feature dictionary; only the ``"resolved_types"`` entry is
            read. A missing, ``None`` or empty entry yields no contradictions.
        pairs: Optional override for the pair table. Each item needs the
            keys ``"name"``, ``"type_a"`` and ``"type_b"``. Defaults to the
            module-level ``CONTRADICTION_PAIRS``.

    Returns:
        A list of contradictions, in pair-table order. Each element has the
        form ``{"name": str, "type_a": str, "type_b": str,
        "source_a": str, "source_b": str}``, where ``source_a`` and
        ``source_b`` are the ``resolved_types`` keys that produced the two
        conflicting types.
    """
    resolved_types: dict[str, str] = features.get("resolved_types") or {}
    if not resolved_types:
        return []

    if pairs is None:
        pairs = CONTRADICTION_PAIRS

    # Private sentinel so that any key value (even None) can be a real match.
    missing = object()
    contradictions: list[dict] = []

    for pair in pairs:
        type_a = pair["type_a"]
        type_b = pair["type_b"]

        # First source whose verdict is type_a.
        source_a = next(
            (key for key, resolved in resolved_types.items() if resolved == type_a),
            missing,
        )
        if source_a is missing:
            continue

        # First *other* source whose verdict is type_b.
        source_b = next(
            (
                key
                for key, resolved in resolved_types.items()
                if key != source_a and resolved == type_b
            ),
            missing,
        )
        if source_b is missing:
            continue

        contradictions.append({
            "name": pair["name"],
            "type_a": type_a,
            "type_b": type_b,
            "source_a": source_a,
            "source_b": source_b,
        })

    return contradictions
|
||||
|
||||
|
||||
def resolve_contradiction(features: dict, contradiction: dict) -> str:
    """Resolve a contradiction and return the name of the winning type.

    Strategy:
        1. Pick the type with the higher ``TYPE_PRIORITY`` (types not in the
           table count as priority 0).
        2. On a tie, re-adjudicate with ``resolve_confusion_pair`` using the
           contradiction's ``"name"``; its verdict is accepted only when it
           carries a type and a confidence of at least 0.80.
        3. Otherwise fall back to ``type_a``.

    Args:
        features: The full feature dictionary; forwarded unchanged to
            ``resolve_confusion_pair`` on a priority tie.
        contradiction: A single element returned by ``detect_contradictions``.
            Must contain ``"type_a"`` and ``"type_b"``; ``"name"`` is optional.

    Returns:
        The name of the winning type.

    Raises:
        KeyError: If ``contradiction`` lacks ``"type_a"`` or ``"type_b"``.
    """
    type_a = contradiction["type_a"]
    type_b = contradiction["type_b"]

    priority_a = TYPE_PRIORITY.get(type_a, 0)
    priority_b = TYPE_PRIORITY.get(type_b, 0)

    if priority_a != priority_b:
        return type_a if priority_a > priority_b else type_b

    # Equal priority: try re-adjudication through confusion_groups.
    # Imported here rather than at module top, as in the original code.
    from .confusion_groups import resolve_confusion_pair

    pair_name = contradiction.get("name", "")
    if pair_name:
        # NOTE(review): assumes resolve_confusion_pair returns a dict with
        # "resolved_type" and "confidence" keys, and that it recognises the
        # contradiction-pair names — confirm against confusion_groups.
        result = resolve_confusion_pair(features, pair_name) or {}
        resolved_type = result.get("resolved_type")
        confidence = result.get("confidence") or 0
        # A verdict without a type (or with a None confidence) is treated as
        # "no decision" and falls through instead of raising.
        if resolved_type and confidence >= 0.80:
            # NOTE(review): the verdict is not checked to be one of
            # type_a / type_b — confirm whether it can name another type.
            return resolved_type

    # Final fallback: take type_a.
    return type_a
|
||||
Reference in New Issue
Block a user