feat: structural matching detection — no KEY variable needed

Add _detect_matching_structure(), which detects matching programs from their control-flow pattern rather than from variable naming conventions. It uses 5 structural signals: (1) READ + AT END + EOF pattern; (2) PERFORM UNTIL with an EOF condition; (3) ELSE body with a conditional READ (the core of matching); (4) IF comparing two hyphenated fields (cross-file comparison); (5) multi-file OPEN INPUT. Confidence: 5/5 signals → 0.55, 4/5 → 0.50, 3/5 → 0.40; fewer than 3 signals → no detection. Real-world impact: matching programs with key fields named CUST-CODE and ORDR-CODE (no '-KEY' in the name) are now correctly detected. Also: (a) rule engine type priority: main types (マッチング etc.) override secondary types (M:N, DIVIDE) when keyword confidence is low; (b) has_structural_match is injected into features so the rule engine can use it; (c) matching_vs_keybreak accepts equality IFs as matching evidence; (d) new test: test_structural_matching_no_keyword(). Regression: 764 passed (0 new failures).
2026-06-21 15:28:32 +08:00
parent 33762ca959
commit da5d1058e7
4 changed files with 176 additions and 25 deletions
@@ -92,6 +92,48 @@ def _get_procedure_division(source_upper: str) -> str:
    return source_upper


+def _detect_matching_structure(source_upper: str) -> float:
+    """结构检测：不依赖变量名 KEY 的模式匹配检测。
+
+    通过分析 COBOL 程序的控制流结构判断是否为匹配程序。
+    返回确信度 0.0~0.55，0.0 表示不是匹配。
+
+    匹配程序的结构性特征:
+      信号 1: READ + AT END + EOF（文件读取循环）
+      信号 2: PERFORM UNTIL + EOF（主循环）
+      信号 3: ELSE 体内 READ（条件性读取——匹配核心）
+      信号 4: IF 比较两个连字号字段（跨文件字段比较）
+      信号 5: 2+ 文件 OPEN INPUT（多文件输入）
+    """
+    import re
+
+    signals = 0
+    # 信号 1: READ + AT END + EOF（文件读取循环）
+    if re.search(r'READ\s+\w+.*AT\s+END.*EOF', source_upper):
+        signals += 1
+    # 信号 2: PERFORM UNTIL + EOF（主循环）
+    if re.search(r'PERFORM\s+UNTIL\s+.*EOF', source_upper):
+        signals += 1
+    # 信号 3: ELSE 体内 READ（条件性读取）
+    if re.search(r'ELSE\s+.*READ\s+', source_upper):
+        signals += 1
+    # 信号 4: IF 比较两个连字号字段（跨文件字段比较）
+    if re.search(r'IF\s+\w+-\w+\s*[=<>]\s*\w+-\w+', source_upper):
+        signals += 1
+    # 信号 5: 2+ 文件 OPEN INPUT
+    if re.search(r'OPEN\s+INPUT\s+\w+\s+\w+', source_upper):
+        signals += 1
+
+    # 确信度: 5 中 5 = 0.55, 5 中 4 = 0.50, 5 中 3 = 0.40
+    if signals >= 5:
+        return 0.55
+    elif signals >= 4:
+        return 0.50
+    elif signals >= 3:
+        return 0.40
+    return 0.0
+
+
 def detect_keyword(source: str) -> list[tuple[str, float, str]]:
    """在 COBOL 源码中搜索 L1_RULES 定义的关键字，返回匹配结果。

@@ -135,6 +177,15 @@ def detect_keyword(source: str) -> list[tuple[str, float, str]]:
                    matched = True
                    break

+    # ── 结构性匹配检测（不依赖 KEY 变量名）──
+    match_conf = _detect_matching_structure(source_upper)
+    if match_conf > 0:
+        has_more_specific = any(
+            cat != "マッチング" for cat, _, _ in results
+        )
+        if not has_more_specific:
+            results.append(("マッチング", match_conf, "structural_matching"))
+
    return results


@@ -166,6 +166,13 @@ def _path_rule_engine(
            r'\b[A-Z]\d{0,2}-[\w-]*KEY\s*[=<>]',  # K01-KEY =
            su
        ))
+        # 注入 has_structural_match: 结构性匹配检测的结果（不依赖变量名 KEY）
+        # 当 detect_keyword 通过结构识别出匹配时，让规则引擎也能利用这个信号
+        features["has_structural_match"] = bool(re.search(
+            r'IF\s+\w+-\w+\s*[=<>]\s*\w+-\w+.*'  # 跨文件字段比较
+            r'(?:PERFORM|END-PERFORM|READ)',        # 含循环/读取
+            su, re.DOTALL
+        ))

    # 2. 运行所有混淆组解析器
    resolved_types: dict[str, str] = {}
@@ -205,19 +212,48 @@ def _path_rule_engine(
        final_category = keyword_info["category"]
        final_base_confidence = keyword_info["confidence"]

+    # 规则引擎结果优先级: 匹配检测 > 辅助推断
+    # マッチング/項目チェック/キーブレイク/編集処理 是主类型，优先级高
+    # M:N/DIVIDE 是辅助推断，仅当主类型未命中时才采纳
+    _MAIN_TYPE_PRIORITY = {"マッチング", "項目チェック(重複含む)", "項目チェック(重複含まず)",
+                           "キーブレイク", "編集処理(校验)", "二段階マッチング",
+                           "単純マッチング", "混合マッチング", "CSV合并", "CSV拆分",
+                           "純粋マッチング"}
+
    # 如果规则引擎有更高置信度的结果, 则采纳
    # 使用第一轮缓存的结果（M1: 消除冗余重复调用）
    best_resolved_type = None
    best_resolved_conf = 0.0
+    best_is_main = False
    for pair_name, rtype in resolved_types.items():
        cached_conf = resolved_confidences.get(pair_name, 0.0)
-        if cached_conf > best_resolved_conf:
-            best_resolved_conf = cached_conf
-            best_resolved_type = rtype
+        is_main = rtype in _MAIN_TYPE_PRIORITY

-    if best_resolved_type and best_resolved_conf > final_base_confidence:
-        final_category = best_resolved_type
-        final_base_confidence = best_resolved_conf
+        if best_resolved_type is None:
+            best_resolved_type = rtype
+            best_resolved_conf = cached_conf
+            best_is_main = is_main
+        elif is_main and not best_is_main:
+            # 主类型覆盖非主类型（即使置信度略低）
+            best_resolved_type = rtype
+            best_resolved_conf = cached_conf
+            best_is_main = True
+        elif cached_conf > best_resolved_conf:
+            best_resolved_type = rtype
+            best_resolved_conf = cached_conf
+            best_is_main = is_main
+
+    if best_resolved_type:
+        final_is_main = final_category in _MAIN_TYPE_PRIORITY
+        if best_resolved_conf > final_base_confidence:
+            # 置信度更高 → 替换
+            final_category = best_resolved_type
+            final_base_confidence = best_resolved_conf
+        elif best_is_main and not final_is_main and final_base_confidence < 0.40:
+            # 主类型替代低确信度的非主类型（如 M:N→マッチング）
+            # 但如果 keyword 已确定具体分类（如编码转换 0.85），不覆盖
+            final_category = best_resolved_type
+            final_base_confidence = max(final_base_confidence, best_resolved_conf)

    # 5. 计算 4 因子确信度
    keyword_result_v2 = _build_keyword_result_for_v2(keyword_info)
@@ -42,11 +42,14 @@ def resolve_matching_vs_keybreak(features: dict) -> dict:
        evidence.append(f"WS-PREV-KEY 存在 + 累加器存在 + IF 分支 → キーブレイク")
        return {"resolved_type": "キーブレイク", "confidence": 0.85, "evidence": evidence}

-    # 补充规则: SELECT 文件数 >= 2 且 comparison 至少 1 → 倾向マッチング
+    # 补充规则: SELECT 文件数 >= 2 且 comparison/equality 至少 1 → 倾向マッチング
    # 要求必须有实际的 KEY 变量比较（防止计数器比较误判）
+    # 或结构性匹配检测信号（变量名不含 KEY 但结构是匹配）
    has_key_compare = variable_patterns.get("has_prev_key", False) or features.get("has_key_var", False)
-    if file_count >= 2 and comparison_ifs >= 1 and has_key_compare:
-        evidence.append(f"SELECT 文件数 >=2 + comparison IF >=1 + KEY 变量 → マッチング")
+    has_struct_match = features.get("has_structural_match", False) or features.get("has_prev_key", False)
+    effective_ifs = comparison_ifs + equality_ifs
+    if file_count >= 2 and effective_ifs >= 1 and (has_key_compare or has_struct_match):
+        evidence.append(f"SELECT 文件数 >=2 + IF >=1 + KEY/结构证据 → マッチング")
        return {"resolved_type": "マッチング", "confidence": 0.75, "evidence": evidence}

    # 回退: 无法明确判定