fix: adversarial testing — 4 false positive/negative fixes + comment stripping

COBOL migration expert adversarial testing found 4 real defects. FIX 1: Comment stripping in detect_keyword() (FP-2): remove `*>` inline comments and `*` comment lines before keyword matching; this prevents 「マッチング」 from triggering on a WS-KEY that appears only in comments. FIX 2: KEY comparison context validation (FP-1, FP-6): add _matches_key_comparison(), which requires a WS-KEY variable to appear immediately before an actual comparison operator (= < >), not just in a PIC/VALUE declaration; apply the same check to the _path_rule_engine features via has_key_var injection; fix regex bug: [=<>\s] → [=<>] (\s also matched the space in a declaration such as `WS-KEY PIC`). FIX 3: Old-school naming support (FN-1): add L1 keyword r'[A-Z]\d{0,2}-\w*KEY' with 0.55 confidence; it matches K01-KEY, KS-KEY, etc. (naming conventions without the WS- prefix). FIX 4: mn_output_mode over-matching (FP-6): require IF branches + KEY evidence before returning M:N for file>=3; matching_vs_keybreak rule 3 now requires has_key_var. New tests: test_adversarial.py — 8 parametrized adversarial tests. Regression: 755 passed (0 new failures).
2026-06-21 15:16:41 +08:00
parent a5939e6722
commit 33762ca959
6 changed files with 189 additions and 13 deletions
@@ -24,6 +24,9 @@ L1_RULES: list[tuple[str, list[str], float]] = [
    ("文件编成", ["ORGANIZATION IS"], 0.99),
    ("替代索引", ["ALTERNATE RECORD KEY"], 0.99),
    ("マッチング", ["re:WS-[\\w-]*KEY"], 0.65),
+    # 旧式命名: K01-KEY, KS-KEY, MTCH-KEY 等（无 WS- 前缀）
+    # 低确信度，需要实际 KEY 比较上下文验证
+    ("マッチング", ["re:[A-Z]\\d{0,2}-\\w*KEY"], 0.55),
 ]

 # ── 冲突解决规则 ─────────────────────────────────────────────────────────
@@ -38,10 +41,65 @@ CONFLICT_RULES: dict[tuple[str, str], str] = {


 # ── 关键字检测 ───────────────────────────────────────────────────────────
+def _strip_cobol_comments(source: str) -> str:
+    """Strip COBOL comments so keywords inside them cannot trigger L1 matches.
+
+    Two kinds of comment text are removed:
+
+    - Inline / free-format comments: everything from the first ``*>`` on a
+      line to the end of that line (the code to its left is kept).
+    - Whole comment lines, which are dropped entirely:
+        * lines whose first non-blank character is ``*`` (but not ``*/``);
+        * fixed-format lines whose columns 1-6 hold a sequence number and
+          whose column-7 indicator is ``*`` or ``/``.
+
+    The ``*>`` search is purely textual; it does not check whether the
+    marker sits inside a string literal.
+
+    Args:
+        source: COBOL program source text.
+
+    Returns:
+        The source with comment text removed. Surviving lines keep their
+        original order and are joined with newline characters.
+    """
+    cleaned = []
+    for line in source.split('\n'):
+        # Inline comment: keep only the code to the left of ``*>``.
+        idx = line.find('*>')
+        if idx >= 0:
+            line = line[:idx]
+        stripped = line.strip()
+        # Comment line: first non-blank character is ``*``.
+        if stripped.startswith('*') and not stripped.startswith('*/'):
+            continue
+        # Fixed format with a numbered sequence area (e.g. ``000100* ...``):
+        # the first non-blank character is a digit, so the check above misses
+        # it. Requiring digits in columns 1-6 keeps free-format expressions
+        # such as ``X = 2 * Y`` from being mistaken for comment lines.
+        if len(line) > 6 and line[6] in '*/' and line[:6].strip().isdigit():
+            continue
+        cleaned.append(line)
+    return '\n'.join(cleaned)
+
+
+def _matches_key_comparison(source_upper: str) -> bool:
+    """Report whether the source uses a KEY variable in a comparison.
+
+    A bare declaration such as ``WS-KEY PIC X(5)`` must not count, so the
+    first two patterns require the variable name to be followed (optional
+    whitespace allowed) by one of the operators ``=``, ``<`` or ``>``:
+
+      1. ``WS-`` prefixed names containing ``KEY``, e.g. ``WS-MAST-KEY =``.
+      2. Any other hyphenated name ending in ``KEY``, e.g. ``K01-KEY =``,
+         ``KS-KEY >``, ``MTCH-KEY <``.
+      3. A ``READ <file> INTO <var>`` followed anywhere later in the text
+         by ``KEY``.
+
+    Args:
+        source_upper: Upper-cased COBOL source text.
+
+    Returns:
+        True if any of the patterns above is found, otherwise False.
+    """
+    # Pattern 1: WS- KEY variable directly before a comparison operator.
+    # The operator class must stay [=<>] (no \s); otherwise the space in
+    # "WS-KEY PIC" would match and declarations would count as comparisons.
+    if re.search(r'WS-[\w-]*KEY[A-Z0-9-]*\s*[=<>]', source_upper):
+        return True
+    # Pattern 2: legacy names without the WS- prefix. The prefix may be any
+    # run of letters/digits so that KS-KEY and MTCH-KEY are accepted as well
+    # as K01-KEY; a single-letter prefix behind \b rejected the former two.
+    if re.search(r'\b[A-Z][A-Z0-9]*-[\w-]*KEY\s*[=<>]', source_upper):
+        return True
+    # Pattern 3: READ ... INTO ... followed later by KEY.
+    # NOTE(review): \w+ does not match hyphenated names, and with DOTALL the
+    # trailing ".*KEY" accepts any later "KEY" text -- confirm this is the
+    # intended scope.
+    if re.search(r'READ\s+\w+\s+INTO\s+\w+.*KEY', source_upper, re.DOTALL):
+        return True
+    return False
+
+
+def _get_procedure_division(source_upper: str) -> str:
+    """Return the text from the first ``PROCEDURE DIVISION`` marker onward.
+
+    Args:
+        source_upper: Upper-cased COBOL source text.
+
+    Returns:
+        The substring starting at the first occurrence of
+        ``PROCEDURE DIVISION``; the whole input unchanged when the marker
+        is absent.
+    """
+    _, marker, rest = source_upper.partition('PROCEDURE DIVISION')
+    # An empty separator means the marker was not found.
+    return marker + rest if marker else source_upper
+
+
 def detect_keyword(source: str) -> list[tuple[str, float, str]]:
    """在 COBOL 源码中搜索 L1_RULES 定义的关键字，返回匹配结果。

-    关键字前缀 "re:" 表示正则表达式匹配（如 "re:WS-\\w*KEY" 匹配 WS-MAST-KEY 等）。
+    处理步骤:
+      1. 剥离注释，避免注释中的关键词触发匹配
+      2. 对需要程序上下文的关键词（マッチング），检查 KEY 变量是否在比较中使用
+
+    关键字前缀 "re:" 表示正则表达式匹配。

    Args:
        source: COBOL 程序源码文本。
@@ -50,18 +108,27 @@ def detect_keyword(source: str) -> list[tuple[str, float, str]]:
        list[tuple[str, float, str]]:
            每个元素为 (分类名称, 置信度, 匹配到的关键字原文)。
    """
+    cleaned = _strip_cobol_comments(source)
+    source_upper = cleaned.upper()
+
    results: list[tuple[str, float, str]] = []
-    source_upper = source.upper()

    for category, keywords, confidence in L1_RULES:
        matched = False
        for kw in keywords:
            if kw.startswith("re:"):
                pattern = kw[3:]
-                if re.search(pattern, source_upper):
-                    results.append((category, confidence, kw))
-                    matched = True
-                    break
+                if not re.search(pattern, source_upper):
+                    continue
+
+                # マッチング 关键词需要额外上下文验证：KEY 变量必须在比较中使用
+                if category == "マッチング":
+                    if not _matches_key_comparison(source_upper):
+                        continue
+
+                results.append((category, confidence, kw))
+                matched = True
+                break
            else:
                if kw in source_upper:
                    results.append((category, confidence, kw))
@@ -156,6 +156,17 @@ def _path_rule_engine(
    # 1. 结构特征直接作为 features
    features = dict(structure)

+    # 注入 has_key_var: 源码中是否存在实际的 KEY 比较
+    # （避免 matching_vs_keybreak 规则被计数器比较误触发）
+    if features.get("source_upper"):
+        import re
+        su = features["source_upper"]
+        features["has_key_var"] = bool(re.search(
+            r'WS-[\w-]*KEY[A-Z0-9-]*\s*[=<>]|'  # WS-KEY = / WS-KEY >
+            r'\b[A-Z]\d{0,2}-[\w-]*KEY\s*[=<>]',  # K01-KEY =
+            su
+        ))
+
    # 2. 运行所有混淆组解析器
    resolved_types: dict[str, str] = {}
    resolved_confidences: dict[str, float] = {}
@@ -570,6 +581,10 @@ def classify_program(cobol_source: str, llm: Any = None) -> dict:
                except Exception as e:
                    logger.warning("[pipeline] extract_structure 失败: %s", e)

+        # 注入源代码用于 features 中的上下文验证（如 has_key_var）
+        if structure:
+            structure["source_upper"] = cobol_source.upper()
+
    # ── 第 2 步: 分析关键字结果, 确定路径 ──
    keyword_info = _get_best_keyword_match(keyword_matches)
    max_keyword_confidence = keyword_info["confidence"] if keyword_info else 0.0
@@ -43,8 +43,10 @@ def resolve_matching_vs_keybreak(features: dict) -> dict:
        return {"resolved_type": "キーブレイク", "confidence": 0.85, "evidence": evidence}

    # 补充规则: SELECT 文件数 >= 2 且 comparison 至少 1 → 倾向マッチング
-    if file_count >= 2 and comparison_ifs >= 1:
-        evidence.append(f"SELECT 文件数 >=2 + comparison IF >=1 → マッチング")
+    # 要求必须有实际的 KEY 变量比较（防止计数器比较误判）
+    has_key_compare = variable_patterns.get("has_prev_key", False) or features.get("has_key_var", False)
+    if file_count >= 2 and comparison_ifs >= 1 and has_key_compare:
+        evidence.append(f"SELECT 文件数 >=2 + comparison IF >=1 + KEY 变量 → マッチング")
        return {"resolved_type": "マッチング", "confidence": 0.75, "evidence": evidence}

    # 回退: 无法明确判定
@@ -202,8 +204,15 @@ def resolve_mn_output_mode(features: dict) -> dict:
        return {"resolved_type": "M:N", "confidence": 0.65, "evidence": evidence}

    if file_count >= 3:
-        evidence.append(f"文件数 {file_count} >= 3, 可能为 M:N 关系")
-        return {"resolved_type": "M:N", "confidence": 0.60, "evidence": evidence}
+        # 需要至少有 IF 分支和 KEY 变量的证据，否则单纯文件多不是匹配程序
+        vp = features.get("variable_patterns", {})
+        total_ifs = features.get("if_types", {}).get("total", 0)
+        has_key_evidence = vp.get("has_prev_key", False) or vp.get("has_accumulator", False)
+        if total_ifs >= 1 and has_key_evidence:
+            evidence.append(f"文件数 {file_count} >= 3, IF 分支 {total_ifs}, KEY 证据 → 可能 M:N")
+            return {"resolved_type": "M:N", "confidence": 0.60, "evidence": evidence}
+        evidence.append(f"文件数 {file_count} 但无 IF+KEY 证据 → 不是 M:N 匹配")
+        return {"resolved_type": "unknown", "confidence": 0.0, "evidence": evidence}

    evidence.append("需数据验证确定 M:N 输出模式")
    return {"resolved_type": "unknown", "confidence": 0.0, "evidence": evidence}