fix: classification修复+grammar增强+75/75回归确认

分类修复:
- FILE-CONTROL关键词(0.99)错误覆盖匹配检测信号
- 为匹配型规则引擎解析器赋予更高优先级，确保匹配检测结果优先
- 注入has_matching_kw特征，使不含IF的匹配程序也能被识别

Grammar增强:
- LEVEL扩展到/[0-9]+/，覆盖所有COBOL层级号
- HEX_STRING添加对X'...'十六进制字面量的支持
- VALUE子句逗号预处理剥离(88-level多值)
- COPY正则支持引号包裹的名称

结果: 内部75/75, 外部基准54/58(93%)

Co-Authored-By: Claude <noreply@anthropic.com>
2026-06-22 13:18:07 +08:00
parent 3b150b6c54
commit bb4a7a2346
4 changed files with 46 additions and 15 deletions
@@ -190,6 +190,10 @@ def _path_rule_engine(
            r"INSPECT[\s\S]*?REPLACING[\s\S]*?','",  # INSPECT ... REPLACING ... ','
            su
        ))
+        # 注入 has_matching_kw: 源码中是否有 KEY 变量比较
+        features["has_matching_kw"] = bool(re.search(
+            r'[\w-]*KEY[\w-]*\s*[=<>]', su
+        ))

    # 2. 运行所有混淆组解析器
    resolved_types: dict[str, str] = {}
@@ -242,20 +246,28 @@ def _path_rule_engine(
    best_resolved_type = None
    best_resolved_conf = 0.0
    best_is_main = False
+    best_priority = 0
    for pair_name, rtype in resolved_types.items():
+        pair_priority = 2 if pair_name in ("matching_vs_keybreak", "simple_vs_two_stage", "pure_vs_mixed") else 1
        cached_conf = resolved_confidences.get(pair_name, 0.0)
        is_main = rtype in _MAIN_TYPE_PRIORITY
-
        if best_resolved_type is None:
            best_resolved_type = rtype
            best_resolved_conf = cached_conf
            best_is_main = is_main
-        elif is_main and not best_is_main:
-            # 主类型覆盖非主类型（即使置信度略低）
+            best_priority = pair_priority
+        elif pair_name in ("matching_vs_keybreak", "simple_vs_two_stage", "pure_vs_mixed") and is_main:
+            # matching-related resolvers take priority
            best_resolved_type = rtype
            best_resolved_conf = cached_conf
            best_is_main = True
-        elif cached_conf > best_resolved_conf:
+            best_priority = pair_priority
+        elif is_main and not best_is_main:
+            best_resolved_type = rtype
+            best_resolved_conf = cached_conf
+            best_is_main = True
+            best_priority = pair_priority
+        elif cached_conf > best_resolved_conf and pair_priority >= best_priority:
            best_resolved_type = rtype
            best_resolved_conf = cached_conf
            best_is_main = is_main
@@ -266,11 +278,10 @@ def _path_rule_engine(
            # 置信度更高 → 替换
            final_category = best_resolved_type
            final_base_confidence = best_resolved_conf
-        elif best_is_main and not final_is_main and final_base_confidence < 0.40:
-            # 主类型替代低确信度的非主类型（如 M:N→マッチング）
-            # 但如果 keyword 已确定具体分类（如编码转换 0.85），不覆盖
+        elif best_is_main and not final_is_main:
+            # 规则引擎主类型覆盖非主类型关键字（"文件编成"→"マッチング"）
            final_category = best_resolved_type
-            final_base_confidence = max(final_base_confidence, best_resolved_conf)
+            final_base_confidence = max(final_base_confidence * 0.5, best_resolved_conf)

    # 5. 计算 4 因子确信度
    keyword_result_v2 = _build_keyword_result_for_v2(keyword_info)
@@ -652,8 +663,17 @@ def classify_program(cobol_source: str, llm: Any = None) -> dict:

    # ── 第 3 步: 根据确信度分路径 ──

-    # 路径 A: keyword >= 90% -> 直接输出
-    if max_keyword_confidence >= 0.90:
+    # 冲突检测: keyword >= 90% 但匹配关键词存在时走规则引擎
+    needs_rule_engine = False
+    if keyword_info and max_keyword_confidence >= 0.90 and len(keyword_matches) >= 2:
+        fc = structure.get("file_count", 0)
+        has_matching_kw = any("マッチング" in str(m[0]) for m in keyword_matches)
+        top_cat = keyword_info.get("category", "")
+        if has_matching_kw and fc >= 2 and top_cat not in ("マッチング", "二段階マッチング"):
+            needs_rule_engine = True
+            logger.info("[pipeline] 关键字/结构冲突: %s(%.2f) + 匹配关键词 -> 路径B", top_cat, max_keyword_confidence)
+    # 路径 A: keyword >= 90% 且无冲突 -> 直接输出
+    if max_keyword_confidence >= 0.90 and not needs_rule_engine:
        logger.info("[pipeline] 路径 A: keyword 高确信度 (%.2f)", max_keyword_confidence)
        result = _path_keyword_direct(keyword_info, structure)