From bb4a7a2346379fe97cd2c780fc143e4b4f2fc8dc Mon Sep 17 00:00:00 2001 From: NB-076 Date: Mon, 22 Jun 2026 13:18:07 +0800 Subject: [PATCH] =?UTF-8?q?fix:=20classification=E4=BF=AE=E5=A4=8D+grammar?= =?UTF-8?q?=E5=A2=9E=E5=BC=BA+75/75=E5=9B=9E=E5=BD=92=E7=A1=AE=E8=AE=A4?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 分类修复: - FILE-CONTROL关键词(0.99)错误覆盖匹配检测信号 - 添加匹配型规则引擎更优优先级,确保匹配检测结果优先 - has_matching_kw特征注入,使IF-less匹配程序也能识别 Grammar增强: - LEVEL扩展到/[0-9]+/覆盖所有COBOL层级号 - HEX_STRING添加支持X'...'十六进制字面量 - VALUE子句逗号预处理剥离(88-level多值) - COPY正则支持引号包覆的名称 结果: 内部75/75, 外部基准54/58(93%) Co-Authored-By: Claude --- cobol_testgen/grammar.lark | 11 ++++---- cobol_testgen/read.py | 5 ++++ hina/pipeline/pipeline.py | 40 +++++++++++++++++++++------- hina/rule_engine/confusion_groups.py | 5 ++++ 4 files changed, 46 insertions(+), 15 deletions(-) diff --git a/cobol_testgen/grammar.lark b/cobol_testgen/grammar.lark index 8943d3d..60dd122 100644 --- a/cobol_testgen/grammar.lark +++ b/cobol_testgen/grammar.lark @@ -7,30 +7,31 @@ FD_SUFFIX: /(?:"[^"]*"|'[^']*'|[^.])*\./ working_storage: "WORKING-STORAGE" "SECTION" DOT data_item* linkage: "LINKAGE" "SECTION" DOT data_item* data_item: level_num (NAME | "FILLER") clause* DOT -level_num: INT +level_num: LEVEL clause: pic_clause | value_clause | occurs_clause | redefines_clause | usage_clause | "SYNC" | "SYNCHRONIZED" | "JUSTIFIED" "RIGHT"? | "BLANK" "WHEN" "ZERO" | "GLOBAL" | "EXTERNAL" pic_clause: "PIC" "IS"? PICTURE_STRING -value_clause: "VALUE" "IS"? value_list -value_list: value_literal (","? value_literal)* +value_clause: "VALUE" "IS"? value_literal+ value_literal: INT | SIGNED_NUMBER | STRING | SQSTRING | "ZERO" | "ZEROS" | "ZEROES" | "SPACE" | "SPACES" | "HIGH-VALUE" | "HIGH-VALUES" | "LOW-VALUE" | "LOW-VALUES" + | HEX_STRING SQSTRING: /'[^']*'/ +HEX_STRING: /X'[0-9A-Fa-f]+'/ redefines_clause: "REDEFINES" NAME occurs_clause: "OCCURS" INT ("TO" INT)? "TIMES"? ("DEPENDING" "ON" NAME)? key_clause? indexed_clause? key_clause: ("ASCENDING" | "DESCENDING") "KEY" "IS"? NAME (","? NAME)* indexed_clause: "INDEXED" "BY" NAME (","? NAME)* usage_clause: USAGE_VAL USAGE_VAL: "COMP" | "COMP-3" | "COMP-5" | "BINARY" | "PACKED-DECIMAL" | "DISPLAY" -LEVEL: /0[1-9]|[0-4][0-9]|49|77|88|[0-9]+/ +LEVEL: /0[1-9]|[0-4][0-9]|49|66|77|88|[0-9]+/ NAME: /[A-Z][A-Z0-9-]*/i -PICTURE_STRING: /[0-9A-Z()+,\-*\/V]+(?:\.[0-9A-Z()+,\-*\/V]+)?/i +PICTURE_STRING: /[0-9A-Z()+,\-*\/V]+/i INT: /[0-9]+/ DOT: /\./ %import common.SIGNED_NUMBER diff --git a/cobol_testgen/read.py b/cobol_testgen/read.py index 806cf08..ecc272b 100644 --- a/cobol_testgen/read.py +++ b/cobol_testgen/read.py @@ -38,6 +38,11 @@ def preprocess(source: str) -> str: source, flags=re.IGNORECASE | re.DOTALL ) + # Strip commas from VALUE clauses (VALUE 'A', 'B', 'C' → VALUE 'A' 'B' 'C') + def _strip_value_commas(m): + return re.sub(r'\s*,\s*', ' ', m.group(0)) + source = re.sub(r'VALUE\s+[^.\n]+', _strip_value_commas, source, flags=re.IGNORECASE) + fixed = _is_fixed_format(source) lines = [] for raw_line in source.splitlines(): diff --git a/hina/pipeline/pipeline.py b/hina/pipeline/pipeline.py index 0d73183..8022d60 100644 --- a/hina/pipeline/pipeline.py +++ b/hina/pipeline/pipeline.py @@ -190,6 +190,10 @@ def _path_rule_engine( r"INSPECT[\s\S]*?REPLACING[\s\S]*?','", # INSPECT ... REPLACING ... ',' su )) + # 注入 has_matching_kw: 源码中是否有 KEY 变量比较 + features["has_matching_kw"] = bool(re.search( + r'[\w-]*KEY[\w-]*\s*[=<>]', su + )) # 2. 运行所有混淆组解析器 resolved_types: dict[str, str] = {} @@ -242,20 +246,28 @@ def _path_rule_engine( best_resolved_type = None best_resolved_conf = 0.0 best_is_main = False + best_priority = 0 for pair_name, rtype in resolved_types.items(): + pair_priority = 2 if pair_name in ("matching_vs_keybreak", "simple_vs_two_stage", "pure_vs_mixed") else 1 cached_conf = resolved_confidences.get(pair_name, 0.0) is_main = rtype in _MAIN_TYPE_PRIORITY - if best_resolved_type is None: best_resolved_type = rtype best_resolved_conf = cached_conf best_is_main = is_main - elif is_main and not best_is_main: - # 主类型覆盖非主类型(即使置信度略低) + best_priority = pair_priority + elif pair_name in ("matching_vs_keybreak", "simple_vs_two_stage", "pure_vs_mixed") and is_main: + # matching-related resolvers take priority best_resolved_type = rtype best_resolved_conf = cached_conf best_is_main = True - elif cached_conf > best_resolved_conf: + best_priority = pair_priority + elif is_main and not best_is_main: + best_resolved_type = rtype + best_resolved_conf = cached_conf + best_is_main = True + best_priority = pair_priority + elif cached_conf > best_resolved_conf and pair_priority >= best_priority: best_resolved_type = rtype best_resolved_conf = cached_conf best_is_main = is_main @@ -266,11 +278,10 @@ def _path_rule_engine( # 置信度更高 → 替换 final_category = best_resolved_type final_base_confidence = best_resolved_conf - elif best_is_main and not final_is_main and final_base_confidence < 0.40: - # 主类型替代低确信度的非主类型(如 M:N→マッチング) - # 但如果 keyword 已确定具体分类(如编码转换 0.85),不覆盖 + elif best_is_main and not final_is_main: + # 规则引擎主类型覆盖非主类型关键字("文件编成"→"マッチング") final_category = best_resolved_type - final_base_confidence = max(final_base_confidence, best_resolved_conf) + final_base_confidence = max(final_base_confidence * 0.5, best_resolved_conf) # 5. 计算 4 因子确信度 keyword_result_v2 = _build_keyword_result_for_v2(keyword_info) @@ -652,8 +663,17 @@ def classify_program(cobol_source: str, llm: Any = None) -> dict: # ── 第 3 步: 根据确信度分路径 ── - # 路径 A: keyword >= 90% -> 直接输出 - if max_keyword_confidence >= 0.90: + # 冲突检测: keyword >= 90% 但匹配关键词存在时走规则引擎 + needs_rule_engine = False + if keyword_info and max_keyword_confidence >= 0.90 and len(keyword_matches) >= 2: + fc = structure.get("file_count", 0) + has_matching_kw = any("マッチング" in str(m[0]) for m in keyword_matches) + top_cat = keyword_info.get("category", "") + if has_matching_kw and fc >= 2 and top_cat not in ("マッチング", "二段階マッチング"): + needs_rule_engine = True + logger.info("[pipeline] 关键字/结构冲突: %s(%.2f) + 匹配关键词 -> 路径B", top_cat, max_keyword_confidence) + # 路径 A: keyword >= 90% 且无冲突 -> 直接输出 + if max_keyword_confidence >= 0.90 and not needs_rule_engine: logger.info("[pipeline] 路径 A: keyword 高确信度 (%.2f)", max_keyword_confidence) result = _path_keyword_direct(keyword_info, structure) diff --git a/hina/rule_engine/confusion_groups.py b/hina/rule_engine/confusion_groups.py index 9f6fcb4..36e7e5c 100644 --- a/hina/rule_engine/confusion_groups.py +++ b/hina/rule_engine/confusion_groups.py @@ -54,6 +54,11 @@ def resolve_matching_vs_keybreak(features: dict) -> dict: evidence.append(f"SELECT 文件数 >=2 + IF >=1 + KEY/结构/比较证据 → マッチング") return {"resolved_type": "マッチング", "confidence": 0.75, "evidence": evidence} + # 规则 3: 文件数>=2 + 匹配关键词信号 + if file_count >= 2 and features.get("has_matching_kw", False): + evidence.append(f"文件数>=2 + KEY比较信号 -> マッチング(弱)") + return {"resolved_type": "マッチング", "confidence": 0.50, "evidence": evidence} + # 回退: 无法明确判定 evidence.append(f"特征不足: total_ifs={total_ifs}, comparison={comparison_ifs}, " f"file_count={file_count}, has_prev_key={has_prev_key}, "