diff --git a/hina/classifier.py b/hina/classifier.py
index 26ceea0..ea14946 100644
--- a/hina/classifier.py
+++ b/hina/classifier.py
@@ -18,11 +18,11 @@ L1_RULES: list[tuple[str, list[str], float]] = [
     ("SYSIN", ["SYSIN"], 0.90),
     ("编码转换", ["ALPHABETIC", "ASCII", "EBCDIC"], 0.85),
     ("online", ["DFHCOMMAREA", "MAP"], 0.95),
-    ("SORT", ["SORT ON KEY"], 0.95),
-    ("MERGE", ["MERGE ON KEY"], 0.95),
+    ("SORT", ["re:SORT(?:\\s+\\S+)?\\s+ON\\s+(?:ASCENDING\\s+|DESCENDING\\s+)?KEY"], 0.95),
+    ("MERGE", ["re:MERGE(?:\\s+\\S+)?\\s+ON\\s+(?:ASCENDING\\s+|DESCENDING\\s+)?KEY"], 0.95),
+    ("替代索引", ["ALTERNATE RECORD KEY"], 0.99),
     ("编辑输出", ["WRITE AFTER", "WRITE BEFORE"], 0.80),
     ("文件编成", ["ORGANIZATION IS"], 0.99),
-    ("替代索引", ["ALTERNATE RECORD KEY"], 0.99),
     ("マッチング", ["re:WS-[\\w-]*KEY"], 0.65),
     # 无连字符 KEY 变量: WSKEY, WSKEY1, WSKEYCD 等(老式 COBOL 命名)
     ("マッチング", ["re:WS[A-Z0-9]*KEY[A-Z0-9]*"], 0.65),
diff --git a/hina/pipeline/pipeline.py b/hina/pipeline/pipeline.py
index 833c39a..c2dedaf 100644
--- a/hina/pipeline/pipeline.py
+++ b/hina/pipeline/pipeline.py
@@ -173,6 +173,17 @@ def _path_rule_engine(
         r'(?:PERFORM|END-PERFORM|READ)', # 含循环/读取
         su, re.DOTALL
     ))
+    # Inject CSV signals: a ',' literal inside STRING ... INTO (merge) or
+    # after INSPECT ... REPLACING (split). The lookarounds keep UNSTRING and
+    # hyphenated data names (e.g. WS-STRING) from matching the verb.
+    features["has_csv_merge"] = bool(re.search(
+        r"(?<![\w-])STRING(?![\w-])[\s\S]*?','[\s\S]*?INTO",
+        su
+    ))
+    features["has_csv_split"] = bool(re.search(
+        r"(?<![\w-])INSPECT(?![\w-])[\s\S]*?REPLACING[\s\S]*?','",
+        su
+    ))
 
     # 2. 运行所有混淆组解析器
     resolved_types: dict[str, str] = {}
diff --git a/hina/rule_engine/confusion_groups.py b/hina/rule_engine/confusion_groups.py
index 4f8a4f7..043b921 100644
--- a/hina/rule_engine/confusion_groups.py
+++ b/hina/rule_engine/confusion_groups.py
@@ -106,21 +106,33 @@ def resolve_csv_merge_vs_split(features: dict) -> dict:
-    """区分 CSV 合并与拆分。
+    """Distinguish CSV merge from CSV split.
 
-    规则:
-    - STRING 语句存在 → 无换行 (合并, merge)
-    - INSPECT REPLACING 存在 → 有换行 (拆分, split)
+    Rules (checked in this order):
+    - has_csv_merge is set -> "CSV合并" (merge), confidence 0.85
+    - has_csv_split is set -> "CSV拆分" (split), confidence 0.85
+    A bare has_string / has_inspect resolves to "unknown" (confidence 0.0).
     """
     has_string = features.get("has_string", False)
     has_inspect = features.get("has_inspect", False)
+    has_csv_merge = features.get("has_csv_merge", False)  # missing key counts as False
+    has_csv_split = features.get("has_csv_split", False)  # missing key counts as False
 
     evidence: list[str] = []
 
-    if has_string:
-        evidence.append("STRING 语句存在 → CSV 合并 (无换行)")
+    if has_csv_merge:
+        evidence.append("STRING + 逗号分隔 → CSV 合并 (无换行)")
         return {"resolved_type": "CSV合并", "confidence": 0.85, "evidence": evidence}
 
-    if has_inspect:
-        evidence.append("INSPECT REPLACING 存在 → CSV 拆分 (有换行)")
+    if has_csv_split:
+        evidence.append("INSPECT REPLACING 含逗号/改行 → CSV 拆分")
         return {"resolved_type": "CSV拆分", "confidence": 0.85, "evidence": evidence}
 
+    # The older flags alone are not CSV evidence: record why, resolve to unknown.
+    if has_string:
+        evidence.append("STRING 存在但无逗号分隔 → 非CSV(低确信度)")
+        return {"resolved_type": "unknown", "confidence": 0.0, "evidence": evidence}
+
+    if has_inspect:
+        evidence.append("INSPECT 存在但无逗号/改行 → 非CSV(低确信度)")
+        return {"resolved_type": "unknown", "confidence": 0.0, "evidence": evidence}
+
     evidence.append("既无 STRING 也无 INSPECT REPLACING")
     return {"resolved_type": "unknown", "confidence": 0.0, "evidence": evidence}
diff --git a/tests/hina/test_classifier_deep.py b/tests/hina/test_classifier_deep.py
index 7d74007..7876467 100644
--- a/tests/hina/test_classifier_deep.py
+++ b/tests/hina/test_classifier_deep.py
@@ -20,7 +20,7 @@ def test_detect_keyword_multiple_matches():
         EXEC SQL
             SELECT * FROM TABLE
         END-EXEC.
-        SORT ON KEY WS-KEY.
+        SORT SORT-FILE ON KEY WS-KEY.
         CALL 'SUBPGM'.
         STOP RUN.
     """
@@ -36,7 +36,7 @@ def test_detect_keyword_multiple_matches():
     assert cat_map["DB操作"][0] == 0.95
     assert cat_map["DB操作"][1] == "EXEC SQL"
     assert cat_map["SORT"][0] == 0.95
-    assert cat_map["SORT"][1] == "SORT ON KEY"
+    assert cat_map["SORT"][1].startswith("re:SORT")  # matched keyword is the regex rule itself
     assert cat_map["子程序调用"][0] == 0.90
     assert cat_map["子程序调用"][1] == "CALL"
 
@@ -154,7 +154,7 @@ def test_detect_keyword_mixed_case_whitespace_comments():
     matched_keywords = {r[2] for r in results}
     assert "EXEC SQL" in matched_keywords
     assert "CALL" in matched_keywords
-    assert "SORT ON KEY" in matched_keywords
+    assert any(r[0] == "SORT" for r in results)  # SORT category detected via the regex rule
 
 
 # ── 5. No keyword match and no LLM result → unknown ──
@@ -190,8 +190,8 @@ def test_detect_keyword_all_rules():
         ("ALPHABETIC", "编码转换"),
         ("DFHCOMMAREA", "online"),
         ("MAP", "online"),
-        ("SORT ON KEY", "SORT"),
-        ("MERGE ON KEY", "MERGE"),
+        ("SORT SORT-FILE ON KEY", "SORT"),
+        ("MERGE MERGE-FILE ON KEY", "MERGE"),
         ("WRITE AFTER", "编辑输出"),
         ("WRITE BEFORE", "编辑输出"),
         ("ORGANIZATION IS", "文件编成"),
diff --git a/tests/hina/test_rule_engine.py b/tests/hina/test_rule_engine.py
index 6bfceb7..ceb30c1 100644
--- a/tests/hina/test_rule_engine.py
+++ b/tests/hina/test_rule_engine.py
@@ -121,24 +121,24 @@ def test_validation_vs_keybreak_unknown():
 # ═══════════════════════════════════════════════════════════════════════════
 
 def test_csv_merge_vs_split_merge():
-    """STRING 存在 → CSV合并"""
-    features = {"has_string": True, "has_inspect": False}
+    """has_csv_merge set (STRING with comma delimiter) -> CSV合并."""
+    features = {"has_string": True, "has_csv_merge": True, "has_inspect": False}
     result = resolve_csv_merge_vs_split(features)
     assert result["resolved_type"] == "CSV合并"
     assert result["confidence"] >= 0.70
 
 
 def test_csv_merge_vs_split_split():
-    """INSPECT REPLACING 存在 → CSV拆分"""
-    features = {"has_string": False, "has_inspect": True}
+    """has_csv_split set (INSPECT REPLACING with comma) -> CSV拆分."""
+    features = {"has_string": False, "has_csv_split": True, "has_inspect": True}
     result = resolve_csv_merge_vs_split(features)
     assert result["resolved_type"] == "CSV拆分"
     assert result["confidence"] >= 0.70
 
 
 def test_csv_merge_vs_split_both():
-    """两个都存在 → STRING 优先 (CSV合并)"""
-    features = {"has_string": True, "has_inspect": True}
+    """Both CSV flags set: merge evidence takes priority -> CSV合并."""
+    features = {"has_string": True, "has_csv_merge": True, "has_inspect": True, "has_csv_split": True}
     result = resolve_csv_merge_vs_split(features)
     assert result["resolved_type"] == "CSV合并"