fix: HINA 全类型缺陷修复 — SORT/CSV/ALT 3 个真实缺陷
对抗性全类型测试发现的缺陷和修复:

缺陷1: SORT/MERGE L1 关键词太严格(漏检)
- 旧: 'SORT ON KEY' / 'MERGE ON KEY'(精确字符串)
- COBOL 中的真实写法: SORT WORK-FILE ON ASCENDING KEY ...
- 新: 正则 SORT(?:\s+\S+)?\s+ON\s+(?:ASCENDING\s+|DESCENDING\s+)?KEY(MERGE 规则同理)

缺陷2: CSV 假阳性(STRING/INSPECT 非CSV也触发)
- 旧: has_string=True -> CSV合并;has_inspect=True -> CSV拆分
- 新: 要求 has_csv_merge(STRING+逗号分隔)或 has_csv_split(INSPECT REPLACING+逗号)
- 单纯字符串拼接不再触发 CSV 分类

缺陷3: ALTERNATE RECORD KEY 被 ORGANIZATION IS 覆盖
- 旧: 文件编成先于替代索引(同确信度先者胜)
- 新: 替代索引放前面(更具体的分类优先)

回归: 767 passed(0 new failures)
This commit is contained in:
+3
-3
@@ -18,11 +18,11 @@ L1_RULES: list[tuple[str, list[str], float]] = [
|
|||||||
("SYSIN", ["SYSIN"], 0.90),
|
("SYSIN", ["SYSIN"], 0.90),
|
||||||
("编码转换", ["ALPHABETIC", "ASCII", "EBCDIC"], 0.85),
|
("编码转换", ["ALPHABETIC", "ASCII", "EBCDIC"], 0.85),
|
||||||
("online", ["DFHCOMMAREA", "MAP"], 0.95),
|
("online", ["DFHCOMMAREA", "MAP"], 0.95),
|
||||||
("SORT", ["SORT ON KEY"], 0.95),
|
("SORT", ["re:SORT(?:\\s+\\S+)?\\s+ON\\s+(?:ASCENDING\\s+|DESCENDING\\s+)?KEY"], 0.95),
|
||||||
("MERGE", ["MERGE ON KEY"], 0.95),
|
("MERGE", ["re:MERGE(?:\\s+\\S+)?\\s+ON\\s+(?:ASCENDING\\s+|DESCENDING\\s+)?KEY"], 0.95),
|
||||||
|
("替代索引", ["ALTERNATE RECORD KEY"], 0.99),
|
||||||
("编辑输出", ["WRITE AFTER", "WRITE BEFORE"], 0.80),
|
("编辑输出", ["WRITE AFTER", "WRITE BEFORE"], 0.80),
|
||||||
("文件编成", ["ORGANIZATION IS"], 0.99),
|
("文件编成", ["ORGANIZATION IS"], 0.99),
|
||||||
("替代索引", ["ALTERNATE RECORD KEY"], 0.99),
|
|
||||||
("マッチング", ["re:WS-[\\w-]*KEY"], 0.65),
|
("マッチング", ["re:WS-[\\w-]*KEY"], 0.65),
|
||||||
# 无连字符 KEY 变量: WSKEY, WSKEY1, WSKEYCD 等(老式 COBOL 命名)
|
# 无连字符 KEY 变量: WSKEY, WSKEY1, WSKEYCD 等(老式 COBOL 命名)
|
||||||
("マッチング", ["re:WS[A-Z0-9]*KEY[A-Z0-9]*"], 0.65),
|
("マッチング", ["re:WS[A-Z0-9]*KEY[A-Z0-9]*"], 0.65),
|
||||||
|
|||||||
@@ -173,6 +173,15 @@ def _path_rule_engine(
|
|||||||
r'(?:PERFORM|END-PERFORM|READ)', # 含循环/读取
|
r'(?:PERFORM|END-PERFORM|READ)', # 含循环/读取
|
||||||
su, re.DOTALL
|
su, re.DOTALL
|
||||||
))
|
))
|
||||||
|
# 注入 CSV 信号:逗号分隔的字符串拼接/替换
|
||||||
|
features["has_csv_merge"] = bool(re.search(
|
||||||
|
r"STRING[\s\S]*?','[\s\S]*?INTO", # STRING ... ',' ... INTO
|
||||||
|
su
|
||||||
|
))
|
||||||
|
features["has_csv_split"] = bool(re.search(
|
||||||
|
r"INSPECT[\s\S]*?REPLACING[\s\S]*?,',", # INSPECT ... REPLACING ... ','
|
||||||
|
su
|
||||||
|
))
|
||||||
|
|
||||||
# 2. 运行所有混淆组解析器
|
# 2. 运行所有混淆组解析器
|
||||||
resolved_types: dict[str, str] = {}
|
resolved_types: dict[str, str] = {}
|
||||||
|
|||||||
@@ -106,21 +106,33 @@ def resolve_csv_merge_vs_split(features: dict) -> dict:
|
|||||||
"""区分 CSV 合并与拆分。
|
"""区分 CSV 合并与拆分。
|
||||||
|
|
||||||
规则:
|
规则:
|
||||||
- STRING 语句存在 → 无换行 (合并, merge)
|
- STRING 存在且含逗号分隔 → 无换行 (合并, merge)
|
||||||
- INSPECT REPLACING 存在 → 有换行 (拆分, split)
|
- INSPECT REPLACING 含逗号/改行 → 有换行 (拆分, split)
|
||||||
|
单纯的 STRING 拼接/INSPECT 计数不触发(容易假阳性)。
|
||||||
"""
|
"""
|
||||||
has_string = features.get("has_string", False)
|
has_string = features.get("has_string", False)
|
||||||
has_inspect = features.get("has_inspect", False)
|
has_inspect = features.get("has_inspect", False)
|
||||||
|
has_csv_merge = features.get("has_csv_merge", False) # 从源码注入
|
||||||
|
has_csv_split = features.get("has_csv_split", False) # 从源码注入
|
||||||
evidence: list[str] = []
|
evidence: list[str] = []
|
||||||
|
|
||||||
if has_string:
|
if has_csv_merge:
|
||||||
evidence.append("STRING 语句存在 → CSV 合并 (无换行)")
|
evidence.append("STRING + 逗号分隔 → CSV 合并 (无换行)")
|
||||||
return {"resolved_type": "CSV合并", "confidence": 0.85, "evidence": evidence}
|
return {"resolved_type": "CSV合并", "confidence": 0.85, "evidence": evidence}
|
||||||
|
|
||||||
if has_inspect:
|
if has_csv_split:
|
||||||
evidence.append("INSPECT REPLACING 存在 → CSV 拆分 (有换行)")
|
evidence.append("INSPECT REPLACING 含逗号/改行 → CSV 拆分")
|
||||||
return {"resolved_type": "CSV拆分", "confidence": 0.85, "evidence": evidence}
|
return {"resolved_type": "CSV拆分", "confidence": 0.85, "evidence": evidence}
|
||||||
|
|
||||||
|
# 兼容旧版:
|
||||||
|
if has_string:
|
||||||
|
evidence.append("STRING 存在但无逗号分隔 → 非CSV(低确信度)")
|
||||||
|
return {"resolved_type": "unknown", "confidence": 0.0, "evidence": evidence}
|
||||||
|
|
||||||
|
if has_inspect:
|
||||||
|
evidence.append("INSPECT 存在但无逗号/改行 → 非CSV(低确信度)")
|
||||||
|
return {"resolved_type": "unknown", "confidence": 0.0, "evidence": evidence}
|
||||||
|
|
||||||
evidence.append("既无 STRING 也无 INSPECT REPLACING")
|
evidence.append("既无 STRING 也无 INSPECT REPLACING")
|
||||||
return {"resolved_type": "unknown", "confidence": 0.0, "evidence": evidence}
|
return {"resolved_type": "unknown", "confidence": 0.0, "evidence": evidence}
|
||||||
|
|
||||||
|
|||||||
@@ -20,7 +20,7 @@ def test_detect_keyword_multiple_matches():
|
|||||||
EXEC SQL
|
EXEC SQL
|
||||||
SELECT * FROM TABLE
|
SELECT * FROM TABLE
|
||||||
END-EXEC.
|
END-EXEC.
|
||||||
SORT ON KEY WS-KEY.
|
SORT SORT-FILE ON KEY WS-KEY.
|
||||||
CALL 'SUBPGM'.
|
CALL 'SUBPGM'.
|
||||||
STOP RUN.
|
STOP RUN.
|
||||||
"""
|
"""
|
||||||
@@ -36,7 +36,7 @@ def test_detect_keyword_multiple_matches():
|
|||||||
assert cat_map["DB操作"][0] == 0.95
|
assert cat_map["DB操作"][0] == 0.95
|
||||||
assert cat_map["DB操作"][1] == "EXEC SQL"
|
assert cat_map["DB操作"][1] == "EXEC SQL"
|
||||||
assert cat_map["SORT"][0] == 0.95
|
assert cat_map["SORT"][0] == 0.95
|
||||||
assert cat_map["SORT"][1] == "SORT ON KEY"
|
assert cat_map["SORT"][1].startswith("re:SORT") # regex pattern
|
||||||
assert cat_map["子程序调用"][0] == 0.90
|
assert cat_map["子程序调用"][0] == 0.90
|
||||||
assert cat_map["子程序调用"][1] == "CALL"
|
assert cat_map["子程序调用"][1] == "CALL"
|
||||||
|
|
||||||
@@ -154,7 +154,7 @@ def test_detect_keyword_mixed_case_whitespace_comments():
|
|||||||
matched_keywords = {r[2] for r in results}
|
matched_keywords = {r[2] for r in results}
|
||||||
assert "EXEC SQL" in matched_keywords
|
assert "EXEC SQL" in matched_keywords
|
||||||
assert "CALL" in matched_keywords
|
assert "CALL" in matched_keywords
|
||||||
assert "SORT ON KEY" in matched_keywords
|
assert any(r[0] == "SORT" for r in results) # SORT detected via regex
|
||||||
|
|
||||||
|
|
||||||
# ── 5. No keyword match and no LLM result → unknown ──
|
# ── 5. No keyword match and no LLM result → unknown ──
|
||||||
@@ -190,8 +190,8 @@ def test_detect_keyword_all_rules():
|
|||||||
("ALPHABETIC", "编码转换"),
|
("ALPHABETIC", "编码转换"),
|
||||||
("DFHCOMMAREA", "online"),
|
("DFHCOMMAREA", "online"),
|
||||||
("MAP", "online"),
|
("MAP", "online"),
|
||||||
("SORT ON KEY", "SORT"),
|
("SORT SORT-FILE ON KEY", "SORT"),
|
||||||
("MERGE ON KEY", "MERGE"),
|
("MERGE MERGE-FILE ON KEY", "MERGE"),
|
||||||
("WRITE AFTER", "编辑输出"),
|
("WRITE AFTER", "编辑输出"),
|
||||||
("WRITE BEFORE", "编辑输出"),
|
("WRITE BEFORE", "编辑输出"),
|
||||||
("ORGANIZATION IS", "文件编成"),
|
("ORGANIZATION IS", "文件编成"),
|
||||||
|
|||||||
@@ -121,24 +121,24 @@ def test_validation_vs_keybreak_unknown():
|
|||||||
# ═══════════════════════════════════════════════════════════════════════════
|
# ═══════════════════════════════════════════════════════════════════════════
|
||||||
|
|
||||||
def test_csv_merge_vs_split_merge():
|
def test_csv_merge_vs_split_merge():
|
||||||
"""STRING 存在 → CSV合并"""
|
"""STRING + 逗号分隔 → CSV合并"""
|
||||||
features = {"has_string": True, "has_inspect": False}
|
features = {"has_string": True, "has_csv_merge": True, "has_inspect": False}
|
||||||
result = resolve_csv_merge_vs_split(features)
|
result = resolve_csv_merge_vs_split(features)
|
||||||
assert result["resolved_type"] == "CSV合并"
|
assert result["resolved_type"] == "CSV合并"
|
||||||
assert result["confidence"] >= 0.70
|
assert result["confidence"] >= 0.70
|
||||||
|
|
||||||
|
|
||||||
def test_csv_merge_vs_split_split():
|
def test_csv_merge_vs_split_split():
|
||||||
"""INSPECT REPLACING 存在 → CSV拆分"""
|
"""INSPECT REPLACING + 逗号 → CSV拆分"""
|
||||||
features = {"has_string": False, "has_inspect": True}
|
features = {"has_string": False, "has_csv_split": True, "has_inspect": True}
|
||||||
result = resolve_csv_merge_vs_split(features)
|
result = resolve_csv_merge_vs_split(features)
|
||||||
assert result["resolved_type"] == "CSV拆分"
|
assert result["resolved_type"] == "CSV拆分"
|
||||||
assert result["confidence"] >= 0.70
|
assert result["confidence"] >= 0.70
|
||||||
|
|
||||||
|
|
||||||
def test_csv_merge_vs_split_both():
|
def test_csv_merge_vs_split_both():
|
||||||
"""两个都存在 → STRING 优先 (CSV合并)"""
|
"""CSV合并证据优先 → CSV合并"""
|
||||||
features = {"has_string": True, "has_inspect": True}
|
features = {"has_string": True, "has_csv_merge": True, "has_inspect": True, "has_csv_split": True}
|
||||||
result = resolve_csv_merge_vs_split(features)
|
result = resolve_csv_merge_vs_split(features)
|
||||||
assert result["resolved_type"] == "CSV合并"
|
assert result["resolved_type"] == "CSV合并"
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user