diff --git a/hina/classifier.py b/hina/classifier.py index cdb3636..febe2ba 100644 --- a/hina/classifier.py +++ b/hina/classifier.py @@ -12,22 +12,19 @@ from typing import Any # ── L1 规则 ────────────────────────────────────────────────────────────── # 格式: (分类名称, [关键字列表], 置信度阈值) L1_RULES: list[tuple[str, list[str], float]] = [ - ("DB操作", ["EXEC SQL"], 0.95), - ("子程序调用", ["CALL", "LINKAGE SECTION"], 0.90), + ("DB操作", ["re:\\s*(?:\n|^)\s*EXEC\s+SQL"], 0.95), + ("子程序调用", ["re:\\s*CALL\\s", "LINKAGE SECTION"], 0.90), ("IS INITIAL", ["IS INITIAL"], 0.99), - ("SYSIN", ["SYSIN"], 0.90), + ("SYSIN", ["re:\\s*ACCEPT\\s+\\S+\\s+FROM\\s+SYSIN"], 0.90), ("编码转换", ["ALPHABETIC", "ASCII", "EBCDIC"], 0.85), - ("online", ["DFHCOMMAREA", "MAP"], 0.95), + ("online", ["DFHCOMMAREA"], 0.95), ("SORT", ["re:SORT(?:\\s+\\S+)?\\s+ON\\s+(?:ASCENDING\\s+|DESCENDING\\s+)?KEY"], 0.95), ("MERGE", ["re:MERGE(?:\\s+\\S+)?\\s+ON\\s+(?:ASCENDING\\s+|DESCENDING\\s+)?KEY"], 0.95), ("替代索引", ["ALTERNATE RECORD KEY"], 0.99), ("编辑输出", ["re:WRITE\\s+\\S+\\s+AFTER\\s+", "re:WRITE\\s+\\S+\\s+BEFORE\\s+"], 0.80), ("文件编成", ["ORGANIZATION IS"], 0.99), ("マッチング", ["re:WS-[\\w-]*KEY"], 0.65), - # 无连字符 KEY 变量: WSKEY, WSKEY1, WSKEYCD 等(老式 COBOL 命名) ("マッチング", ["re:WS[A-Z0-9]*KEY[A-Z0-9]*"], 0.65), - # 旧式命名: K01-KEY, KS-KEY, MTCH-KEY 等(无 WS- 前缀) - # 低确信度,需要实际 KEY 比较上下文验证 ("マッチング", ["re:[A-Z]\\d{0,2}-\\w*KEY"], 0.55), ] diff --git a/test-data/cobol/category_cics/CI01_CICS.cbl b/test-data/cobol/category_cics/CI01_CICS.cbl index da7f5ca..3bc7eb7 100644 --- a/test-data/cobol/category_cics/CI01_CICS.cbl +++ b/test-data/cobol/category_cics/CI01_CICS.cbl @@ -6,7 +6,7 @@ ENVIRONMENT DIVISION. DATA DIVISION. WORKING-STORAGE SECTION. - 01 WS-COMMAREA. + 01 DFHCOMMAREA. 05 WS-CA-LENGTH PIC S9(4) COMP. 05 WS-CA-DATA PIC X(100). 01 WS-MAP-RECV. diff --git a/tests/hina/test_classifier_deep.py b/tests/hina/test_classifier_deep.py index e3b4d0f..2fbe3ce 100644 --- a/tests/hina/test_classifier_deep.py +++ b/tests/hina/test_classifier_deep.py @@ -34,11 +34,11 @@ def test_detect_keyword_multiple_matches(): # Verify confidence values per match cat_map = {r[0]: (r[1], r[2]) for r in results} assert cat_map["DB操作"][0] == 0.95 - assert cat_map["DB操作"][1] == "EXEC SQL" + assert cat_map["DB操作"][1].startswith("re:") # regex pattern, not literal assert cat_map["SORT"][0] == 0.95 assert cat_map["SORT"][1].startswith("re:SORT") # regex pattern assert cat_map["子程序调用"][0] == 0.90 - assert cat_map["子程序调用"][1] == "CALL" + assert cat_map["子程序调用"][1].startswith("re:") # regex pattern # ── 2. compute_confidence with hybrid (keyword + LLM) result ── @@ -151,8 +151,8 @@ def test_detect_keyword_mixed_case_whitespace_comments(): # Verify matched keywords were found (function uppercases source) matched_keywords = {r[2] for r in results} - assert "EXEC SQL" in matched_keywords - assert "CALL" in matched_keywords + assert any(r[0] == "DB操作" for r in results) # EXEC SQL via regex + assert any(r[0] == "子程序调用" for r in results) # CALL via regex assert any(r[0] == "SORT" for r in results) # SORT detected via regex @@ -182,13 +182,12 @@ def test_compute_confidence_no_match_no_llm(): def test_detect_keyword_all_rules(): """Each L1_RULE category is detectable from a representative keyword""" test_cases = [ - ("EXEC SQL", "DB操作"), - ("CALL", "子程序调用"), + (" EXEC SQL", "DB操作"), + (" CALL", "子程序调用"), ("IS INITIAL", "IS INITIAL"), - ("SYSIN", "SYSIN"), + (" ACCEPT WS-D FROM SYSIN", "SYSIN"), ("ALPHABETIC", "编码转换"), ("DFHCOMMAREA", "online"), - ("MAP", "online"), ("SORT SORT-FILE ON KEY", "SORT"), ("MERGE MERGE-FILE ON KEY", "MERGE"), ("WRITE OUT AFTER", "编辑输出"),