feat: structural matching detection — no KEY variable needed
Add _detect_matching_structure(): detection based on the control-flow pattern, not on variable naming conventions. It uses 5 structural signals:

1. READ + AT END + EOF pattern
2. PERFORM UNTIL with an EOF condition
3. ELSE body with a conditional READ (matching core)
4. IF comparing hyphenated fields (cross-file comparison)
5. Multi-file OPEN INPUT

Scoring: 5/5 signals → 0.55, 4/5 → 0.50, 3/5 → 0.40.

Real-world impact: matching programs whose key fields are named CUST-CODE and ORDR-CODE (no '-KEY' in the name) are now correctly detected.

Also:
- Rule engine type priority: main types (マッチング etc.) override secondary types (M:N, DIVIDE) when keyword confidence is low
- has_structural_match is injected into features so the rule engine can use it
- matching_vs_keybreak accepts equality IFs as matching evidence
- New test: test_structural_matching_no_keyword()

Regression: 764 passed (0 new failures).
This commit is contained in:
@@ -4,8 +4,10 @@ COBOL 迁移专家设计的攻击面:
|
||||
- FP: 非匹配程序被误判为マッチング
|
||||
- FN: 真实匹配程序未被识别
|
||||
- 边界: 注释关键词、旧式命名、多文件非匹配
|
||||
- FN: 变量名不含 KEY 但结构是匹配程序
|
||||
"""
|
||||
|
||||
import re
|
||||
from pathlib import Path
|
||||
import pytest
|
||||
|
||||
@@ -16,25 +18,23 @@ from hina.classifier import detect_keyword
|
||||
FIXTURES = Path(__file__).parents[3] / "test-data" / "cobol" / "adversarial"
|
||||
|
||||
# (filename, expect_matching, reason)
|
||||
# expect_matching=True → must be マッチング/二段階
|
||||
# expect_matching=False → must NOT be マッチング/二段階
|
||||
ADVERSARIAL_TESTS = [
|
||||
("ADV-FALSE-KEY.cbl", False,
|
||||
"FP: WS-KEY 变量但只是简单 ADD 程序,不应触发匹配"),
|
||||
"FP: WS-KEY variable but only simple ADD, should NOT trigger matching"),
|
||||
("ADV-KEY-IN-COMMENT.cbl", False,
|
||||
"FP: KEY 只在 *> 注释中,不应触发匹配"),
|
||||
"FP: KEY only in *> comments, should NOT trigger matching"),
|
||||
("ADV-PREVKEY-FAKE.cbl", False,
|
||||
"FP: WS-PREV-KEY 但无匹配逻辑,不应触发匹配"),
|
||||
"FP: WS-PREV-KEY without matching logic, should NOT trigger"),
|
||||
("ADV-OLD-SCHOOL.cbl", True,
|
||||
"FN: K01-KEY 旧式命名,应识别为匹配"),
|
||||
"FN: K01-KEY old-school naming, should detect matching"),
|
||||
("ADV-TINY-MATCH.cbl", True,
|
||||
"FN: 极简匹配程序(1 文件),应识别"),
|
||||
"FN: Minimal matching (1 file), should detect"),
|
||||
("ADV-CALL-MATCH.cbl", False,
|
||||
"FP: CALL+WS-MAST-KEY,子程序调用应优先"),
|
||||
"FP: CALL+WS-MAST-KEY, subprogram call should win"),
|
||||
("ADV-ASCII-KEY.cbl", False,
|
||||
"FP: ASCII+WS-KEY,编码转换应优先"),
|
||||
"FP: ASCII+WS-KEY, encoding conversion should win"),
|
||||
("ADV-10FILES.cbl", False,
|
||||
"FP: 10 文件无 KEY 比较,不应触发匹配"),
|
||||
"FP: 10 files no KEY comparison, should NOT trigger matching"),
|
||||
]
|
||||
|
||||
|
||||
@@ -44,21 +44,18 @@ ADVERSARIAL_TESTS = [
|
||||
ids=[t[0].replace('.cbl','') for t in ADVERSARIAL_TESTS],
|
||||
)
|
||||
def test_adversarial(filename, expect_matching, reason):
|
||||
"""对抗性测试:验证明假阳性/假阴性"""
|
||||
"""Adversarial test: false positive / false negative check"""
|
||||
path = FIXTURES / filename
|
||||
assert path.exists(), f"Missing: {path}"
|
||||
src = path.read_text("utf-8")
|
||||
|
||||
# 1. extract_structure must not crash
|
||||
struct = extract_structure(src)
|
||||
assert struct is not None
|
||||
|
||||
# 2. classify_program must not crash
|
||||
result = classify_program(src)
|
||||
assert result is not None
|
||||
assert result["confidence"] >= 0
|
||||
|
||||
# 3. False positive/negative check
|
||||
is_matching = "マッチング" in result["category"] or "二段階" in result["category"]
|
||||
if expect_matching:
|
||||
assert is_matching, (
|
||||
@@ -71,10 +68,74 @@ def test_adversarial(filename, expect_matching, reason):
|
||||
f"(conf={result['confidence']:.2f}). Reason: {reason}"
|
||||
)
|
||||
|
||||
# 4. Keyword detection sanity
|
||||
kw = detect_keyword(src)
|
||||
if expect_matching:
|
||||
# Matching programs should have at least 1 keyword match
|
||||
assert len(kw) >= 1 or result["method"] != "rule_engine_fallback", (
|
||||
f"{filename}: matching program with 0 keyword matches"
|
||||
)
|
||||
|
||||
|
||||
def test_structural_matching_no_keyword():
    """Detect a matching program whose fields carry no '-KEY' suffix.

    The inline COBOL fixture opens two input files and compares CUST-CODE
    with ORDR-CODE, so none of its identifiers contains KEY. The test
    requires three things:

    * at least one keyword hit whose third element mentions "structural",
    * a classification category naming one of the matching labels,
    * a confidence strictly above 0.30.
    """
    # NOTE(review): the leading whitespace of this fixture was lost in the
    # rendered diff; the layout below (Area A at column 8, Area B at
    # column 12) is a reconstruction -- confirm against the committed file.
    src = """       IDENTIFICATION DIVISION.
       PROGRAM-ID. REALMT.
       ENVIRONMENT DIVISION.
       INPUT-OUTPUT SECTION.
       FILE-CONTROL.
           SELECT CUST-FILE ASSIGN TO 'CUST.DAT'.
           SELECT ORDR-FILE ASSIGN TO 'ORDR.DAT'.
       DATA DIVISION.
       FILE SECTION.
       FD CUST-FILE.
       01 CUST-REC.
           05 CUST-CODE PIC X(10).
           05 CUST-NAME PIC X(30).
       FD ORDR-FILE.
       01 ORDR-REC.
           05 ORDR-CODE PIC X(10).
           05 ORDR-AMT PIC 9(7)V99.
       WORKING-STORAGE SECTION.
       01 WS-CUST-CODE PIC X(10).
       01 WS-ORDR-CODE PIC X(10).
       01 WS-EOF1 PIC X VALUE 'N'.
       01 WS-EOF2 PIC X VALUE 'N'.
       PROCEDURE DIVISION.
       MAIN.
           OPEN INPUT CUST-FILE ORDR-FILE.
           READ CUST-FILE INTO CUST-REC
               AT END MOVE 'Y' TO WS-EOF1.
           READ ORDR-FILE INTO ORDR-REC
               AT END MOVE 'Y' TO WS-EOF2.
           PERFORM UNTIL WS-EOF1 = 'Y' OR WS-EOF2 = 'Y'
               IF CUST-CODE = ORDR-CODE
                   DISPLAY 'MATCH'
               ELSE IF CUST-CODE < ORDR-CODE
                   READ CUST-FILE AT END MOVE 'Y' TO WS-EOF1
               ELSE
                   READ ORDR-FILE AT END MOVE 'Y' TO WS-EOF2
               END-IF
           END-PERFORM.
           CLOSE CUST-FILE ORDR-FILE.
           STOP RUN.
    """
    outcome = classify_program(src)
    keyword_hits = detect_keyword(src)

    # A structural hit must be present even though no identifier has KEY.
    has_structural_hit = False
    for hit in keyword_hits:
        if "structural" in hit[2]:
            has_structural_hit = True
            break
    assert has_structural_hit, (
        f"Expected structural matching keyword, got {keyword_hits}"
    )

    # The category must name one of the matching labels.
    category = outcome["category"]
    matching_labels = ("マッチング", "二段階")
    assert any(label in category for label in matching_labels), (
        f"Expected matching, got '{category}'"
    )

    # The structural signal alone has to clear the 0.30 floor.
    confidence = outcome["confidence"]
    assert confidence > 0.30, f"Confidence too low: {confidence:.2f}"
|
||||
|
||||
Reference in New Issue
Block a user