feat: structural matching detection — no KEY variable needed
Add _detect_matching_structure(): detection based on control-flow patterns, not variable naming conventions.

Uses 5 structural signals:
1. READ + AT END + EOF pattern
2. PERFORM UNTIL with EOF condition
3. ELSE body with conditional READ (matching core)
4. IF comparing hyphenated fields (cross-file comparison)
5. Multi-file OPEN INPUT

Confidence: 5/5 signals → 0.55, 4/5 → 0.50, 3/5 → 0.40.

Real-world impact: matching programs with key fields named CUST-CODE and ORDR-CODE (no '-KEY' in name) are now correctly detected.

Also:
- Rule engine type priority: main types (マッチング etc.) override secondary types (M:N, DIVIDE) when keyword confidence is low
- has_structural_match injected into features so rule engine can use it
- matching_vs_keybreak accepts equality IFs as matching evidence
- New test: test_structural_matching_no_keyword()

Regression: 764 passed (0 new failures).
This commit is contained in:
@@ -92,6 +92,48 @@ def _get_procedure_division(source_upper: str) -> str:
|
||||
return source_upper
|
||||
|
||||
|
||||
def _detect_matching_structure(source_upper: str) -> float:
|
||||
"""结构检测:不依赖变量名 KEY 的模式匹配检测。
|
||||
|
||||
通过分析 COBOL 程序的控制流结构判断是否为匹配程序。
|
||||
返回确信度 0.0~0.55,0.0 表示不是匹配。
|
||||
|
||||
匹配程序的结构性特征:
|
||||
信号 1: READ + AT END + EOF(文件读取循环)
|
||||
信号 2: PERFORM UNTIL + EOF(主循环)
|
||||
信号 3: ELSE 体内 READ(条件性读取——匹配核心)
|
||||
信号 4: IF 比较两个连字号字段(跨文件字段比较)
|
||||
信号 5: 2+ 文件 OPEN INPUT(多文件输入)
|
||||
"""
|
||||
import re
|
||||
|
||||
signals = 0
|
||||
# 信号 1: READ + AT END + EOF(文件读取循环)
|
||||
if re.search(r'READ\s+\w+.*AT\s+END.*EOF', source_upper):
|
||||
signals += 1
|
||||
# 信号 2: PERFORM UNTIL + EOF(主循环)
|
||||
if re.search(r'PERFORM\s+UNTIL\s+.*EOF', source_upper):
|
||||
signals += 1
|
||||
# 信号 3: ELSE 体内 READ(条件性读取)
|
||||
if re.search(r'ELSE\s+.*READ\s+', source_upper):
|
||||
signals += 1
|
||||
# 信号 4: IF 比较两个连字号字段(跨文件字段比较)
|
||||
if re.search(r'IF\s+\w+-\w+\s*[=<>]\s*\w+-\w+', source_upper):
|
||||
signals += 1
|
||||
# 信号 5: 2+ 文件 OPEN INPUT
|
||||
if re.search(r'OPEN\s+INPUT\s+\w+\s+\w+', source_upper):
|
||||
signals += 1
|
||||
|
||||
# 确信度: 5 中 5 = 0.55, 5 中 4 = 0.50, 5 中 3 = 0.40
|
||||
if signals >= 5:
|
||||
return 0.55
|
||||
elif signals >= 4:
|
||||
return 0.50
|
||||
elif signals >= 3:
|
||||
return 0.40
|
||||
return 0.0
|
||||
|
||||
|
||||
def detect_keyword(source: str) -> list[tuple[str, float, str]]:
|
||||
"""在 COBOL 源码中搜索 L1_RULES 定义的关键字,返回匹配结果。
|
||||
|
||||
@@ -135,6 +177,15 @@ def detect_keyword(source: str) -> list[tuple[str, float, str]]:
|
||||
matched = True
|
||||
break
|
||||
|
||||
# ── 结构性匹配检测(不依赖 KEY 变量名)──
|
||||
match_conf = _detect_matching_structure(source_upper)
|
||||
if match_conf > 0:
|
||||
has_more_specific = any(
|
||||
cat != "マッチング" for cat, _, _ in results
|
||||
)
|
||||
if not has_more_specific:
|
||||
results.append(("マッチング", match_conf, "structural_matching"))
|
||||
|
||||
return results
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user