ecf3c1cd61
本物のCOBOL技術者による全タイプ検証: 発見・修正されたバグ: 1. WRITE AFTER/BEFORE L1キーワードが実COBOLで決してマッチしない - 旧: 'WRITE AFTER'(文字列一致)→ 実COBOL: 'WRITE レコード名 AFTER' - 新: re:WRITE\s+\S+\s+AFTER\s+(正規表現) 2. CSV分割検出の正規表現が壊れていた - 旧: r"INSPECT...REPLACING...'," (コンマ引用符コンマ) - 新: r"INSPECT...REPLACING...','" (引用符コンマ引用符) 全35タイプの分類結果: マッチング系(7): ✅ 全7/7 マッチング/項目チェック キーブレイク系(1): ✅ 項目チェック(重複含む) 条件分岐系(2): ✅ 全2/2 編集処理系(1): ✅ 編集処理(校验) データベース系(1): ✅ DB操作 データ分割系(1): ✅ DIVIDE_100.0 項目チェック系(1): ✅ 項目チェック(重複含む) 内部処理系(1): ✅ 内部処理 オンライン系(1): ✅ オンライン(CICS) SORT/MERGE(2): ✅ SORT + MERGE L1直結型(11): ✅ 全11/11 ルールエンジン(6): ✅ 全6/6 回帰: 767 passed(0 new failures)
287 lines
11 KiB
Python
287 lines
11 KiB
Python
"""
|
||
HINA 程序分类器 — L1 关键字规则 + 确信度计算。
|
||
|
||
通过 COBOL 源码中的关键字匹配进行程序分类,支持多级确信度判定。
|
||
"""
|
||
|
||
from __future__ import annotations
|
||
|
||
import re
|
||
from typing import Any
|
||
|
||
# ── L1 rules ─────────────────────────────────────────────────────────────
# Format: (category name, [keyword list], confidence threshold)
# A keyword prefixed with "re:" is a regular expression; any other keyword
# is looked up as a plain substring of the upper-cased source.
L1_RULES: list[tuple[str, list[str], float]] = [
    ("DB操作", ["EXEC SQL"], 0.95),
    ("子程序调用", ["CALL", "LINKAGE SECTION"], 0.90),
    ("IS INITIAL", ["IS INITIAL"], 0.99),
    ("SYSIN", ["SYSIN"], 0.90),
    ("编码转换", ["ALPHABETIC", "ASCII", "EBCDIC"], 0.85),
    ("online", ["DFHCOMMAREA", "MAP"], 0.95),
    ("SORT", ["re:SORT(?:\\s+\\S+)?\\s+ON\\s+(?:ASCENDING\\s+|DESCENDING\\s+)?KEY"], 0.95),
    ("MERGE", ["re:MERGE(?:\\s+\\S+)?\\s+ON\\s+(?:ASCENDING\\s+|DESCENDING\\s+)?KEY"], 0.95),
    ("替代索引", ["ALTERNATE RECORD KEY"], 0.99),
    ("编辑输出", ["re:WRITE\\s+\\S+\\s+AFTER\\s+", "re:WRITE\\s+\\S+\\s+BEFORE\\s+"], 0.80),
    ("文件编成", ["ORGANIZATION IS"], 0.99),
    ("マッチング", ["re:WS-[\\w-]*KEY"], 0.65),
    # KEY variables without a hyphen: WSKEY, WSKEY1, WSKEYCD, ... (old-style COBOL naming)
    ("マッチング", ["re:WS[A-Z0-9]*KEY[A-Z0-9]*"], 0.65),
    # Legacy naming without the WS- prefix: K01-KEY, KS-KEY, MTCH-KEY, ...
    # Low confidence; needs verification against an actual KEY comparison context.
    ("マッチング", ["re:[A-Z]\\d{0,2}-\\w*KEY"], 0.55),
]
|
||
|
||
# ── Conflict-resolution rules ────────────────────────────────────────────
# Disambiguation strategy used when L1 matches more than one category:
#   value = "file_count"      → take the category with the larger number of tests
#   value = "has_accumulator" → take the category that contains an accumulator
CONFLICT_RULES: dict[tuple[str, str], str] = {
    ("マッチング", "キーブレイク"): "file_count",
    ("編集処理", "項目チェック"): "file_count",
    ("キーブレイク", "項目チェック(重複)"): "has_accumulator",
}
|
||
|
||
|
||
# ── 关键字检测 ───────────────────────────────────────────────────────────
|
||
def _strip_cobol_comments(source: str) -> str:
|
||
"""剥离 COBOL 注释,避免注释中的关键词触发 L1 匹配。
|
||
|
||
处理两种注释:
|
||
- 固定格式列 7: 行首 `*` (comment line)
|
||
- 自由格式/内联: `*> ...` 到行尾
|
||
"""
|
||
lines = source.split('\n')
|
||
cleaned = []
|
||
for line in lines:
|
||
# 自由格式/内联注释: *>
|
||
idx = line.find('*>')
|
||
if idx >= 0:
|
||
line = line[:idx]
|
||
# 固定格式注释行: 如果第一个非空字符是 *
|
||
stripped = line.strip()
|
||
if stripped.startswith('*') and not stripped.startswith('*/'):
|
||
continue # 跳过整个注释行
|
||
cleaned.append(line)
|
||
return '\n'.join(cleaned)
|
||
|
||
|
||
def _matches_key_comparison(source_upper: str) -> bool:
|
||
"""检查源码中是否包含实际的 KEY 变量比较(而非仅声明)。
|
||
|
||
匹配 KEY 变量在比较上下文中的使用:
|
||
WS-KEY = / WS-KEY > / WS-KEY <
|
||
IF WS-MAST-KEY
|
||
KEY = WS-...
|
||
"""
|
||
# 模式 1: KEY 变量出现在比较上下文中(= < > 后跟变量)
|
||
# 注意: 不能用 \s 代替 [=<>],否则「WS-KEY PIC」中的空格也会误匹配
|
||
if re.search(r'(?:WS-[\w-]*KEY[A-Z0-9-]*|WS[A-Z0-9]*KEY[A-Z0-9]*)\s*[=<>]', source_upper):
|
||
return True
|
||
# 模式 2: 非 WS- 前缀的 KEY 变量(旧式命名 K01-KEY 等)
|
||
if re.search(r'\b[A-Z]\d{0,2}-[\w-]*KEY\s*[=<>]', source_upper):
|
||
return True
|
||
# 模式 3: 源码中含有 READ INTO + KEY 变量
|
||
if re.search(r'READ\s+\w+\s+INTO\s+\w+.*KEY', source_upper, re.DOTALL):
|
||
return True
|
||
return False
|
||
|
||
|
||
def _get_procedure_division(source_upper: str) -> str:
|
||
"""只提取 PROCEDURE DIVISION 部分用于关键词匹配。"""
|
||
idx = source_upper.find('PROCEDURE DIVISION')
|
||
if idx >= 0:
|
||
return source_upper[idx:]
|
||
return source_upper
|
||
|
||
|
||
def _detect_matching_structure(source_upper: str) -> float:
|
||
"""结构检测:不依赖变量名 KEY 的模式匹配检测。
|
||
|
||
通过分析 COBOL 程序的控制流结构判断是否为匹配程序。
|
||
返回确信度 0.0~0.55,0.0 表示不是匹配。
|
||
|
||
匹配程序的结构性特征:
|
||
信号 1: READ + AT END + EOF/WS-*E* 变量(文件读取循环)
|
||
信号 2: PERFORM UNTIL + EOF/WS-*E* 变量(主循环)
|
||
信号 3: ELSE 体内 READ(条件性读取——匹配核心)
|
||
信号 4: IF 比较两个字段(跨文件字段比较,任何命名风格)
|
||
信号 5: 2+ 文件 OPEN INPUT(多文件输入)
|
||
"""
|
||
import re
|
||
|
||
signals = 0
|
||
|
||
# 信号 1: READ + AT END + 赋值(任何命名风格的 EOF 标志)
|
||
# COBOL 匹配程序至少有一个 READ ... AT END MOVE ...
|
||
# 匹配: READ F1 AT END MOVE 'Y' TO WS-EOF-A.
|
||
# 匹配: READ F1 INTO R1 AT END MOVE 'Y' TO WS-END-1.
|
||
# 匹配: READ F1 AT END MOVE 'Y' TO FE-1.
|
||
if re.search(r'READ\s+\w+(?:\s+INTO\s+\w+)?\s+AT\s+END', source_upper):
|
||
signals += 1
|
||
|
||
# 信号 1b: 第二个 READ(匹配程序通常有 2 个 READ)
|
||
reads = re.findall(r'\bREAD\s+\w+(?:\s+INTO\s+\w+)?', source_upper)
|
||
if len(reads) >= 2:
|
||
signals += 1
|
||
|
||
# 信号 2: PERFORM UNTIL + 结束条件(EOF, E1, END-FLAG 等)
|
||
if re.search(r'PERFORM\s+UNTIL\s+\w+[-A-Z0-9]*\s*=\s*[\'\"][YN]', source_upper):
|
||
signals += 1
|
||
|
||
# 信号 2b: GO TO 循环(LOOP〜EXIT-PGM/END)
|
||
if (re.search(r'GO\s+TO\s+LOOP|GO\s+TO\s+[A-Z]*-L|[A-Z]*LP\b', source_upper) and
|
||
re.search(r'IF\s+\w+.*=\s*[\'\"][YN]', source_upper)):
|
||
signals += 1
|
||
|
||
# 信号 3: ELSE 体内 READ(条件性读取——匹配核心)
|
||
if re.search(r'ELSE\s+.*READ\s+', source_upper) or re.search(r'ELSE\s+\w+\s+READ\s+', source_upper):
|
||
signals += 1
|
||
|
||
# 信号 4: IF 比较两个不同变量(跨文件字段比较,任何命名风格)
|
||
# K1 = K2 (简单名), CUST-CODE = ORDR-CODE (连字号), WS-KEY1 = WS-KEY2
|
||
if re.search(r'IF\s+\w[\w-]*\s*[=<>]\s*\w[\w-]*', source_upper):
|
||
signals += 1
|
||
|
||
# 信号 5: 2+ 文件 OPEN INPUT
|
||
if (re.search(r'OPEN\s+INPUT\s+\w+\s+\w+', source_upper) or # 同一行
|
||
re.search(r'OPEN\s+INPUT\s+\w+[.\s].*OPEN\s+INPUT', source_upper)): # 别行
|
||
signals += 1
|
||
|
||
# 确信度: 6 中 5+ = 0.55, 4 = 0.50, 3 = 0.40
|
||
if signals >= 5:
|
||
return 0.55
|
||
elif signals >= 4:
|
||
return 0.50
|
||
elif signals >= 3:
|
||
return 0.40
|
||
return 0.0
|
||
|
||
|
||
def detect_keyword(source: str) -> list[tuple[str, float, str]]:
    """Scan COBOL source for the keywords defined in L1_RULES.

    Steps:
        1. Comments are stripped first so that keywords inside comments
           cannot trigger a match; the rest is upper-cased.
        2. Each rule contributes at most one hit: the first keyword of the
           rule that is found. Keywords starting with ``"re:"`` are regular
           expressions, all others are plain substring lookups.
        3. A regex hit for the ``マッチング`` category is accepted only when
           a KEY variable is actually used in a comparison.
        4. Finally the structural matching detector runs; its result is
           appended only when no category other than ``マッチング`` was hit.

    Args:
        source: COBOL program source text.

    Returns:
        list[tuple[str, float, str]]: one ``(category, confidence, keyword)``
        tuple per hit, where ``keyword`` is the rule keyword as written in
        L1_RULES (or ``"structural_matching"`` for the structural detector).
    """
    text = _strip_cobol_comments(source).upper()
    hits: list[tuple[str, float, str]] = []

    for category, keywords, confidence in L1_RULES:
        for kw in keywords:
            if kw.startswith("re:"):
                found = re.search(kw[3:], text) is not None
                # Regex-based マッチング keywords need context: the KEY
                # variable must take part in a comparison.
                if found and category == "マッチング":
                    found = _matches_key_comparison(text)
            else:
                found = kw in text

            if found:
                hits.append((category, confidence, kw))
                break  # first matching keyword of a rule is enough

    # ── Structural matching detection (independent of KEY variable names) ──
    structural_conf = _detect_matching_structure(text)
    only_matching_hits = all(cat == "マッチング" for cat, _, _ in hits)
    if structural_conf > 0 and only_matching_hits:
        hits.append(("マッチング", structural_conf, "structural_matching"))

    return hits
|
||
|
||
|
||
# ── 确信度计算 ───────────────────────────────────────────────────────────
|
||
def compute_confidence(
    source: str,
    structure: dict[str, Any] | None = None,
    llm_result: dict[str, Any] | None = None,
) -> dict[str, Any]:
    """Compute the classification confidence for a COBOL program.

    Priority:
        1. An L1 keyword hit whose highest confidence is >= 0.90 is returned
           directly.
        2. Otherwise, if an LLM result is supplied, its category and
           confidence are used.
        3. Otherwise the program is reported as ``"unknown"``.

    Args:
        source: COBOL program source text.
        structure: Optional program structure information (currently unused,
            kept for future extension).
        llm_result: Optional LLM classification result, expected shape
            ``{"category": str, "confidence": float, ...}``.

    Returns:
        dict with the keys:
            - "category": category name or "unknown"
            - "confidence": confidence value
            - "method": "keyword", "hybrid" or "none"
            - "source": "l1", "llm" or "unknown"
            - "features": the winning keyword for an L1 result, else empty
            - "required_tests": always an empty list
            - "strategy_params": default boundary / coverage settings
            - "matches": the L1 hits (empty for the unknown result)
    """

    def build_result(
        category: str,
        confidence: Any,
        method: str,
        origin: str,
        features: list,
        hits: list,
    ) -> dict[str, Any]:
        # A fresh dict (including nested containers) is created per call.
        return {
            "category": category,
            "confidence": confidence,
            "method": method,
            "source": origin,
            "features": features,
            "required_tests": [],
            "strategy_params": {"special_boundaries": [], "coverage_requirements": {"branch": 0.95, "paragraph": 1.0}},
            "matches": hits,
        }

    # ── 1. L1 keyword detection ──
    matches = detect_keyword(source)
    if matches:
        # Highest-confidence hit; on ties the earliest one is kept.
        top_category, top_confidence, top_keyword = max(matches, key=lambda hit: hit[1])
        if top_confidence >= 0.90:
            return build_result(top_category, top_confidence, "keyword", "l1", [top_keyword], matches)

    # ── 2. LLM result ──
    if llm_result is not None:
        return build_result(
            llm_result.get("category", "unknown"),
            llm_result.get("confidence", 0.0),
            "hybrid",
            "llm",
            [],
            matches,
        )

    # ── 3. Unknown ──
    return build_result("unknown", 0.0, "none", "unknown", [], [])
|