65e9919933
Three-part fix for matching program classification:
1. L1 regex keyword WS-[-\w]*KEY (confidence 0.65):
- Captures WS-KEY, WS-MAST-KEY, WS-TRAN-KEY, WS-PREV-KEY etc.
- Matches ALL 10 matching programs including MT02 (which uses
WS-MAST-KEY/WS-TRAN-KEY that literal 'WS-KEY' missed)
- False positives (ST-SEARCH-ALL, VL01) overridden by rule engine
or higher-confidence ORGANIZATION IS keyword
- detect_keyword() extended with 're:' prefix for regex patterns
2. Consensus bonus in compute_confidence_v2:
- When L1 keyword category matches rule engine's final category,
context_factor boosted by +0.15
- Pushes matching programs from manual (0.50-0.69) toward
review (0.70-0.89) range
3. Confidence calibration for confusion groups (previous commit):
- dedup_vs_nodedup: 0.85→0.50 for negative detection
- validation_vs_keybreak: 0.80→0.55 for has_counter
- simple_vs_two_stage: 0.80→0.50 for sequential OPEN
Results - matching programs:
MT01: 0.38→0.75, MT02: 0.30→0.60, MT03: 0.30→0.60,
MT16: 0.45→0.81, MT17: 0.36→0.65, MT18: 0.60→0.60,
MT19: 0.30→0.60, MT20: 0.30→0.65, MT33: 0.30→0.60
All now rule_engine (not fallback), no false negatives.
Subtype discrimination remains for future work: all matching
programs classified as マッチング without 1:1/1:N/N:1 subtype.
146 lines
5.7 KiB
Python
146 lines
5.7 KiB
Python
"""
|
|
HINA 程序分类器 — L1 关键字规则 + 确信度计算。
|
|
|
|
通过 COBOL 源码中的关键字匹配进行程序分类,支持多级确信度判定。
|
|
"""
|
|
|
|
from __future__ import annotations
|
|
|
|
import re
|
|
from typing import Any
|
|
|
|
# ── L1 rules ─────────────────────────────────────────────────────────────
# Entry format: (category name, [keywords], confidence reported on a hit).
# A keyword that starts with "re:" is treated by detect_keyword() as a
# regular expression; every other keyword is matched as a plain substring.
# Matching is done against the upper-cased source, so keywords are written
# in upper case.
L1_RULES: list[tuple[str, list[str], float]] = [
    ("DB操作", ["EXEC SQL"], 0.95),
    ("子程序调用", ["CALL", "LINKAGE SECTION"], 0.90),
    ("IS INITIAL", ["IS INITIAL"], 0.99),
    ("SYSIN", ["SYSIN"], 0.90),
    ("编码转换", ["ALPHABETIC", "ASCII", "EBCDIC"], 0.85),
    ("online", ["DFHCOMMAREA", "MAP"], 0.95),
    ("SORT", ["SORT ON KEY"], 0.95),
    ("MERGE", ["MERGE ON KEY"], 0.95),
    ("编辑输出", ["WRITE AFTER", "WRITE BEFORE"], 0.80),
    ("文件编成", ["ORGANIZATION IS"], 0.99),
    ("替代索引", ["ALTERNATE RECORD KEY"], 0.99),
    # The character class includes "-" so that hyphenated names such as
    # WS-MAST-KEY / WS-TRAN-KEY match as well as the plain WS-KEY.
    ("マッチング", ["re:WS-[-\\w]*KEY"], 0.65),
]
|
|
|
|
# ── Conflict-resolution rules ────────────────────────────────────────────
# Disambiguation strategy used when L1 matches more than one category.
# Keys are (category, category) pairs; values name the strategy:
#   value = "file_count"      → take the category with the larger test count
#   value = "has_accumulator" → take the category that contains an accumulator
# NOTE(review): the strategy is named "file_count" while its description
# speaks of a test count — confirm which quantity the consumer compares.
CONFLICT_RULES: dict[tuple[str, str], str] = {
    ("マッチング", "キーブレイク"): "file_count",
    ("編集処理", "項目チェック"): "file_count",
    ("キーブレイク", "項目チェック(重複)"): "has_accumulator",
}
|
|
|
|
|
|
# ── Keyword detection ────────────────────────────────────────────────────
def detect_keyword(
    source: str,
    rules: list[tuple[str, list[str], float]] | None = None,
) -> list[tuple[str, float, str]]:
    r"""Search COBOL source for L1 keywords and return one hit per category.

    The source is upper-cased before matching, so matching is effectively
    case-insensitive for upper-case keywords. A keyword prefixed with "re:"
    is applied as a regular expression via ``re.search`` (for example
    ``re:WS-[-\w]*KEY`` matches WS-KEY and WS-MAST-KEY); any other keyword
    is matched as a plain substring. For each rule only the first keyword
    that hits is reported.

    Args:
        source: COBOL program source text.
        rules: Optional rule table in the same format as ``L1_RULES``,
            i.e. ``(category, [keywords], confidence)``. When omitted
            (``None``) the module-level ``L1_RULES`` table is used.

    Returns:
        A list of ``(category, confidence, keyword)`` tuples in rule order.
        ``keyword`` is the rule's keyword exactly as written in the table,
        including the "re:" prefix for regex keywords. Empty when nothing
        matches.
    """
    # `is None` (not `or`) so that an explicitly empty table stays empty.
    active_rules = L1_RULES if rules is None else rules
    source_upper = source.upper()

    results: list[tuple[str, float, str]] = []
    for category, keywords, confidence in active_rules:
        for kw in keywords:
            if kw.startswith("re:"):
                hit = re.search(kw[3:], source_upper) is not None
            else:
                hit = kw in source_upper
            if hit:
                results.append((category, confidence, kw))
                # One hit per category is enough; skip remaining keywords.
                break
    return results
|
|
|
|
|
|
# ── Confidence computation ───────────────────────────────────────────────
def _classification_result(
    category: str,
    confidence: float,
    method: str,
    origin: str,
    features: list[str],
    matches: list[tuple[str, float, str]],
) -> dict[str, Any]:
    """Build the result dict shared by every branch of compute_confidence()."""
    return {
        "category": category,
        "confidence": confidence,
        "method": method,
        "source": origin,
        "features": features,
        "required_tests": [],
        # Built fresh on every call so callers may mutate it safely.
        "strategy_params": {
            "special_boundaries": [],
            "coverage_requirements": {"branch": 0.95, "paragraph": 1.0},
        },
        "matches": matches,
    }


def compute_confidence(
    source: str,
    structure: dict[str, Any] | None = None,
    llm_result: dict[str, Any] | None = None,
) -> dict[str, Any]:
    """Compute the classification and its confidence for a program.

    Priority:
        1. An L1 keyword hit whose highest confidence is >= 0.90 → return
           the L1 result directly.
        2. Otherwise, if an LLM result is supplied → use its category and
           confidence.
        3. Otherwise → return "unknown" with confidence 0.0.

    Args:
        source: COBOL program source text.
        structure: Optional program structure information (currently
            unused; kept for future extension).
        llm_result: Optional LLM classification result. Expected format:
            {"category": str, "confidence": float, ...}. Missing keys
            default to "unknown" and 0.0 respectively.

    Returns:
        dict with the keys:
            - "category": str — category name or "unknown"
            - "confidence": float — confidence (0.0 ~ 1.0)
            - "method": str — "keyword" / "hybrid" / "none"
            - "source": str — origin of the result ("l1" / "llm" / "unknown")
            - "features": list — the winning keyword for an L1 result,
              otherwise empty
            - "required_tests": list — always empty here
            - "strategy_params": dict — default boundary and coverage
              requirements
            - "matches": list — every L1 keyword hit, including hits below
              the 0.90 threshold (also reported for the "unknown" result)
    """
    # ── 1. L1 keyword detection ──
    matches = detect_keyword(source)

    if matches:
        # max() keeps the first entry on ties, i.e. the earliest rule wins.
        best_category, best_confidence, best_keyword = max(
            matches, key=lambda m: m[1]
        )
        if best_confidence >= 0.90:
            return _classification_result(
                best_category,
                best_confidence,
                "keyword",
                "l1",
                [best_keyword],
                matches,
            )

    # ── 2. LLM result ──
    if llm_result is not None:
        return _classification_result(
            llm_result.get("category", "unknown"),
            llm_result.get("confidence", 0.0),
            "hybrid",
            "llm",
            [],
            matches,
        )

    # ── 3. Unknown ──
    # Low-confidence L1 hits are still reported in "matches" so callers can
    # see why the program could not be classified.
    return _classification_result("unknown", 0.0, "none", "unknown", [], matches)
|