feat: matching program subtype discrimination (1:1/1:N/M:N/MxN)
Add a `_resolve_matching_subtype` post-processing step in `classify_program()` that distinguishes matching-program subtypes based on key-variable naming patterns and file/structural features.

Rules (in priority order):
1. 二段階 → 二段階 (already handled by rule engine)
2. 3 files + WS-SAVE-KEY → M:N→MxN (MT20)
3. WS-PREV-KEY present → 混合 (already handled, MT32)
4. WS-MAST-KEY + WS-TRAN-KEY → 1:N (MT02)
5. >=3 KEY vars + >=2 files → M:N (MT33)
6. Otherwise → 1:1 (MT01, MT03, MT18, MT19)

Results: MT01→1:1, MT02→1:N, MT03→1:1, MT16/17→二段階, MT18/19→1:1, MT20→M:N→MxN, MT33→M:N

Also fix the double-backslash regex bug in classifier.py and pipeline.py (r'[-\w]' should be r'[\w-]' for the word-character class).

Regression: 745 passed (unchanged).
This commit is contained in:
@@ -334,6 +334,73 @@ def _path_llm_assisted(
|
||||
|
||||
# ── 主入口 ────────────────────────────────────────────────────────────────────
|
||||
|
||||
# ── 匹配子类型解析 ──────────────────────────────────────────────────────────
|
||||
|
||||
_MATCHING_SUBTYPE_RULES = [
|
||||
# (match_fn, subtype)
|
||||
# 按优先级从高到低排列
|
||||
]
|
||||
|
||||
|
||||
def _resolve_matching_subtype(
|
||||
result: dict,
|
||||
cobol_source: str,
|
||||
structure: dict,
|
||||
) -> dict:
|
||||
"""匹配程序的子类型区分后处理。
|
||||
|
||||
在 classify_program 判定为 マッチング 后,进一步区分子类型:
|
||||
- 1:1 マッチング / 1:N / N:1 / M:N / M:N→M 等
|
||||
|
||||
Args:
|
||||
result: classify_program 的返回结果。
|
||||
cobol_source: 原始 COBOL 源码。
|
||||
structure: extract_structure 的返回结构。
|
||||
|
||||
Returns:
|
||||
更新后的 result,增加 "subtype" 字段。
|
||||
"""
|
||||
category = result.get("category", "")
|
||||
if "マッチング" not in category and "キーブレイク" not in category:
|
||||
return result # 非匹配程序不做子类型区分
|
||||
|
||||
src_upper = cobol_source.upper()
|
||||
import re
|
||||
|
||||
# 0. 二段階マッチング — 已在规则引擎中处理
|
||||
if "二段階" in category:
|
||||
result["subtype"] = "二段階"
|
||||
return result
|
||||
|
||||
# 1. M:N→MxN 直積 — 特征: WRITE + WS-SAVE-KEY + 3 文件
|
||||
if structure.get("file_count", 0) >= 3 and 'WS-SAVE' in src_upper:
|
||||
result["subtype"] = "M:N→MxN"
|
||||
return result
|
||||
|
||||
# 2. 混合匹配 (WS-PREV-KEY 存在)
|
||||
if 'WS-PREV-KEY' in src_upper:
|
||||
result["subtype"] = "混合"
|
||||
return result
|
||||
|
||||
# 3. 检查键变量命名模式
|
||||
key_vars = set(re.findall(r'WS-[\w-]*KEY[A-Z0-9-]*', src_upper))
|
||||
|
||||
# 不对称键名 → 1:N 或 N:1 (WS-MAST-KEY + WS-TRAN-KEY)
|
||||
has_master = any('MAST' in k for k in key_vars)
|
||||
has_tran = any('TRAN' in k for k in key_vars)
|
||||
if has_master and has_tran:
|
||||
result["subtype"] = "1:N"
|
||||
return result
|
||||
|
||||
# 4. 多个键名 → 多文件匹配 (M:N 模式)
|
||||
if len(key_vars) >= 3 and structure.get("file_count", 0) >= 2:
|
||||
result["subtype"] = "M:N"
|
||||
return result
|
||||
|
||||
# 5. 对称键名 → 默认为 1:1
|
||||
result["subtype"] = "1:1"
|
||||
return result
|
||||
|
||||
|
||||
def classify_program(cobol_source: str, llm: Any = None) -> dict:
|
||||
"""完整程序类型判定管道。
|
||||
@@ -419,20 +486,24 @@ def classify_program(cobol_source: str, llm: Any = None) -> dict:
|
||||
# 路径 A: keyword >= 90% -> 直接输出
|
||||
if max_keyword_confidence >= 0.90:
|
||||
logger.info("[pipeline] 路径 A: keyword 高确信度 (%.2f)", max_keyword_confidence)
|
||||
return _path_keyword_direct(keyword_info, structure)
|
||||
result = _path_keyword_direct(keyword_info, structure)
|
||||
|
||||
# 路径 B: keyword 50-89% -> 规则引擎
|
||||
if max_keyword_confidence >= 0.50:
|
||||
elif max_keyword_confidence >= 0.50:
|
||||
logger.info("[pipeline] 路径 B: keyword 中确信度 (%.2f) -> 规则引擎", max_keyword_confidence)
|
||||
return _path_rule_engine(keyword_info, structure)
|
||||
result = _path_rule_engine(keyword_info, structure)
|
||||
|
||||
# 路径 C: keyword < 50% -> LLM 辅助
|
||||
if llm is not None:
|
||||
elif llm is not None:
|
||||
logger.info("[pipeline] 路径 C: keyword 低确信度 (%.2f) -> LLM 辅助", max_keyword_confidence)
|
||||
return _path_llm_assisted(keyword_info, structure, llm)
|
||||
result = _path_llm_assisted(keyword_info, structure, llm)
|
||||
|
||||
# LLM 不可用: 使用规则引擎兜底
|
||||
logger.info("[pipeline] 路径 C(fallback): keyword 低确信度 (%.2f) -> 规则引擎兜底", max_keyword_confidence)
|
||||
result = _path_rule_engine(keyword_info, structure)
|
||||
result["method"] = "rule_engine_fallback"
|
||||
else:
|
||||
logger.info("[pipeline] 路径 C(fallback): keyword 低确信度 (%.2f) -> 规则引擎兜底", max_keyword_confidence)
|
||||
result = _path_rule_engine(keyword_info, structure)
|
||||
result["method"] = "rule_engine_fallback"
|
||||
|
||||
# ── 第 4 步: 匹配子类型区分(仅对匹配/键中断程序)──
|
||||
result = _resolve_matching_subtype(result, cobol_source, structure)
|
||||
return result
|
||||
|
||||
Reference in New Issue
Block a user