feat: structural matching detection — no KEY variable needed
Add _detect_matching_structure(): detection is based on the control-flow pattern, not on variable naming conventions.

It uses 5 structural signals:
1. READ + AT END + EOF pattern
2. PERFORM UNTIL with an EOF condition
3. ELSE body with a conditional READ (the matching core)
4. IF comparing hyphenated fields (cross-file comparison)
5. Multi-file OPEN INPUT

Confidence by number of signals matched: 5/5 → 0.55, 4/5 → 0.50, 3/5 → 0.40.

Real-world impact: matching programs whose key fields are named CUST-CODE and ORDR-CODE (no '-KEY' in the name) are now correctly detected.

Also:
- Rule engine type priority: main types (マッチング etc.) override secondary types (M:N, DIVIDE) when keyword confidence is low
- has_structural_match is injected into features so the rule engine can use it
- matching_vs_keybreak accepts equality IFs as matching evidence
- New test: test_structural_matching_no_keyword()

Regression: 764 passed (0 new failures).
This commit is contained in:
@@ -166,6 +166,13 @@ def _path_rule_engine(
|
||||
r'\b[A-Z]\d{0,2}-[\w-]*KEY\s*[=<>]', # K01-KEY =
|
||||
su
|
||||
))
|
||||
# 注入 has_structural_match: 结构性匹配检测的结果(不依赖变量名 KEY)
|
||||
# 当 detect_keyword 通过结构识别出匹配时,让规则引擎也能利用这个信号
|
||||
features["has_structural_match"] = bool(re.search(
|
||||
r'IF\s+\w+-\w+\s*[=<>]\s*\w+-\w+.*' # 跨文件字段比较
|
||||
r'(?:PERFORM|END-PERFORM|READ)', # 含循环/读取
|
||||
su, re.DOTALL
|
||||
))
|
||||
|
||||
# 2. 运行所有混淆组解析器
|
||||
resolved_types: dict[str, str] = {}
|
||||
@@ -205,19 +212,48 @@ def _path_rule_engine(
|
||||
final_category = keyword_info["category"]
|
||||
final_base_confidence = keyword_info["confidence"]
|
||||
|
||||
# 规则引擎结果优先级: 匹配检测 > 辅助推断
|
||||
# マッチング/項目チェック/キーブレイク/編集処理 是主类型,优先级高
|
||||
# M:N/DIVIDE 是辅助推断,仅当主类型未命中时才采纳
|
||||
_MAIN_TYPE_PRIORITY = {"マッチング", "項目チェック(重複含む)", "項目チェック(重複含まず)",
|
||||
"キーブレイク", "編集処理(校验)", "二段階マッチング",
|
||||
"単純マッチング", "混合マッチング", "CSV合并", "CSV拆分",
|
||||
"純粋マッチング"}
|
||||
|
||||
# 如果规则引擎有更高置信度的结果, 则采纳
|
||||
# 使用第一轮缓存的结果(M1: 消除冗余重复调用)
|
||||
best_resolved_type = None
|
||||
best_resolved_conf = 0.0
|
||||
best_is_main = False
|
||||
for pair_name, rtype in resolved_types.items():
|
||||
cached_conf = resolved_confidences.get(pair_name, 0.0)
|
||||
if cached_conf > best_resolved_conf:
|
||||
best_resolved_conf = cached_conf
|
||||
best_resolved_type = rtype
|
||||
is_main = rtype in _MAIN_TYPE_PRIORITY
|
||||
|
||||
if best_resolved_type and best_resolved_conf > final_base_confidence:
|
||||
final_category = best_resolved_type
|
||||
final_base_confidence = best_resolved_conf
|
||||
if best_resolved_type is None:
|
||||
best_resolved_type = rtype
|
||||
best_resolved_conf = cached_conf
|
||||
best_is_main = is_main
|
||||
elif is_main and not best_is_main:
|
||||
# 主类型覆盖非主类型(即使置信度略低)
|
||||
best_resolved_type = rtype
|
||||
best_resolved_conf = cached_conf
|
||||
best_is_main = True
|
||||
elif cached_conf > best_resolved_conf:
|
||||
best_resolved_type = rtype
|
||||
best_resolved_conf = cached_conf
|
||||
best_is_main = is_main
|
||||
|
||||
if best_resolved_type:
|
||||
final_is_main = final_category in _MAIN_TYPE_PRIORITY
|
||||
if best_resolved_conf > final_base_confidence:
|
||||
# 置信度更高 → 替换
|
||||
final_category = best_resolved_type
|
||||
final_base_confidence = best_resolved_conf
|
||||
elif best_is_main and not final_is_main and final_base_confidence < 0.40:
|
||||
# 主类型替代低确信度的非主类型(如 M:N→マッチング)
|
||||
# 但如果 keyword 已确定具体分类(如编码转换 0.85),不覆盖
|
||||
final_category = best_resolved_type
|
||||
final_base_confidence = max(final_base_confidence, best_resolved_conf)
|
||||
|
||||
# 5. 计算 4 因子确信度
|
||||
keyword_result_v2 = _build_keyword_result_for_v2(keyword_info)
|
||||
|
||||
Reference in New Issue
Block a user