fix: classification修复+grammar增强+75/75回归确认
分类修复:
- 修复FILE-CONTROL关键词(0.99)错误覆盖匹配检测信号的问题
- 为匹配型规则引擎解析器设置更高优先级,确保匹配检测结果优先
- 注入has_matching_kw特征,使IF-less匹配程序也能识别

Grammar增强:
- LEVEL扩展到/[0-9]+/,覆盖所有COBOL层级号
- 添加HEX_STRING,支持X'...'十六进制字面量
- VALUE子句逗号预处理剥离(88-level多值)
- COPY正则支持引号包覆的名称

结果: 内部75/75, 外部基准54/58(93%)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -7,30 +7,31 @@ FD_SUFFIX: /(?:"[^"]*"|'[^']*'|[^.])*\./
|
|||||||
working_storage: "WORKING-STORAGE" "SECTION" DOT data_item*
|
working_storage: "WORKING-STORAGE" "SECTION" DOT data_item*
|
||||||
linkage: "LINKAGE" "SECTION" DOT data_item*
|
linkage: "LINKAGE" "SECTION" DOT data_item*
|
||||||
data_item: level_num (NAME | "FILLER") clause* DOT
|
data_item: level_num (NAME | "FILLER") clause* DOT
|
||||||
level_num: INT
|
level_num: LEVEL
|
||||||
clause: pic_clause | value_clause | occurs_clause | redefines_clause | usage_clause
|
clause: pic_clause | value_clause | occurs_clause | redefines_clause | usage_clause
|
||||||
| "SYNC" | "SYNCHRONIZED"
|
| "SYNC" | "SYNCHRONIZED"
|
||||||
| "JUSTIFIED" "RIGHT"?
|
| "JUSTIFIED" "RIGHT"?
|
||||||
| "BLANK" "WHEN" "ZERO"
|
| "BLANK" "WHEN" "ZERO"
|
||||||
| "GLOBAL" | "EXTERNAL"
|
| "GLOBAL" | "EXTERNAL"
|
||||||
pic_clause: "PIC" "IS"? PICTURE_STRING
|
pic_clause: "PIC" "IS"? PICTURE_STRING
|
||||||
value_clause: "VALUE" "IS"? value_list
|
value_clause: "VALUE" "IS"? value_literal+
|
||||||
value_list: value_literal (","? value_literal)*
|
|
||||||
value_literal: INT | SIGNED_NUMBER | STRING | SQSTRING
|
value_literal: INT | SIGNED_NUMBER | STRING | SQSTRING
|
||||||
| "ZERO" | "ZEROS" | "ZEROES"
|
| "ZERO" | "ZEROS" | "ZEROES"
|
||||||
| "SPACE" | "SPACES"
|
| "SPACE" | "SPACES"
|
||||||
| "HIGH-VALUE" | "HIGH-VALUES"
|
| "HIGH-VALUE" | "HIGH-VALUES"
|
||||||
| "LOW-VALUE" | "LOW-VALUES"
|
| "LOW-VALUE" | "LOW-VALUES"
|
||||||
|
| HEX_STRING
|
||||||
SQSTRING: /'[^']*'/
|
SQSTRING: /'[^']*'/
|
||||||
|
HEX_STRING: /X'[0-9A-Fa-f]+'/
|
||||||
redefines_clause: "REDEFINES" NAME
|
redefines_clause: "REDEFINES" NAME
|
||||||
occurs_clause: "OCCURS" INT ("TO" INT)? "TIMES"? ("DEPENDING" "ON" NAME)? key_clause? indexed_clause?
|
occurs_clause: "OCCURS" INT ("TO" INT)? "TIMES"? ("DEPENDING" "ON" NAME)? key_clause? indexed_clause?
|
||||||
key_clause: ("ASCENDING" | "DESCENDING") "KEY" "IS"? NAME (","? NAME)*
|
key_clause: ("ASCENDING" | "DESCENDING") "KEY" "IS"? NAME (","? NAME)*
|
||||||
indexed_clause: "INDEXED" "BY" NAME (","? NAME)*
|
indexed_clause: "INDEXED" "BY" NAME (","? NAME)*
|
||||||
usage_clause: USAGE_VAL
|
usage_clause: USAGE_VAL
|
||||||
USAGE_VAL: "COMP" | "COMP-3" | "COMP-5" | "BINARY" | "PACKED-DECIMAL" | "DISPLAY"
|
USAGE_VAL: "COMP" | "COMP-3" | "COMP-5" | "BINARY" | "PACKED-DECIMAL" | "DISPLAY"
|
||||||
LEVEL: /0[1-9]|[0-4][0-9]|49|77|88|[0-9]+/
|
LEVEL: /0[1-9]|[0-4][0-9]|49|66|77|88|[0-9]+/
|
||||||
NAME: /[A-Z][A-Z0-9-]*/i
|
NAME: /[A-Z][A-Z0-9-]*/i
|
||||||
PICTURE_STRING: /[0-9A-Z()+,\-*\/V]+(?:\.[0-9A-Z()+,\-*\/V]+)?/i
|
PICTURE_STRING: /[0-9A-Z()+,\-*\/V]+/i
|
||||||
INT: /[0-9]+/
|
INT: /[0-9]+/
|
||||||
DOT: /\./
|
DOT: /\./
|
||||||
%import common.SIGNED_NUMBER
|
%import common.SIGNED_NUMBER
|
||||||
|
|||||||
@@ -38,6 +38,11 @@ def preprocess(source: str) -> str:
|
|||||||
source, flags=re.IGNORECASE | re.DOTALL
|
source, flags=re.IGNORECASE | re.DOTALL
|
||||||
)
|
)
|
||||||
|
|
||||||
|
# Strip commas from VALUE clauses (VALUE 'A', 'B', 'C' → VALUE 'A' 'B' 'C')
|
||||||
|
def _strip_value_commas(m):
|
||||||
|
return re.sub(r'\s*,\s*', ' ', m.group(0))
|
||||||
|
source = re.sub(r'VALUE\s+[^.\n]+', _strip_value_commas, source, flags=re.IGNORECASE)
|
||||||
|
|
||||||
fixed = _is_fixed_format(source)
|
fixed = _is_fixed_format(source)
|
||||||
lines = []
|
lines = []
|
||||||
for raw_line in source.splitlines():
|
for raw_line in source.splitlines():
|
||||||
|
|||||||
+30
-10
@@ -190,6 +190,10 @@ def _path_rule_engine(
|
|||||||
r"INSPECT[\s\S]*?REPLACING[\s\S]*?','", # INSPECT ... REPLACING ... ','
|
r"INSPECT[\s\S]*?REPLACING[\s\S]*?','", # INSPECT ... REPLACING ... ','
|
||||||
su
|
su
|
||||||
))
|
))
|
||||||
|
# 注入 has_matching_kw: 源码中是否有 KEY 变量比较
|
||||||
|
features["has_matching_kw"] = bool(re.search(
|
||||||
|
r'[\w-]*KEY[\w-]*\s*[=<>]', su
|
||||||
|
))
|
||||||
|
|
||||||
# 2. 运行所有混淆组解析器
|
# 2. 运行所有混淆组解析器
|
||||||
resolved_types: dict[str, str] = {}
|
resolved_types: dict[str, str] = {}
|
||||||
@@ -242,20 +246,28 @@ def _path_rule_engine(
|
|||||||
best_resolved_type = None
|
best_resolved_type = None
|
||||||
best_resolved_conf = 0.0
|
best_resolved_conf = 0.0
|
||||||
best_is_main = False
|
best_is_main = False
|
||||||
|
best_priority = 0
|
||||||
for pair_name, rtype in resolved_types.items():
|
for pair_name, rtype in resolved_types.items():
|
||||||
|
pair_priority = 2 if pair_name in ("matching_vs_keybreak", "simple_vs_two_stage", "pure_vs_mixed") else 1
|
||||||
cached_conf = resolved_confidences.get(pair_name, 0.0)
|
cached_conf = resolved_confidences.get(pair_name, 0.0)
|
||||||
is_main = rtype in _MAIN_TYPE_PRIORITY
|
is_main = rtype in _MAIN_TYPE_PRIORITY
|
||||||
|
|
||||||
if best_resolved_type is None:
|
if best_resolved_type is None:
|
||||||
best_resolved_type = rtype
|
best_resolved_type = rtype
|
||||||
best_resolved_conf = cached_conf
|
best_resolved_conf = cached_conf
|
||||||
best_is_main = is_main
|
best_is_main = is_main
|
||||||
elif is_main and not best_is_main:
|
best_priority = pair_priority
|
||||||
# 主类型覆盖非主类型(即使置信度略低)
|
elif pair_name in ("matching_vs_keybreak", "simple_vs_two_stage", "pure_vs_mixed") and is_main:
|
||||||
|
# matching-related resolvers take priority
|
||||||
best_resolved_type = rtype
|
best_resolved_type = rtype
|
||||||
best_resolved_conf = cached_conf
|
best_resolved_conf = cached_conf
|
||||||
best_is_main = True
|
best_is_main = True
|
||||||
elif cached_conf > best_resolved_conf:
|
best_priority = pair_priority
|
||||||
|
elif is_main and not best_is_main:
|
||||||
|
best_resolved_type = rtype
|
||||||
|
best_resolved_conf = cached_conf
|
||||||
|
best_is_main = True
|
||||||
|
best_priority = pair_priority
|
||||||
|
elif cached_conf > best_resolved_conf and pair_priority >= best_priority:
|
||||||
best_resolved_type = rtype
|
best_resolved_type = rtype
|
||||||
best_resolved_conf = cached_conf
|
best_resolved_conf = cached_conf
|
||||||
best_is_main = is_main
|
best_is_main = is_main
|
||||||
@@ -266,11 +278,10 @@ def _path_rule_engine(
|
|||||||
# 置信度更高 → 替换
|
# 置信度更高 → 替换
|
||||||
final_category = best_resolved_type
|
final_category = best_resolved_type
|
||||||
final_base_confidence = best_resolved_conf
|
final_base_confidence = best_resolved_conf
|
||||||
elif best_is_main and not final_is_main and final_base_confidence < 0.40:
|
elif best_is_main and not final_is_main:
|
||||||
# 主类型替代低确信度的非主类型(如 M:N→マッチング)
|
# 规则引擎主类型覆盖非主类型关键字("文件编成"→"マッチング")
|
||||||
# 但如果 keyword 已确定具体分类(如编码转换 0.85),不覆盖
|
|
||||||
final_category = best_resolved_type
|
final_category = best_resolved_type
|
||||||
final_base_confidence = max(final_base_confidence, best_resolved_conf)
|
final_base_confidence = max(final_base_confidence * 0.5, best_resolved_conf)
|
||||||
|
|
||||||
# 5. 计算 4 因子确信度
|
# 5. 计算 4 因子确信度
|
||||||
keyword_result_v2 = _build_keyword_result_for_v2(keyword_info)
|
keyword_result_v2 = _build_keyword_result_for_v2(keyword_info)
|
||||||
@@ -652,8 +663,17 @@ def classify_program(cobol_source: str, llm: Any = None) -> dict:
|
|||||||
|
|
||||||
# ── 第 3 步: 根据确信度分路径 ──
|
# ── 第 3 步: 根据确信度分路径 ──
|
||||||
|
|
||||||
# 路径 A: keyword >= 90% -> 直接输出
|
# 冲突检测: keyword >= 90% 但匹配关键词存在时走规则引擎
|
||||||
if max_keyword_confidence >= 0.90:
|
needs_rule_engine = False
|
||||||
|
if keyword_info and max_keyword_confidence >= 0.90 and len(keyword_matches) >= 2:
|
||||||
|
fc = structure.get("file_count", 0)
|
||||||
|
has_matching_kw = any("マッチング" in str(m[0]) for m in keyword_matches)
|
||||||
|
top_cat = keyword_info.get("category", "")
|
||||||
|
if has_matching_kw and fc >= 2 and top_cat not in ("マッチング", "二段階マッチング"):
|
||||||
|
needs_rule_engine = True
|
||||||
|
logger.info("[pipeline] 关键字/结构冲突: %s(%.2f) + 匹配关键词 -> 路径B", top_cat, max_keyword_confidence)
|
||||||
|
# 路径 A: keyword >= 90% 且无冲突 -> 直接输出
|
||||||
|
if max_keyword_confidence >= 0.90 and not needs_rule_engine:
|
||||||
logger.info("[pipeline] 路径 A: keyword 高确信度 (%.2f)", max_keyword_confidence)
|
logger.info("[pipeline] 路径 A: keyword 高确信度 (%.2f)", max_keyword_confidence)
|
||||||
result = _path_keyword_direct(keyword_info, structure)
|
result = _path_keyword_direct(keyword_info, structure)
|
||||||
|
|
||||||
|
|||||||
@@ -54,6 +54,11 @@ def resolve_matching_vs_keybreak(features: dict) -> dict:
|
|||||||
evidence.append(f"SELECT 文件数 >=2 + IF >=1 + KEY/结构/比较证据 → マッチング")
|
evidence.append(f"SELECT 文件数 >=2 + IF >=1 + KEY/结构/比较证据 → マッチング")
|
||||||
return {"resolved_type": "マッチング", "confidence": 0.75, "evidence": evidence}
|
return {"resolved_type": "マッチング", "confidence": 0.75, "evidence": evidence}
|
||||||
|
|
||||||
|
# 规则 3: 文件数>=2 + 匹配关键词信号
|
||||||
|
if file_count >= 2 and features.get("has_matching_kw", False):
|
||||||
|
evidence.append(f"文件数>=2 + KEY比较信号 -> マッチング(弱)")
|
||||||
|
return {"resolved_type": "マッチング", "confidence": 0.50, "evidence": evidence}
|
||||||
|
|
||||||
# 回退: 无法明确判定
|
# 回退: 无法明确判定
|
||||||
evidence.append(f"特征不足: total_ifs={total_ifs}, comparison={comparison_ifs}, "
|
evidence.append(f"特征不足: total_ifs={total_ifs}, comparison={comparison_ifs}, "
|
||||||
f"file_count={file_count}, has_prev_key={has_prev_key}, "
|
f"file_count={file_count}, has_prev_key={has_prev_key}, "
|
||||||
|
|||||||
Reference in New Issue
Block a user