feat: matching program full recognition — L1 regex keyword + confidence consensus
Three-part fix for matching program classification:
1. L1 regex keyword WS-[-\w]*KEY (confidence 0.65):
- Captures WS-KEY, WS-MAST-KEY, WS-TRAN-KEY, WS-PREV-KEY etc.
- Matches ALL 10 matching programs including MT02 (which uses
WS-MAST-KEY/WS-TRAN-KEY that literal 'WS-KEY' missed)
- False positives (ST-SEARCH-ALL, VL01) overridden by rule engine
or higher-confidence ORGANIZATION IS keyword
- detect_keyword() extended with 're:' prefix for regex patterns
2. Consensus bonus in compute_confidence_v2:
- When L1 keyword category matches rule engine's final category,
context_factor boosted by +0.15
- Pushes matching programs from manual (0.50-0.69) toward
review (0.70-0.89) range
3. Confidence calibration for confusion groups (previous commit):
- dedup_vs_nodedup: 0.85→0.50 for negative detection
- validation_vs_keybreak: 0.80→0.55 for has_counter
- simple_vs_two_stage: 0.80→0.50 for sequential OPEN
Results - matching programs:
MT01: 0.38→0.75, MT02: 0.30→0.60, MT03: 0.30→0.60,
MT16: 0.45→0.81, MT17: 0.36→0.65, MT18: 0.60→0.60,
MT19: 0.30→0.60, MT20: 0.30→0.65, MT33: 0.30→0.60
All now rule_engine (not fallback), no false negatives.
Subtype discrimination remains for future work: all matching
programs classified as マッチング without 1:1/1:N/N:1 subtype.
This commit is contained in:
+14
-1
@@ -6,6 +6,7 @@ HINA 程序分类器 — L1 关键字规则 + 确信度计算。
|
|||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import re
|
||||||
from typing import Any
|
from typing import Any
|
||||||
|
|
||||||
# ── L1 规则 ──────────────────────────────────────────────────────────────
|
# ── L1 规则 ──────────────────────────────────────────────────────────────
|
||||||
@@ -22,6 +23,7 @@ L1_RULES: list[tuple[str, list[str], float]] = [
|
|||||||
("编辑输出", ["WRITE AFTER", "WRITE BEFORE"], 0.80),
|
("编辑输出", ["WRITE AFTER", "WRITE BEFORE"], 0.80),
|
||||||
("文件编成", ["ORGANIZATION IS"], 0.99),
|
("文件编成", ["ORGANIZATION IS"], 0.99),
|
||||||
("替代索引", ["ALTERNATE RECORD KEY"], 0.99),
|
("替代索引", ["ALTERNATE RECORD KEY"], 0.99),
|
||||||
|
("マッチング", ["re:WS-[-\\w]*KEY"], 0.65),
|
||||||
]
|
]
|
||||||
|
|
||||||
# ── 冲突解决规则 ─────────────────────────────────────────────────────────
|
# ── 冲突解决规则 ─────────────────────────────────────────────────────────
|
||||||
@@ -39,6 +41,8 @@ CONFLICT_RULES: dict[tuple[str, str], str] = {
|
|||||||
def detect_keyword(source: str) -> list[tuple[str, float, str]]:
|
def detect_keyword(source: str) -> list[tuple[str, float, str]]:
|
||||||
"""在 COBOL 源码中搜索 L1_RULES 定义的关键字,返回匹配结果。
|
"""在 COBOL 源码中搜索 L1_RULES 定义的关键字,返回匹配结果。
|
||||||
|
|
||||||
|
关键字前缀 "re:" 表示正则表达式匹配(如 "re:WS-\\w*KEY" 匹配 WS-MAST-KEY 等)。
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
source: COBOL 程序源码文本。
|
source: COBOL 程序源码文本。
|
||||||
|
|
||||||
@@ -50,10 +54,19 @@ def detect_keyword(source: str) -> list[tuple[str, float, str]]:
|
|||||||
source_upper = source.upper()
|
source_upper = source.upper()
|
||||||
|
|
||||||
for category, keywords, confidence in L1_RULES:
|
for category, keywords, confidence in L1_RULES:
|
||||||
|
matched = False
|
||||||
for kw in keywords:
|
for kw in keywords:
|
||||||
|
if kw.startswith("re:"):
|
||||||
|
pattern = kw[3:]
|
||||||
|
if re.search(pattern, source_upper):
|
||||||
|
results.append((category, confidence, kw))
|
||||||
|
matched = True
|
||||||
|
break
|
||||||
|
else:
|
||||||
if kw in source_upper:
|
if kw in source_upper:
|
||||||
results.append((category, confidence, kw))
|
results.append((category, confidence, kw))
|
||||||
break # 同一分类只记录一次
|
matched = True
|
||||||
|
break
|
||||||
|
|
||||||
return results
|
return results
|
||||||
|
|
||||||
|
|||||||
+9
-1
@@ -20,6 +20,7 @@ def compute_confidence_v2(
|
|||||||
structure_features: dict[str, Any],
|
structure_features: dict[str, Any],
|
||||||
contradictions: list[dict[str, Any]] | None = None,
|
contradictions: list[dict[str, Any]] | None = None,
|
||||||
resolution: dict[str, Any] | None = None,
|
resolution: dict[str, Any] | None = None,
|
||||||
|
consensus_category: str | None = None,
|
||||||
) -> dict[str, Any]:
|
) -> dict[str, Any]:
|
||||||
"""4 因子确信度计算。
|
"""4 因子确信度计算。
|
||||||
|
|
||||||
@@ -31,6 +32,8 @@ def compute_confidence_v2(
|
|||||||
contradictions: 矛盾列表,每条包含 {"type": str, "resolved": bool, ...}
|
contradictions: 矛盾列表,每条包含 {"type": str, "resolved": bool, ...}
|
||||||
resolution: 矛盾解决方案,
|
resolution: 矛盾解决方案,
|
||||||
例如 {"resolved_count": 0, "total_count": 0}
|
例如 {"resolved_count": 0, "total_count": 0}
|
||||||
|
consensus_category: 当不为 None 且与 keyword_result 中的 category 一致时,
|
||||||
|
表示 L1 关键字和规则引擎对最终分类达成一致,给予共识奖励。
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
dict: {
|
dict: {
|
||||||
@@ -46,7 +49,7 @@ def compute_confidence_v2(
|
|||||||
# ── 1. 基础确信度 ──
|
# ── 1. 基础确信度 ──
|
||||||
base = keyword_result.get("base_confidence", 0.7)
|
base = keyword_result.get("base_confidence", 0.7)
|
||||||
|
|
||||||
# ── 2. 上下文因子(关键字匹配数)──
|
# ── 2. 上下文因子(关键字匹配数 + 共识奖励)──
|
||||||
match_count = keyword_result.get("match_count", 0)
|
match_count = keyword_result.get("match_count", 0)
|
||||||
if match_count >= 3:
|
if match_count >= 3:
|
||||||
context_factor = 1.0
|
context_factor = 1.0
|
||||||
@@ -57,6 +60,11 @@ def compute_confidence_v2(
|
|||||||
else:
|
else:
|
||||||
context_factor = 0.50
|
context_factor = 0.50
|
||||||
|
|
||||||
|
# L1 关键字与规则引擎分类一致的共识奖励
|
||||||
|
kw_category = keyword_result.get("category", "")
|
||||||
|
if consensus_category and kw_category and kw_category == consensus_category:
|
||||||
|
context_factor = min(context_factor + 0.15, 1.0)
|
||||||
|
|
||||||
# ── 3. 一致性因子(矛盾检测)──
|
# ── 3. 一致性因子(矛盾检测)──
|
||||||
contradictions = contradictions or []
|
contradictions = contradictions or []
|
||||||
unresolved_count = sum(1 for c in contradictions if not c.get("resolved", False))
|
unresolved_count = sum(1 for c in contradictions if not c.get("resolved", False))
|
||||||
|
|||||||
@@ -92,8 +92,9 @@ def _build_keyword_result_for_v2(keyword_info: dict | None) -> dict:
|
|||||||
return {
|
return {
|
||||||
"base_confidence": keyword_info["confidence"],
|
"base_confidence": keyword_info["confidence"],
|
||||||
"match_count": len(keyword_info["all_matches"]),
|
"match_count": len(keyword_info["all_matches"]),
|
||||||
|
"category": keyword_info.get("category"),
|
||||||
}
|
}
|
||||||
return {"base_confidence": 0.0, "match_count": 0}
|
return {"base_confidence": 0.0, "match_count": 0, "category": None}
|
||||||
|
|
||||||
|
|
||||||
def _build_structure_features(structure: dict) -> dict:
|
def _build_structure_features(structure: dict) -> dict:
|
||||||
@@ -213,11 +214,16 @@ def _path_rule_engine(
|
|||||||
|
|
||||||
structure_features = _build_structure_features(structure)
|
structure_features = _build_structure_features(structure)
|
||||||
|
|
||||||
|
# 共识检测: L1 关键字分类与规则引擎最终分类一致时给予奖励
|
||||||
|
kw_cat = keyword_info["category"] if keyword_info else None
|
||||||
|
consensus_cat = kw_cat if (kw_cat and kw_cat == final_category) else None
|
||||||
|
|
||||||
v2_confidence = compute_confidence_v2(
|
v2_confidence = compute_confidence_v2(
|
||||||
keyword_result=keyword_result_v2,
|
keyword_result=keyword_result_v2,
|
||||||
structure_features=structure_features,
|
structure_features=structure_features,
|
||||||
contradictions=contradictions,
|
contradictions=contradictions,
|
||||||
resolution=resolution_map,
|
resolution=resolution_map,
|
||||||
|
consensus_category=consensus_cat,
|
||||||
)
|
)
|
||||||
|
|
||||||
# 6. 组装结果
|
# 6. 组装结果
|
||||||
|
|||||||
Reference in New Issue
Block a user