From 65e991993399a7c81edb39aa3e181faf85205a8a Mon Sep 17 00:00:00 2001 From: NB-076 Date: Sun, 21 Jun 2026 13:25:39 +0800 Subject: [PATCH] =?UTF-8?q?feat:=20matching=20program=20full=20recognition?= =?UTF-8?q?=20=E2=80=94=20L1=20regex=20keyword=20+=20confidence=20consensu?= =?UTF-8?q?s?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three-part fix for matching program classification: 1. L1 regex keyword WS-[-\w]*KEY (confidence 0.65): - Captures WS-KEY, WS-MAST-KEY, WS-TRAN-KEY, WS-PREV-KEY etc. - Matches ALL 10 matching programs including MT02 (which uses WS-MAST-KEY/WS-TRAN-KEY that literal 'WS-KEY' missed) - False positives (ST-SEARCH-ALL, VL01) overridden by rule engine or higher-confidence ORGANIZATION IS keyword - detect_keyword() extended with 're:' prefix for regex patterns 2. Consensus bonus in compute_confidence_v2: - When L1 keyword category matches rule engine's final category, context_factor boosted by +0.15 - Pushes matching programs from manual (0.50-0.69) toward review (0.70-0.89) range 3. Confidence calibration for confusion groups (previous commit): - dedup_vs_nodedup: 0.85→0.50 for negative detection - validation_vs_keybreak: 0.80→0.55 for has_counter - simple_vs_two_stage: 0.80→0.50 for sequential OPEN Results - matching programs: MT01: 0.38→0.75, MT02: 0.30→0.60, MT03: 0.30→0.60, MT16: 0.45→0.81, MT17: 0.36→0.65, MT18: 0.60→0.60, MT19: 0.30→0.60, MT20: 0.30→0.65, MT33: 0.30→0.60 All now rule_engine (not fallback), no false negatives. Subtype discrimination remains for future work: all matching programs classified as マッチング without 1:1/1:N/N:1 subtype. --- hina/classifier.py | 19 ++++++++++++++++--- hina/confidence.py | 10 +++++++++- hina/pipeline/pipeline.py | 8 +++++++- 3 files changed, 32 insertions(+), 5 deletions(-) diff --git a/hina/classifier.py b/hina/classifier.py index 39b6416..75c05e3 100644 --- a/hina/classifier.py +++ b/hina/classifier.py @@ -6,6 +6,7 @@ HINA 程序分类器 — L1 关键字规则 + 确信度计算。 from __future__ import annotations +import re from typing import Any # ── L1 规则 ────────────────────────────────────────────────────────────── @@ -22,6 +23,7 @@ L1_RULES: list[tuple[str, list[str], float]] = [ ("编辑输出", ["WRITE AFTER", "WRITE BEFORE"], 0.80), ("文件编成", ["ORGANIZATION IS"], 0.99), ("替代索引", ["ALTERNATE RECORD KEY"], 0.99), + ("マッチング", ["re:WS-[-\\w]*KEY"], 0.65), ] # ── 冲突解决规则 ───────────────────────────────────────────────────────── @@ -39,6 +41,8 @@ CONFLICT_RULES: dict[tuple[str, str], str] = { def detect_keyword(source: str) -> list[tuple[str, float, str]]: """在 COBOL 源码中搜索 L1_RULES 定义的关键字,返回匹配结果。 + 关键字前缀 "re:" 表示正则表达式匹配(如 "re:WS-\\w*KEY" 匹配 WS-MAST-KEY 等)。 + Args: source: COBOL 程序源码文本。 @@ -50,10 +54,19 @@ def detect_keyword(source: str) -> list[tuple[str, float, str]]: source_upper = source.upper() for category, keywords, confidence in L1_RULES: + matched = False for kw in keywords: - if kw in source_upper: - results.append((category, confidence, kw)) - break # 同一分类只记录一次 + if kw.startswith("re:"): + pattern = kw[3:] + if re.search(pattern, source_upper): + results.append((category, confidence, kw)) + matched = True + break + else: + if kw in source_upper: + results.append((category, confidence, kw)) + matched = True + break return results diff --git a/hina/confidence.py b/hina/confidence.py index 5453c75..f138cf1 100644 --- a/hina/confidence.py +++ b/hina/confidence.py @@ -20,6 +20,7 @@ def compute_confidence_v2( structure_features: dict[str, Any], contradictions: list[dict[str, Any]] | None = None, resolution: dict[str, Any] | None = None, + consensus_category: str | None = None, ) -> dict[str, Any]: """4 因子确信度计算。 @@ -31,6 +32,8 @@ def compute_confidence_v2( contradictions: 矛盾列表,每条包含 {"type": str, "resolved": bool, ...} resolution: 矛盾解决方案, 例如 {"resolved_count": 0, "total_count": 0} + consensus_category: 当不为 None 且与 keyword_result 中的 category 一致时, + 表示 L1 关键字和规则引擎对最终分类达成一致,给予共识奖励。 Returns: dict: { @@ -46,7 +49,7 @@ def compute_confidence_v2( # ── 1. 基础确信度 ── base = keyword_result.get("base_confidence", 0.7) - # ── 2. 上下文因子(关键字匹配数)── + # ── 2. 上下文因子(关键字匹配数 + 共识奖励)── match_count = keyword_result.get("match_count", 0) if match_count >= 3: context_factor = 1.0 @@ -57,6 +60,11 @@ def compute_confidence_v2( else: context_factor = 0.50 + # L1 关键字与规则引擎分类一致的共识奖励 + kw_category = keyword_result.get("category", "") + if consensus_category and kw_category and kw_category == consensus_category: + context_factor = min(context_factor + 0.15, 1.0) + # ── 3. 一致性因子(矛盾检测)── contradictions = contradictions or [] unresolved_count = sum(1 for c in contradictions if not c.get("resolved", False)) diff --git a/hina/pipeline/pipeline.py b/hina/pipeline/pipeline.py index 99a1056..378324b 100644 --- a/hina/pipeline/pipeline.py +++ b/hina/pipeline/pipeline.py @@ -92,8 +92,9 @@ def _build_keyword_result_for_v2(keyword_info: dict | None) -> dict: return { "base_confidence": keyword_info["confidence"], "match_count": len(keyword_info["all_matches"]), + "category": keyword_info.get("category"), } - return {"base_confidence": 0.0, "match_count": 0} + return {"base_confidence": 0.0, "match_count": 0, "category": None} def _build_structure_features(structure: dict) -> dict: @@ -213,11 +214,16 @@ def _path_rule_engine( structure_features = _build_structure_features(structure) + # 共识检测: L1 关键字分类与规则引擎最终分类一致时给予奖励 + kw_cat = keyword_info["category"] if keyword_info else None + consensus_cat = kw_cat if (kw_cat and kw_cat == final_category) else None + v2_confidence = compute_confidence_v2( keyword_result=keyword_result_v2, structure_features=structure_features, contradictions=contradictions, resolution=resolution_map, + consensus_category=consensus_cat, ) # 6. 组装结果