""" HINA 程序分类器 — L1 关键字规则 + 确信度计算。 通过 COBOL 源码中的关键字匹配进行程序分类,支持多级确信度判定。 """ from __future__ import annotations from typing import Any # ── L1 规则 ────────────────────────────────────────────────────────────── # 格式: (分类名称, [关键字列表], 置信度阈值) L1_RULES: list[tuple[str, list[str], float]] = [ ("DB操作", ["EXEC SQL"], 0.95), ("子程序调用", ["CALL", "LINKAGE SECTION"], 0.90), ("IS INITIAL", ["IS INITIAL"], 0.99), ("SYSIN", ["SYSIN"], 0.90), ("编码转换", ["ALPHABETIC", "ASCII", "EBCDIC"], 0.85), ("online", ["DFHCOMMAREA", "MAP"], 0.95), ("SORT", ["SORT ON KEY"], 0.95), ("MERGE", ["MERGE ON KEY"], 0.95), ("编辑输出", ["WRITE AFTER", "WRITE BEFORE"], 0.80), ("文件编成", ["ORGANIZATION IS"], 0.99), ("替代索引", ["ALTERNATE RECORD KEY"], 0.99), ] # ── 冲突解决规则 ───────────────────────────────────────────────────────── # 当 L1 匹配到多个分类时的消歧策略: # value = "file_count" → 取测试数更多的分类 # value = "has_accumulator" → 取包含累加器的分类 CONFLICT_RULES: dict[tuple[str, str], str] = { ("マッチング", "キーブレイク"): "file_count", ("編集処理", "項目チェック"): "file_count", ("キーブレイク", "項目チェック(重複)"): "has_accumulator", } # ── 关键字检测 ─────────────────────────────────────────────────────────── def detect_keyword(source: str) -> list[tuple[str, float, str]]: """在 COBOL 源码中搜索 L1_RULES 定义的关键字,返回匹配结果。 Args: source: COBOL 程序源码文本。 Returns: list[tuple[str, float, str]]: 每个元素为 (分类名称, 置信度, 匹配到的关键字原文)。 """ results: list[tuple[str, float, str]] = [] source_upper = source.upper() for category, keywords, confidence in L1_RULES: for kw in keywords: if kw in source_upper: results.append((category, confidence, kw)) break # 同一分类只记录一次 return results # ── 确信度计算 ─────────────────────────────────────────────────────────── def compute_confidence( source: str, structure: dict[str, Any] | None = None, llm_result: dict[str, Any] | None = None, ) -> dict[str, Any]: """计算程序分类的确信度。 优先级: 1. L1 关键字命中,且最高置信度 >= 0.90 → 直接返回 L1 结果。 2. LLM 结果存在 → 使用 LLM 的分类结果。 3. 否则 → 返回 unknown。 Args: source: COBOL 程序源码文本。 structure: 可选的程序结构信息(暂未使用,保留扩展)。 llm_result: 可选的 LLM 分类结果。 预期格式: {"category": str, "confidence": float, ...} Returns: dict: - "category": str — 分类名称或 "unknown" - "confidence": float — 确信度 (0.0 ~ 1.0) - "source": str — 结果来源 ("l1" / "llm" / "unknown") - "matches": list — 匹配到的关键字详情 """ # ── 1. L1 关键字检测 ── matches = detect_keyword(source) # 找出最高置信度的 L1 匹配 if matches: best = max(matches, key=lambda m: m[1]) # (category, confidence, keyword) category, confidence, _ = best if confidence >= 0.90: return { "category": category, "confidence": confidence, "method": "keyword", "source": "l1", "features": [best[2]], "required_tests": [], "strategy_params": {"special_boundaries": [], "coverage_requirements": {"branch": 0.95, "paragraph": 1.0}}, "matches": matches, } # ── 2. LLM 结果 ── if llm_result is not None: llm_category = llm_result.get("category", "unknown") llm_confidence = llm_result.get("confidence", 0.0) return { "category": llm_category, "confidence": llm_confidence, "method": "hybrid", "source": "llm", "features": [], "required_tests": [], "strategy_params": {"special_boundaries": [], "coverage_requirements": {"branch": 0.95, "paragraph": 1.0}}, "matches": matches, } # ── 3. 未知 ── return { "category": "unknown", "confidence": 0.0, "method": "none", "source": "unknown", "features": [], "required_tests": [], "strategy_params": {"special_boundaries": [], "coverage_requirements": {"branch": 0.95, "paragraph": 1.0}}, "matches": [], }