# cobol-java-v3/hina/classifier.py

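"""
HINA program classifier — L1 keyword rules plus confidence computation.

Classifies a program by matching keywords in its COBOL source and supports
multi-level confidence decisions.
"""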

from __future__ import annotations

import re
from typing import Any

# ── L1 rules ─────────────────────────────────────────────────────────────
# Each entry: (category name, [keywords], confidence).
#   * A keyword prefixed with "re:" is a regular expression; every other
#     keyword is matched as a plain substring (see detect_keyword).
#   * Matching is done against the upper-cased source, so keywords and
#     patterns are written in upper case.
#   * The third field is the confidence reported when any keyword of the
#     entry is found; compute_confidence compares it against its own cut-off.
L1_RULES: list[tuple[str, list[str], float]] = [
    ("DB操作", ["EXEC SQL"], 0.95),
    ("子程序调用", ["CALL", "LINKAGE SECTION"], 0.90),
    ("IS INITIAL", ["IS INITIAL"], 0.99),
    ("SYSIN", ["SYSIN"], 0.90),
    ("编码转换", ["ALPHABETIC", "ASCII", "EBCDIC"], 0.85),
    ("online", ["DFHCOMMAREA", "MAP"], 0.95),
    ("SORT", ["SORT ON KEY"], 0.95),
    ("MERGE", ["MERGE ON KEY"], 0.95),
    ("编辑输出", ["WRITE AFTER", "WRITE BEFORE"], 0.80),
    ("文件编成", ["ORGANIZATION IS"], 0.99),
    ("替代索引", ["ALTERNATE RECORD KEY"], 0.99),
    ("マッチング", ["re:WS-[-\\w]*KEY"], 0.65),
]

# ── Conflict-resolution rules ────────────────────────────────────────────
# Disambiguation strategy for when L1 matching yields more than one category,
# keyed by the pair of conflicting category names:
#   value = "file_count"         → pick the category with the larger test count
#   value = "has_accumulator"    → pick the category that contains an accumulator
# NOTE(review): nothing in the visible part of this file reads this table —
# confirm where it is consumed. The "file_count" description mentions a test
# count rather than a file count; verify which one is intended.
CONFLICT_RULES: dict[tuple[str, str], str] = {
    ("マッチング", "キーブレイク"): "file_count",
    ("編集処理", "項目チェック"): "file_count",
    ("キーブレイク", "項目チェック(重複)"): "has_accumulator",
}


# ── Keyword detection ────────────────────────────────────────────────────
def detect_keyword(source: str) -> list[tuple[str, float, str]]:
    """Search COBOL source for the keywords defined in L1_RULES.

    The source is upper-cased once and every keyword is tested against that
    upper-cased text. A keyword starting with "re:" is treated as a regular
    expression (e.g. "re:WS-[-\\w]*KEY" matches WS-MAST-KEY); any other
    keyword is a plain substring test.

    Each rule contributes at most one result: the first of its keywords, in
    list order, that is found.

    Args:
        source: COBOL program source text.

    Returns:
        One (category, confidence, keyword) tuple per matching rule, in
        L1_RULES order. The keyword is the rule's keyword string as written
        in L1_RULES (including any "re:" prefix), not the matched source text.
    """
    source_upper = source.upper()

    def _is_present(keyword: str) -> bool:
        # "re:" marks a regex rule; everything after the prefix is the pattern.
        if keyword.startswith("re:"):
            return re.search(keyword[3:], source_upper) is not None
        return keyword in source_upper

    results: list[tuple[str, float, str]] = []
    for category, keywords, confidence in L1_RULES:
        # Only the first matching keyword of a rule is reported.
        first_hit = next((kw for kw in keywords if _is_present(kw)), None)
        if first_hit is not None:
            results.append((category, confidence, first_hit))

    return results


# ── Confidence computation ───────────────────────────────────────────────
def _build_result(
    category: str,
    confidence: float,
    method: str,
    source: str,
    features: list[str],
    matches: list[tuple[str, float, str]],
) -> dict[str, Any]:
    """Assemble the result dict shared by every compute_confidence branch."""
    # Nested containers are created fresh on each call so that callers can
    # mutate one result without affecting another.
    return {
        "category": category,
        "confidence": confidence,
        "method": method,
        "source": source,
        "features": features,
        "required_tests": [],
        "strategy_params": {
            "special_boundaries": [],
            "coverage_requirements": {"branch": 0.95, "paragraph": 1.0},
        },
        "matches": matches,
    }


def compute_confidence(
    source: str,
    structure: dict[str, Any] | None = None,
    llm_result: dict[str, Any] | None = None,
) -> dict[str, Any]:
    """Compute the classification confidence for a program.

    Priority:
      1. An L1 keyword hit whose highest confidence is >= 0.90 → return the
         L1 result directly.
      2. Otherwise, if an LLM result is supplied → use the LLM's category
         and confidence.
      3. Otherwise → return "unknown".

    Args:
        source: COBOL program source text.
        structure: Optional program structure information (currently unused,
            kept for future extension).
        llm_result: Optional LLM classification result. Expected shape:
            {"category": str, "confidence": float, ...}; missing keys fall
            back to "unknown" and 0.0.

    Returns:
        dict with the keys:
            - "category": category name, or "unknown".
            - "confidence": confidence value (0.0 for "unknown").
            - "method": "keyword", "hybrid" or "none".
            - "source": origin of the result ("l1" / "llm" / "unknown").
            - "features": the deciding keyword for an L1 result, else empty.
            - "required_tests": always an empty list here.
            - "strategy_params": default boundary/coverage parameters.
            - "matches": every L1 keyword hit from detect_keyword, including
              hits that were too weak to decide the category.
    """
    # ── 1. L1 keyword detection ──
    matches = detect_keyword(source)

    if matches:
        # max() keeps the first entry on ties, i.e. the earliest rule wins.
        category, confidence, keyword = max(matches, key=lambda m: m[1])
        if confidence >= 0.90:
            return _build_result(
                category, confidence, "keyword", "l1", [keyword], matches
            )

    # ── 2. LLM result ──
    if llm_result is not None:
        return _build_result(
            llm_result.get("category", "unknown"),
            llm_result.get("confidence", 0.0),
            "hybrid",
            "llm",
            [],
            matches,
        )

    # ── 3. Unknown ──
    # Low-confidence L1 hits are still reported in "matches" (as in the LLM
    # branch) instead of being discarded.
    return _build_result("unknown", 0.0, "none", "unknown", [], matches)