# cobol-java-v3/hina/classifier.py

"""
HINA 程序分类器 — L1 关键字规则 + 确信度计算。

通过 COBOL 源码中的关键字匹配进行程序分类，支持多级确信度判定。
"""

from __future__ import annotations

import re
from typing import Any

# ── L1 rules ─────────────────────────────────────────────────────────────
# Each entry is (category name, [keywords], confidence).
#   * A keyword prefixed with "re:" is a regular expression; detect_keyword()
#     strips the prefix and runs re.search() on the upper-cased source.
#   * Any other keyword is tested as a plain substring of the upper-cased source.
#   * The float is the confidence detect_keyword() reports when the rule hits.
# Category names are runtime data and are intentionally left untranslated.
L1_RULES: list[tuple[str, list[str], float]] = [
    ("DB操作", ["EXEC SQL"], 0.95),
    ("子程序调用", ["CALL", "LINKAGE SECTION"], 0.90),
    ("IS INITIAL", ["IS INITIAL"], 0.99),
    ("SYSIN", ["SYSIN"], 0.90),
    ("编码转换", ["ALPHABETIC", "ASCII", "EBCDIC"], 0.85),
    ("online", ["DFHCOMMAREA", "MAP"], 0.95),
    ("SORT", ["re:SORT(?:\\s+\\S+)?\\s+ON\\s+(?:ASCENDING\\s+|DESCENDING\\s+)?KEY"], 0.95),
    ("MERGE", ["re:MERGE(?:\\s+\\S+)?\\s+ON\\s+(?:ASCENDING\\s+|DESCENDING\\s+)?KEY"], 0.95),
    ("替代索引", ["ALTERNATE RECORD KEY"], 0.99),
    ("编辑输出", ["re:WRITE\\s+\\S+\\s+AFTER\\s+", "re:WRITE\\s+\\S+\\s+BEFORE\\s+"], 0.80),
    ("文件编成", ["ORGANIZATION IS"], 0.99),
    ("マッチング", ["re:WS-[\\w-]*KEY"], 0.65),
    # KEY variables written without a hyphen: WSKEY, WSKEY1, WSKEYCD, etc.
    # (old-style COBOL naming)
    ("マッチング", ["re:WS[A-Z0-9]*KEY[A-Z0-9]*"], 0.65),
    # Legacy naming without the WS- prefix: K01-KEY, KS-KEY, MTCH-KEY, etc.
    # Low confidence; detect_keyword() additionally requires an actual KEY
    # comparison in the source before accepting any "マッチング" regex hit.
    ("マッチング", ["re:[A-Z]\\d{0,2}-\\w*KEY"], 0.55),
]

# ── Conflict-resolution rules ────────────────────────────────────────────
# Disambiguation strategy to use when L1 matches more than one category.
# The key is a pair of category names; the value names the strategy:
#   value = "file_count"       → pick the category with the larger number of
#                                tests (translated from the original note; the
#                                strategy name suggests a file count — TODO confirm)
#   value = "has_accumulator"  → pick the category that contains an accumulator
# NOTE(review): no code visible in this part of the file reads this table, and
# most of these category names are not produced by L1_RULES — presumably the
# consumer lives elsewhere; verify before changing keys or values.
CONFLICT_RULES: dict[tuple[str, str], str] = {
    ("マッチング", "キーブレイク"): "file_count",
    ("編集処理", "項目チェック"): "file_count",
    ("キーブレイク", "項目チェック(重複)"): "has_accumulator",
}


# ── 关键字检测 ───────────────────────────────────────────────────────────
def _strip_cobol_comments(source: str) -> str:
    """剥离 COBOL 注释，避免注释中的关键词触发 L1 匹配。

    处理两种注释:
    - 固定格式列 7: 行首 `*` (comment line)
    - 自由格式/内联: `*> ...` 到行尾
    """
    lines = source.split('\n')
    cleaned = []
    for line in lines:
        # 自由格式/内联注释: *>
        idx = line.find('*>')
        if idx >= 0:
            line = line[:idx]
        # 固定格式注释行: 如果第一个非空字符是 *
        stripped = line.strip()
        if stripped.startswith('*') and not stripped.startswith('*/'):
            continue  # 跳过整个注释行
        cleaned.append(line)
    return '\n'.join(cleaned)


def _matches_key_comparison(source_upper: str) -> bool:
    """检查源码中是否包含实际的 KEY 变量比较（而非仅声明）。

    匹配 KEY 变量在比较上下文中的使用:
      WS-KEY = / WS-KEY > / WS-KEY <
      IF WS-MAST-KEY
      KEY = WS-...
    """
    # 模式 1: KEY 变量出现在比较上下文中（= < > 后跟变量）
    # 注意: 不能用 \s 代替 [=<>]，否则「WS-KEY PIC」中的空格也会误匹配
    # 排除: 右边的 Figurative Constant (SPACES, ZERO, HIGH-VALUE 等)
    _figurative = r'(?:SPAC?E?S?|ZERO[S]?E?S?|HIGH[-\s]VALUE[S]?|LOW[-\s]VALUE[S]?|'
    _figurative += r'NULL[S]?|QUOTE[S]?|ALL\s+\'[^\']*\')'
    if re.search(r'(?:WS-[\w-]*KEY[A-Z0-9-]*|WS[A-Z0-9]*KEY[A-Z0-9]*)\s*[=<>]'
                 r'(?!\s*' + _figurative + r')', source_upper):
        return True
    # 模式 2: 非 WS- 前缀的 KEY 变量（旧式命名 K01-KEY 等）
    if re.search(r'\b[A-Z]\d{0,2}-[\w-]*KEY\s*[=<>]'
                 r'(?!\s*' + _figurative + r')', source_upper):
        return True
    # 模式 3: 源码中含有 READ INTO + KEY 变量
    if re.search(r'READ\s+\w+\s+INTO\s+\w+.*KEY', source_upper, re.DOTALL):
        return True
    return False


def _get_procedure_division(source_upper: str) -> str:
    """只提取 PROCEDURE DIVISION 部分用于关键词匹配。"""
    idx = source_upper.find('PROCEDURE DIVISION')
    if idx >= 0:
        return source_upper[idx:]
    return source_upper


def _detect_matching_structure(source_upper: str) -> float:
    """结构检测：不依赖变量名 KEY 的模式匹配检测。

    通过分析 COBOL 程序的控制流结构判断是否为匹配程序。
    返回确信度 0.0~0.55，0.0 表示不是匹配。

    匹配程序的结构性特征:
      信号 1: READ + AT END + EOF/WS-*E* 变量（文件读取循环）
      信号 2: PERFORM UNTIL + EOF/WS-*E* 变量（主循环）
      信号 3: ELSE 体内 READ（条件性读取——匹配核心）
      信号 4: IF 比较两个字段（跨文件字段比较，任何命名风格）
      信号 5: 2+ 文件 OPEN INPUT（多文件输入）
    """
    import re

    signals = 0

    # 信号 1: READ + AT END + 赋值（任何命名风格的 EOF 标志）
    # COBOL 匹配程序至少有一个 READ ... AT END MOVE ...
    # 匹配: READ F1 AT END MOVE 'Y' TO WS-EOF-A.
    # 匹配: READ F1 INTO R1 AT END MOVE 'Y' TO WS-END-1.
    # 匹配: READ F1 AT END MOVE 'Y' TO FE-1.
    if re.search(r'READ\s+\w+(?:\s+INTO\s+\w+)?\s+AT\s+END', source_upper):
        signals += 1

    # 信号 1b: 第二个 READ（匹配程序通常有 2 个 READ）
    reads = re.findall(r'\bREAD\s+\w+(?:\s+INTO\s+\w+)?', source_upper)
    if len(reads) >= 2:
        signals += 1

    # 信号 2: PERFORM UNTIL + 结束条件（EOF, E1, END-FLAG 等）
    if re.search(r'PERFORM\s+UNTIL\s+\w+[-A-Z0-9]*\s*=\s*[\'\"][YN]', source_upper):
        signals += 1

    # 信号 2b: GO TO 循环（LOOP〜EXIT-PGM/END）
    if (re.search(r'GO\s+TO\s+LOOP|GO\s+TO\s+[A-Z]*-L|[A-Z]*LP\b', source_upper) and
        re.search(r'IF\s+\w+.*=\s*[\'\"][YN]', source_upper)):
        signals += 1

    # 信号 3: ELSE 体内 READ（条件性读取——匹配核心）
    if re.search(r'ELSE\s+.*READ\s+', source_upper) or re.search(r'ELSE\s+\w+\s+READ\s+', source_upper):
        signals += 1

    # 信号 4: IF 比较两个不同变量（跨文件字段比较，任何命名风格）
    # K1 = K2 (简单名), CUST-CODE = ORDR-CODE (连字号), WS-KEY1 = WS-KEY2
    # 排除右侧为 figurative constant (SPACES, ZERO, HIGH-VALUE 等)
    _fig = r'(?:SPACES?|ZERO[S]?E?S?|HIGH[-\s]VALUE[S]?|LOW[-\s]VALUE[S]?|NULL[S]?|QUOTE[S]?)'
    if re.search(r'IF\s+\w[\w-]*\s*[=<>]\s+\w[\w-]*', source_upper) and \
       not re.search(r'IF\s+\w[\w-]*\s*[=<>]\s+' + _fig, source_upper):
        signals += 1

    # 信号 5: 2+ 文件 OPEN INPUT
    if (re.search(r'OPEN\s+INPUT\s+\w+\s+\w+', source_upper) or  # 同一行
        re.search(r'OPEN\s+INPUT\s+\w+[.\s].*OPEN\s+INPUT', source_upper)):  # 别行
        signals += 1

    # 确信度: 6 中 5+ = 0.55, 4 = 0.50, 3 = 0.40
    # 单文件程序（无多文件特征）降级确信度
    has_multi_file = bool(re.search(r'OPEN\s+INPUT\s+\w+\s+\w+', source_upper)) or \
                     len(re.findall(r'\bFD\s+\w+', source_upper)) >= 2 or \
                     len(re.findall(r'SELECT\s+\w+', source_upper)) >= 2
    if not has_multi_file:
        # 单文件: 仅当有明显键比较（非 figurative constant）时才保留低确信度
        _fig = r'(?:SPACES?|ZERO[S]?E?S?|HIGH[-\s]VALUE[S]?|LOW[-\s]VALUE[S]?)'
        has_real_key_cmp = bool(re.search(r'IF\s+\w[\w-]*\s*[=<>]\s+\w[\w-]*', source_upper)) and \
                          not bool(re.search(r'IF\s+\w[\w-]*\s*[=<>]\s+' + _fig, source_upper))
        if has_real_key_cmp and re.search(r'READ\s+\w+', source_upper):
            pass  # 有键比较+文件读取 → 可能是极简匹配，保留
        else:
            signals -= 2  # 无多文件特征 → 大幅降级
    if signals >= 5:
        return 0.55
    elif signals >= 4:
        return 0.50
    elif signals >= 3:
        return 0.40
    return 0.0


def detect_keyword(source: str) -> list[tuple[str, float, str]]:
    """Search COBOL source for the keywords defined in ``L1_RULES``.

    Steps:
      1. Strip comments so that keywords inside them cannot match.
      2. Upper-case the text and test each rule's keywords in order; the first
         keyword that hits yields one result for that rule.
      3. For the ``マッチング`` category, a regex hit is accepted only if a KEY
         variable is actually used in a comparison.
      4. Finally run the structural matching detector and, when it fires and
         no category other than ``マッチング`` was found, add its result.

    A keyword prefixed with ``"re:"`` is a regular expression; any other
    keyword is a plain substring test.

    NOTE(review): substring tests also hit inside longer words (e.g. "CALL"
    in "RECALL", "MAP" in "MAPPING") — confirm whether token-boundary matching
    is wanted before relying on the high-confidence rules.

    Args:
        source: COBOL program source text.

    Returns:
        A list of ``(category, confidence, matched keyword)`` tuples; the
        keyword is the rule text as written in ``L1_RULES`` (including any
        ``"re:"`` prefix), or ``"structural_matching"`` for the structural hit.
    """
    source_upper = _strip_cobol_comments(source).upper()

    results: list[tuple[str, float, str]] = []
    # The KEY-comparison check depends only on the source, so it is evaluated
    # lazily and at most once instead of once per マッチング rule.
    key_is_compared: bool | None = None

    for category, keywords, confidence in L1_RULES:
        for kw in keywords:
            if kw.startswith("re:"):
                if not re.search(kw[3:], source_upper):
                    continue
                # マッチング needs extra context: the KEY variable must take
                # part in a comparison, not merely be declared.
                if category == "マッチング":
                    if key_is_compared is None:
                        key_is_compared = _matches_key_comparison(source_upper)
                    if not key_is_compared:
                        continue
            elif kw not in source_upper:
                continue

            results.append((category, confidence, kw))
            break  # one result per rule: first hitting keyword wins

    # Structural matching detection (independent of KEY variable names).
    match_conf = _detect_matching_structure(source_upper)
    if match_conf > 0:
        has_more_specific = any(cat != "マッチング" for cat, _, _ in results)
        if not has_more_specific:
            results.append(("マッチング", match_conf, "structural_matching"))

    return results


# ── 确信度计算 ───────────────────────────────────────────────────────────
def _classification_result(
    category: str,
    confidence: Any,
    method: str,
    source: str,
    features: list[str],
    matches: list[tuple[str, float, str]],
) -> dict[str, Any]:
    """Build one classification result dict with freshly created containers."""
    return {
        "category": category,
        "confidence": confidence,
        "method": method,
        "source": source,
        "features": features,
        "required_tests": [],
        "strategy_params": {
            "special_boundaries": [],
            "coverage_requirements": {"branch": 0.95, "paragraph": 1.0},
        },
        "matches": matches,
    }


def compute_confidence(
    source: str,
    structure: dict[str, Any] | None = None,
    llm_result: dict[str, Any] | None = None,
) -> dict[str, Any]:
    """Compute the classification and its confidence for a COBOL program.

    Priority:
      1. An L1 keyword hit whose best confidence is >= 0.90 is returned
         directly.
      2. Otherwise, if an LLM result is supplied, its category and confidence
         are used.
      3. Otherwise the program is reported as ``"unknown"``.

    Args:
        source: COBOL program source text.
        structure: Optional program structure information (currently unused,
            kept for future extension).
        llm_result: Optional LLM classification result; ``"category"`` and
            ``"confidence"`` are read, defaulting to ``"unknown"`` and ``0.0``.

    Returns:
        dict with the keys:
            - "category": str — category name or "unknown"
            - "confidence": float — confidence (0.0 for "unknown")
            - "method": str — "keyword" / "hybrid" / "none"
            - "source": str — origin of the result ("l1" / "llm" / "unknown")
            - "features": list — the winning keyword for an L1 result, else empty
            - "required_tests": list — always empty here
            - "strategy_params": dict — default boundaries and coverage targets
            - "matches": list — every L1 match found, including hits below the
              0.90 threshold, so callers can still inspect them when the
              result is "unknown"
    """
    # 1. L1 keyword detection: take the highest-confidence match.
    matches = detect_keyword(source)
    if matches:
        best = max(matches, key=lambda m: m[1])  # (category, confidence, keyword)
        category, confidence, keyword = best
        if confidence >= 0.90:
            return _classification_result(
                category, confidence, "keyword", "l1", [keyword], matches
            )

    # 2. LLM result, when provided.
    if llm_result is not None:
        return _classification_result(
            llm_result.get("category", "unknown"),
            llm_result.get("confidence", 0.0),
            "hybrid",
            "llm",
            [],
            matches,
        )

    # 3. Unknown — the sub-threshold L1 matches are still reported.
    return _classification_result("unknown", 0.0, "none", "unknown", [], matches)