# cobol-java-v3/tests/hina/test_classifier_deep.py

"""Deep classifier tests: keyword detection, confidence boundaries, edge cases"""

import sys, os
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))

from hina.classifier import detect_keyword, compute_confidence


# ── 1. detect_keyword with SQL + SORT + CALL all present ──

def test_detect_keyword_multiple_matches():
    """A program containing EXEC SQL, SORT and CALL yields one hit per category with the expected confidence."""
    source = """
       IDENTIFICATION DIVISION.
       PROGRAM-ID. TESTPGM.
       DATA DIVISION.
       WORKING-STORAGE SECTION.
       01 WS-A PIC X(100).
       PROCEDURE DIVISION.
           EXEC SQL
               SELECT * FROM TABLE
           END-EXEC.
           SORT SORT-FILE ON KEY WS-KEY.
           CALL 'SUBPGM'.
           STOP RUN.
    """
    hits = detect_keyword(source)

    # Element 0 of every hit is its category; all three must be reported.
    found = set()
    for hit in hits:
        found.add(hit[0])
    for wanted in ("DB操作", "SORT", "子程序调用"):
        assert wanted in found

    # Index the remaining fields by category: element 1 is compared against
    # the confidence, element 2 against the keyword (or pattern) that matched.
    details = {}
    for hit in hits:
        details[hit[0]] = (hit[1], hit[2])

    db_confidence, db_keyword = details["DB操作"]
    assert db_confidence == 0.95
    assert db_keyword == "EXEC SQL"

    sort_confidence, sort_keyword = details["SORT"]
    assert sort_confidence == 0.95
    # SORT is expected to be reported through a regex rule, hence the "re:" prefix.
    assert sort_keyword.startswith("re:SORT")

    call_confidence, call_keyword = details["子程序调用"]
    assert call_confidence == 0.90
    assert call_keyword == "CALL"


# ── 2. compute_confidence with hybrid (keyword + LLM) result ──

def test_compute_confidence_hybrid():
    """A keyword hit under the 0.90 threshold plus an LLM result gives method=hybrid with the LLM's category."""
    # "WRITE AFTER" is a "编辑输出" keyword rated 0.80, i.e. below 0.90.
    llm_answer = {"category": "output_heavy", "confidence": 0.75}

    outcome = compute_confidence("WRITE AFTER ADVANCING 1 LINE.", llm_result=llm_answer)

    expected_fields = {
        "method": "hybrid",
        "source": "llm",
        "category": "output_heavy",
        "confidence": 0.75,
    }
    for field, value in expected_fields.items():
        assert outcome[field] == value

    # The keyword hits must still travel with the result.
    keyword_hits = outcome["matches"]
    assert len(keyword_hits) > 0
    assert any("WRITE AFTER" in str(hit) for hit in keyword_hits)


def test_compute_confidence_keyword_high_confidence_overrides_llm():
    """A keyword hit at or above 0.90 decides the result; the LLM answer is not used."""
    # "EXEC SQL" is a "DB操作" keyword rated 0.95, i.e. at least 0.90.
    llm_answer = {"category": "something_else", "confidence": 0.50}

    outcome = compute_confidence("EXEC SQL SELECT * FROM TABLE", llm_result=llm_answer)

    expected_fields = {
        "method": "keyword",
        "source": "l1",
        "category": "DB操作",
        "confidence": 0.95,
    }
    for field, value in expected_fields.items():
        assert outcome[field] == value


# ── 3. compute_confidence boundaries: 0.0, 0.69, 0.70, 0.71, 1.0 ──

def test_confidence_boundary_zero():
    """Without a keyword hit or an LLM result the category is unknown and the confidence is 0.0."""
    cobol_lines = ["       MOVE 1 TO A.", "       ADD 1 TO B.", "       STOP RUN."]
    outcome = compute_confidence("\n".join(cobol_lines), llm_result=None)

    expected_fields = {
        "category": "unknown",
        "confidence": 0.0,
        "method": "none",
        "matches": [],
    }
    for field, value in expected_fields.items():
        assert outcome[field] == value


def test_confidence_boundary_069():
    """An LLM confidence of 0.69 (just under 0.70) is returned as given, with method=hybrid."""
    llm_answer = {"category": "custom_category", "confidence": 0.69}
    outcome = compute_confidence("       MOVE 1 TO A.", llm_result=llm_answer)

    expected_fields = {
        "category": "custom_category",
        "confidence": 0.69,
        "method": "hybrid",
    }
    for field, value in expected_fields.items():
        assert outcome[field] == value


def test_confidence_boundary_070():
    """An LLM confidence of exactly 0.70 is returned as given, with method=hybrid."""
    llm_answer = {"category": "custom_category", "confidence": 0.70}
    outcome = compute_confidence("       MOVE 1 TO A.", llm_result=llm_answer)

    expected_fields = {
        "category": "custom_category",
        "confidence": 0.70,
        "method": "hybrid",
    }
    for field, value in expected_fields.items():
        assert outcome[field] == value


def test_confidence_boundary_071():
    """An LLM confidence of 0.71 (just over 0.70) is returned as given, with method=hybrid."""
    llm_answer = {"category": "custom_category", "confidence": 0.71}
    outcome = compute_confidence("       MOVE 1 TO A.", llm_result=llm_answer)

    expected_fields = {
        "category": "custom_category",
        "confidence": 0.71,
        "method": "hybrid",
    }
    for field, value in expected_fields.items():
        assert outcome[field] == value


def test_confidence_boundary_max():
    """An LLM confidence of 1.0 is returned as given, with method=hybrid."""
    llm_answer = {"category": "perfect", "confidence": 1.0}
    outcome = compute_confidence("       MOVE 1 TO A.", llm_result=llm_answer)

    expected_fields = {
        "category": "perfect",
        "confidence": 1.0,
        "method": "hybrid",
    }
    for field, value in expected_fields.items():
        assert outcome[field] == value


# ── 4. Keyword source text with mixed case, extra whitespace, inline comments ──

def test_detect_keyword_mixed_case_whitespace_comments():
    """Keywords written in mixed case and followed by inline *> comments are still detected."""
    source = """
       IDENTIFICATION DIVISION.
           ExEc Sql
               SELECT * FROM TABLE
           END-EXEC.   *> inline comment
           Call 'SUBPGM'   *> some comment
           Sort On Key WS-KEY.
    """
    hits = detect_keyword(source)

    # Each mixed-case keyword must still map to its category.
    found_categories = set()
    for hit in hits:
        found_categories.add(hit[0])
    for wanted in ("DB操作", "子程序调用", "SORT"):
        assert wanted in found_categories

    # The reported keywords are expected in upper case even though the
    # source spells them in mixed case.
    matched_keywords = set()
    for hit in hits:
        matched_keywords.add(hit[2])
    assert "EXEC SQL" in matched_keywords
    assert "CALL" in matched_keywords

    # SORT is confirmed by category only; its matched text is not pinned here.
    assert "SORT" in [hit[0] for hit in hits]


# ── 5. No keyword match and no LLM result → unknown ──

def test_detect_keyword_no_match():
    """A source made only of MOVE / ADD / STOP RUN statements produces no keyword hits."""
    cobol_lines = ["       MOVE 1 TO A.", "       ADD 1 TO B.", "       STOP RUN."]
    hits = detect_keyword("\n".join(cobol_lines))
    assert 0 == len(hits)


def test_compute_confidence_no_match_no_llm():
    """With neither a keyword hit nor an LLM result, every field falls back to its "unknown" value."""
    cobol_lines = ["       MOVE 1 TO A.", "       ADD 1 TO B.", "       STOP RUN."]
    outcome = compute_confidence("\n".join(cobol_lines), llm_result=None)

    expected_fields = {
        "category": "unknown",
        "confidence": 0.0,
        "method": "none",
        "source": "unknown",
        "matches": [],
    }
    for field, value in expected_fields.items():
        assert outcome[field] == value


# ── Additional: verify L1_RULES via detect_keyword ──

def test_detect_keyword_all_rules():
    """Every listed keyword, embedded in a one-line source, must trigger its expected category."""
    # Representative keyword -> category that detect_keyword must report for it.
    expected_by_keyword = {
        "EXEC SQL": "DB操作",
        "CALL": "子程序调用",
        "IS INITIAL": "IS INITIAL",
        "SYSIN": "SYSIN",
        "ALPHABETIC": "编码转换",
        "DFHCOMMAREA": "online",
        "MAP": "online",
        "SORT SORT-FILE ON KEY": "SORT",
        "MERGE MERGE-FILE ON KEY": "MERGE",
        "WRITE AFTER": "编辑输出",
        "WRITE BEFORE": "编辑输出",
        "ORGANIZATION IS": "文件编成",
        "ALTERNATE RECORD KEY": "替代索引",
    }
    for keyword, expected_category in expected_by_keyword.items():
        # Surround the keyword with leading spaces and a trailing dummy token.
        hits = detect_keyword(f"       {keyword}  DUMMY.")
        categories = set()
        for hit in hits:
            categories.add(hit[0])
        assert expected_category in categories, (
            f"Keyword '{keyword}' should trigger category '{expected_category}', got {categories}"
        )