# cobol-java-v3/tests/hina/test_classifier_deep.py

"""Deep classifier tests: keyword detection, confidence boundaries, edge cases"""

import sys, os
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))

from hina.classifier import detect_keyword, compute_confidence


# ── 1. detect_keyword with SQL + SORT + CALL all present ──

def test_detect_keyword_multiple_matches():
    """A program with EXEC SQL, SORT ON KEY and CALL reports all three categories.

    For each category the reported confidence (index 1) and matched keyword
    (index 2) are compared against the expected values.
    """
    cobol_text = """
       IDENTIFICATION DIVISION.
       PROGRAM-ID. TESTPGM.
       DATA DIVISION.
       WORKING-STORAGE SECTION.
       01 WS-A PIC X(100).
       PROCEDURE DIVISION.
           EXEC SQL
               SELECT * FROM TABLE
           END-EXEC.
           SORT ON KEY WS-KEY.
           CALL 'SUBPGM'.
           STOP RUN.
    """
    matches = detect_keyword(cobol_text)

    # category -> (expected confidence, expected matched keyword)
    expected = {
        "DB操作": (0.95, "EXEC SQL"),
        "SORT": (0.95, "SORT ON KEY"),
        "子程序调用": (0.90, "CALL"),
    }

    # First pass: every expected category must appear among the matches.
    found = set()
    for entry in matches:
        found.add(entry[0])
    for category in expected:
        assert category in found

    # Second pass: check the confidence and keyword recorded per category.
    details = {}
    for entry in matches:
        details[entry[0]] = (entry[1], entry[2])
    for category, (confidence, keyword) in expected.items():
        assert details[category][0] == confidence
        assert details[category][1] == keyword


# ── 2. compute_confidence with hybrid (keyword + LLM) result ──

def test_compute_confidence_hybrid():
    """A weak keyword hit combined with an LLM answer yields a hybrid result.

    The result is expected to take its category and confidence from the LLM
    answer while still carrying the keyword matches.
    """
    # "WRITE AFTER" is expected to match "编辑输出" at confidence 0.80,
    # i.e. below the 0.90 threshold this test relies on.
    llm_answer = {"category": "output_heavy", "confidence": 0.75}
    outcome = compute_confidence("WRITE AFTER ADVANCING 1 LINE.", llm_result=llm_answer)

    expected_fields = {
        "method": "hybrid",
        "source": "llm",
        "category": "output_heavy",
        "confidence": 0.75,
    }
    for field, value in expected_fields.items():
        assert outcome[field] == value

    # The keyword matches must still be present on the result.
    assert len(outcome["matches"]) > 0
    assert any("WRITE AFTER" in str(match) for match in outcome["matches"])


def test_compute_confidence_keyword_high_confidence_overrides_llm():
    """A strong keyword hit (>= 0.90) is reported instead of the LLM answer."""
    # "EXEC SQL" is expected to match "DB操作" at confidence 0.95 (>= 0.90).
    outcome = compute_confidence(
        "EXEC SQL SELECT * FROM TABLE",
        llm_result={"category": "something_else", "confidence": 0.50},
    )

    expected_fields = (
        ("method", "keyword"),
        ("source", "l1"),
        ("category", "DB操作"),
        ("confidence", 0.95),
    )
    for field, value in expected_fields:
        assert outcome[field] == value


# ── 3. compute_confidence boundaries: 0.0, 0.69, 0.70, 0.71, 1.0 ──

def test_confidence_boundary_zero():
    """Without a keyword hit or an LLM answer the result is unknown at 0.0."""
    plain_source = (
        "       MOVE 1 TO A.\n"
        "       ADD 1 TO B.\n"
        "       STOP RUN."
    )
    outcome = compute_confidence(plain_source, llm_result=None)

    expected_fields = {
        "category": "unknown",
        "confidence": 0.0,
        "method": "none",
        "matches": [],
    }
    for field, value in expected_fields.items():
        assert outcome[field] == value


def test_confidence_boundary_069():
    """An LLM confidence of 0.69 (just under 0.70) is passed through as hybrid."""
    outcome = compute_confidence(
        "       MOVE 1 TO A.",
        llm_result={"category": "custom_category", "confidence": 0.69},
    )

    for field, value in (
        ("category", "custom_category"),
        ("confidence", 0.69),
        ("method", "hybrid"),
    ):
        assert outcome[field] == value


def test_confidence_boundary_070():
    """An LLM confidence of exactly 0.70 is passed through as hybrid."""
    outcome = compute_confidence(
        "       MOVE 1 TO A.",
        llm_result={"category": "custom_category", "confidence": 0.70},
    )

    for field, value in (
        ("category", "custom_category"),
        ("confidence", 0.70),
        ("method", "hybrid"),
    ):
        assert outcome[field] == value


def test_confidence_boundary_071():
    """An LLM confidence of 0.71 (just over 0.70) is passed through as hybrid."""
    outcome = compute_confidence(
        "       MOVE 1 TO A.",
        llm_result={"category": "custom_category", "confidence": 0.71},
    )

    for field, value in (
        ("category", "custom_category"),
        ("confidence", 0.71),
        ("method", "hybrid"),
    ):
        assert outcome[field] == value


def test_confidence_boundary_max():
    """An LLM confidence of 1.0 is passed through as hybrid."""
    outcome = compute_confidence(
        "       MOVE 1 TO A.",
        llm_result={"category": "perfect", "confidence": 1.0},
    )

    for field, value in (
        ("category", "perfect"),
        ("confidence", 1.0),
        ("method", "hybrid"),
    ):
        assert outcome[field] == value


# ── 4. Keyword source text with mixed case, extra whitespace, inline comments ──

def test_detect_keyword_mixed_case_whitespace_comments():
    """Mixed-case keywords followed by inline ``*>`` comments are still detected."""
    mixed_source = """
       IDENTIFICATION DIVISION.
           ExEc Sql
               SELECT * FROM TABLE
           END-EXEC.   *> inline comment
           Call 'SUBPGM'   *> some comment
           Sort On Key WS-KEY.
    """
    hits = detect_keyword(mixed_source)

    # Categories triggered by the mixed-case EXEC SQL, CALL and SORT ON KEY.
    seen_categories = {hit[0] for hit in hits}
    for category in ("DB操作", "子程序调用", "SORT"):
        assert category in seen_categories

    # Matched keywords are expected in upper case
    # (presumably the classifier normalizes case — verify in hina.classifier).
    seen_keywords = {hit[2] for hit in hits}
    for keyword in ("EXEC SQL", "CALL", "SORT ON KEY"):
        assert keyword in seen_keywords


# ── 5. No keyword match and no LLM result → unknown ──

def test_detect_keyword_no_match():
    """A source containing none of the known keywords produces no matches."""
    plain_source = (
        "       MOVE 1 TO A.\n"
        "       ADD 1 TO B.\n"
        "       STOP RUN."
    )
    match_count = len(detect_keyword(plain_source))
    assert match_count == 0


def test_compute_confidence_no_match_no_llm():
    """No keyword hit and no LLM answer gives an all-unknown result with no matches."""
    plain_source = (
        "       MOVE 1 TO A.\n"
        "       ADD 1 TO B.\n"
        "       STOP RUN."
    )
    outcome = compute_confidence(plain_source, llm_result=None)

    expected_fields = {
        "category": "unknown",
        "confidence": 0.0,
        "method": "none",
        "source": "unknown",
        "matches": [],
    }
    for field, value in expected_fields.items():
        assert outcome[field] == value


# ── Additional: verify L1_RULES via detect_keyword ──

def test_detect_keyword_all_rules():
    """Every listed keyword, embedded in a minimal line, triggers its category."""
    # (keyword placed in the source, category that must be reported)
    keyword_to_category = (
        ("EXEC SQL", "DB操作"),
        ("CALL", "子程序调用"),
        ("IS INITIAL", "IS INITIAL"),
        ("SYSIN", "SYSIN"),
        ("ALPHABETIC", "编码转换"),
        ("DFHCOMMAREA", "online"),
        ("MAP", "online"),
        ("SORT ON KEY", "SORT"),
        ("MERGE ON KEY", "MERGE"),
        ("WRITE AFTER", "编辑输出"),
        ("WRITE BEFORE", "编辑输出"),
        ("ORGANIZATION IS", "文件编成"),
        ("ALTERNATE RECORD KEY", "替代索引"),
    )
    for keyword, expected_category in keyword_to_category:
        # Wrap the keyword in a one-line snippet and collect the categories found.
        categories = {hit[0] for hit in detect_keyword(f"       {keyword}  DUMMY.")}
        assert expected_category in categories, \
            f"Keyword '{keyword}' should trigger category '{expected_category}', got {categories}"