Files
cobol-java-v3/tests/hina/test_classifier_deep.py
hangshuo652 bc1d56d1a4 feat: Phase 2 complete — 13 Phases of COBOL type classification and test benchmark
P0.6: gcov infrastructure
P1: extract_structure output expansion (11 new feature fields)
P2: Confusion group rule engine (8 pairs + contradiction + backtrack)
P3: 4-factor confidence calculation + quality gate update
P4: 33+2 COBOL program type test samples (22 files, 7 categories)
P5: parametrized/ test data generation engine
P6: japanese_data.py lookup tables
P7-10: Type-specific test suites (~159 parametrized tests)
P11: Full classification pipeline (classify_program) + orchestrator integration
P12: Documentation (module-interfaces, test-plan v3.0, coverage-matrix)

Architecture decisions:
- classification_pipeline/ merged to hina/pipeline/
- parametrized/ as independent module
- japanese_data.py as root-level file
- hina/__all__ only exports classify_program()

Co-Authored-By: Claude <noreply@anthropic.com>
2026-06-19 23:51:55 +08:00

206 lines
7.4 KiB
Python

"""Deep classifier tests: keyword detection, confidence boundaries, edge cases"""
import sys, os
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
from hina.classifier import detect_keyword, compute_confidence
# ── 1. detect_keyword with SQL + SORT + CALL all present ──
def test_detect_keyword_multiple_matches():
    """Check that SQL, SORT and CALL keywords in one program are all reported.

    Each match is indexed positionally as (category, confidence, keyword);
    the test pins the confidence and the matched keyword for all three
    expected categories.
    """
    cobol_text = """
IDENTIFICATION DIVISION.
PROGRAM-ID. TESTPGM.
DATA DIVISION.
WORKING-STORAGE SECTION.
01 WS-A PIC X(100).
PROCEDURE DIVISION.
EXEC SQL
SELECT * FROM TABLE
END-EXEC.
SORT ON KEY WS-KEY.
CALL 'SUBPGM'.
STOP RUN.
"""
    matches = detect_keyword(cobol_text)

    # All three categories must be present before the details are inspected.
    found_categories = {match[0] for match in matches}
    for category in ("DB操作", "SORT", "子程序调用"):
        assert category in found_categories

    # Index the matches by category so each expectation is a direct lookup.
    details_by_category = {match[0]: (match[1], match[2]) for match in matches}
    expectations = (
        ("DB操作", 0.95, "EXEC SQL"),
        ("SORT", 0.95, "SORT ON KEY"),
        ("子程序调用", 0.90, "CALL"),
    )
    for category, confidence, matched_keyword in expectations:
        assert details_by_category[category][0] == confidence
        assert details_by_category[category][1] == matched_keyword
# ── 2. compute_confidence with hybrid (keyword + LLM) result ──
def test_compute_confidence_hybrid():
    """Expect a hybrid result that adopts the LLM answer and keeps keyword matches.

    NOTE(review): the original author states that "WRITE AFTER" maps to
    "编辑输出" with confidence 0.80, below a 0.90 keyword-override threshold.
    Neither number is asserted by this test; confirm against hina.classifier.
    """
    llm_answer = {"category": "output_heavy", "confidence": 0.75}
    outcome = compute_confidence(
        "WRITE AFTER ADVANCING 1 LINE.",
        llm_result=llm_answer,
    )

    # The LLM's category and confidence are expected to be reported as-is.
    expected_fields = {
        "method": "hybrid",
        "source": "llm",
        "category": "output_heavy",
        "confidence": 0.75,
    }
    for field, expected_value in expected_fields.items():
        assert outcome[field] == expected_value

    # The keyword hits must still travel with the result.
    keyword_hits = outcome["matches"]
    assert len(keyword_hits) > 0
    assert any("WRITE AFTER" in str(hit) for hit in keyword_hits)
def test_compute_confidence_keyword_high_confidence_overrides_llm():
    """Expect the keyword classification to win over a supplied LLM result.

    NOTE(review): the original author attributes this to "EXEC SQL" matching
    "DB操作" at 0.95, at or above a 0.90 threshold; the threshold itself is
    not visible in this test, so verify it against hina.classifier.
    """
    competing_llm_answer = {"category": "something_else", "confidence": 0.50}
    outcome = compute_confidence(
        "EXEC SQL SELECT * FROM TABLE",
        llm_result=competing_llm_answer,
    )

    expected_fields = {
        "method": "keyword",
        "source": "l1",
        "category": "DB操作",
        "confidence": 0.95,
    }
    for field, expected_value in expected_fields.items():
        assert outcome[field] == expected_value
# ── 3. compute_confidence boundaries: 0.0, 0.69, 0.70, 0.71, 1.0 ──
def test_confidence_boundary_zero():
    """Expect the "unknown" result at confidence 0.0 when there is no LLM input.

    The source holds only MOVE/ADD/STOP RUN statements and the result is
    expected to carry an empty match list.
    """
    plain_statements = " MOVE 1 TO A.\n ADD 1 TO B.\n STOP RUN."
    outcome = compute_confidence(plain_statements, llm_result=None)

    expected_fields = {
        "category": "unknown",
        "confidence": 0.0,
        "method": "none",
        "matches": [],
    }
    for field, expected_value in expected_fields.items():
        assert outcome[field] == expected_value
def test_confidence_boundary_069():
    """Expect an LLM confidence of 0.69 to be reported unchanged as "hybrid".

    NOTE(review): the original author describes 0.69 as just below a 0.70
    boundary; the expectations here do not differ from the 0.70/0.71 cases.
    """
    outcome = compute_confidence(
        " MOVE 1 TO A.",
        llm_result={"category": "custom_category", "confidence": 0.69},
    )

    expected_fields = {
        "category": "custom_category",
        "confidence": 0.69,
        "method": "hybrid",
    }
    for field, expected_value in expected_fields.items():
        assert outcome[field] == expected_value
def test_confidence_boundary_070():
    """Expect an LLM confidence of exactly 0.70 to be reported unchanged as "hybrid".

    NOTE(review): 0.70 is named as a boundary by the original author; what
    the boundary controls is not visible from this test.
    """
    outcome = compute_confidence(
        " MOVE 1 TO A.",
        llm_result={"category": "custom_category", "confidence": 0.70},
    )

    expected_fields = {
        "category": "custom_category",
        "confidence": 0.70,
        "method": "hybrid",
    }
    for field, expected_value in expected_fields.items():
        assert outcome[field] == expected_value
def test_confidence_boundary_071():
    """Expect an LLM confidence of 0.71 to be reported unchanged as "hybrid".

    NOTE(review): the original author describes 0.71 as just above a 0.70
    boundary; the expectations here do not differ from the 0.69/0.70 cases.
    """
    outcome = compute_confidence(
        " MOVE 1 TO A.",
        llm_result={"category": "custom_category", "confidence": 0.71},
    )

    expected_fields = {
        "category": "custom_category",
        "confidence": 0.71,
        "method": "hybrid",
    }
    for field, expected_value in expected_fields.items():
        assert outcome[field] == expected_value
def test_confidence_boundary_max():
    """Expect the maximum LLM confidence of 1.0 to be reported unchanged as "hybrid"."""
    outcome = compute_confidence(
        " MOVE 1 TO A.",
        llm_result={"category": "perfect", "confidence": 1.0},
    )

    expected_fields = {
        "category": "perfect",
        "confidence": 1.0,
        "method": "hybrid",
    }
    for field, expected_value in expected_fields.items():
        assert outcome[field] == expected_value
# ── 4. Keyword source text with mixed case, extra whitespace, inline comments ──
def test_detect_keyword_mixed_case_whitespace_comments():
    """Expect keyword detection to succeed on mixed-case text with ``*>`` comments.

    The matched keywords are asserted in upper case even though the source
    spells them in mixed case.
    """
    cobol_text = """
IDENTIFICATION DIVISION.
ExEc Sql
SELECT * FROM TABLE
END-EXEC. *> inline comment
Call 'SUBPGM' *> some comment
Sort On Key WS-KEY.
"""
    matches = detect_keyword(cobol_text)

    # Category names come from position 0 of each match.
    found_categories = {match[0] for match in matches}
    for category in ("DB操作", "子程序调用", "SORT"):
        assert category in found_categories

    # Matched keyword text comes from position 2 of each match.
    found_keywords = {match[2] for match in matches}
    for keyword in ("EXEC SQL", "CALL", "SORT ON KEY"):
        assert keyword in found_keywords
# ── 5. No keyword match and no LLM result → unknown ──
def test_detect_keyword_no_match():
    """Expect zero matches for a source made only of MOVE/ADD/STOP RUN statements."""
    plain_statements = " MOVE 1 TO A.\n ADD 1 TO B.\n STOP RUN."
    match_count = len(detect_keyword(plain_statements))
    assert match_count == 0
def test_compute_confidence_no_match_no_llm():
    """Expect the full "unknown" result when no LLM result is supplied.

    Pins category, confidence, method, source and the empty match list for a
    source made only of MOVE/ADD/STOP RUN statements.
    """
    plain_statements = " MOVE 1 TO A.\n ADD 1 TO B.\n STOP RUN."
    outcome = compute_confidence(plain_statements, llm_result=None)

    expected_fields = {
        "category": "unknown",
        "confidence": 0.0,
        "method": "none",
        "source": "unknown",
        "matches": [],
    }
    for field, expected_value in expected_fields.items():
        assert outcome[field] == expected_value
# ── Additional: verify L1_RULES via detect_keyword ──
def test_detect_keyword_all_rules():
    """Expect every listed keyword to trigger its category when used alone.

    Each keyword is embedded in a one-line source and the expected category
    must appear among the categories reported by detect_keyword.
    """
    # Keyword -> expected category, kept in the original checking order.
    category_by_keyword = {
        "EXEC SQL": "DB操作",
        "CALL": "子程序调用",
        "IS INITIAL": "IS INITIAL",
        "SYSIN": "SYSIN",
        "ALPHABETIC": "编码转换",
        "DFHCOMMAREA": "online",
        "MAP": "online",
        "SORT ON KEY": "SORT",
        "MERGE ON KEY": "MERGE",
        "WRITE AFTER": "编辑输出",
        "WRITE BEFORE": "编辑输出",
        "ORGANIZATION IS": "文件编成",
        "ALTERNATE RECORD KEY": "替代索引",
    }
    for keyword, expected_category in category_by_keyword.items():
        categories = {match[0] for match in detect_keyword(f" {keyword} DUMMY.")}
        assert expected_category in categories, (
            f"Keyword '{keyword}' should trigger category '{expected_category}', got {categories}"
        )