feat: Phase 2 complete — 13 Phases of COBOL type classification and test benchmark
P0.6: gcov infrastructure
P1: extract_structure output expansion (11 new feature fields)
P2: Confusion group rule engine (8 pairs + contradiction + backtrack)
P3: 4-factor confidence calculation + quality gate update
P4: 33+2 COBOL program type test samples (22 files, 7 categories)
P5: parametrized/ test data generation engine
P6: japanese_data.py lookup tables
P7-10: Type-specific test suites (~159 parametrized tests)
P11: Full classification pipeline (classify_program) + orchestrator integration
P12: Documentation (module-interfaces, test-plan v3.0, coverage-matrix)

Architecture decisions:
- classification_pipeline/ merged into hina/pipeline/
- parametrized/ as an independent module
- japanese_data.py as a root-level file
- hina/__all__ exports only classify_program()

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,205 @@
|
||||
"""Deep classifier tests: keyword detection, confidence boundaries, edge cases"""
|
||||
|
||||
import sys, os
|
||||
# Prepend the directory two levels above this file to sys.path before the
# `hina` import that follows (presumably the project root — confirm).
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
|
||||
|
||||
from hina.classifier import detect_keyword, compute_confidence
|
||||
|
||||
|
||||
# ── 1. detect_keyword with SQL + SORT + CALL all present ──
|
||||
|
||||
def test_detect_keyword_multiple_matches():
    """Check that SQL, SORT and CALL keywords in one program are all reported.

    Every expected category must be present in the detect_keyword() output,
    carrying the confidence and matched keyword listed in ``expected``.
    """
    source = "\n".join(
        [
            "",
            "IDENTIFICATION DIVISION.",
            "PROGRAM-ID. TESTPGM.",
            "DATA DIVISION.",
            "WORKING-STORAGE SECTION.",
            "01 WS-A PIC X(100).",
            "PROCEDURE DIVISION.",
            "EXEC SQL",
            "SELECT * FROM TABLE",
            "END-EXEC.",
            "SORT ON KEY WS-KEY.",
            "CALL 'SUBPGM'.",
            "STOP RUN.",
            "",
        ]
    )
    hits = detect_keyword(source)

    # Presence check first: each category must appear among the hits.
    found_categories = {hit[0] for hit in hits}
    expected = {
        "DB操作": (0.95, "EXEC SQL"),
        "SORT": (0.95, "SORT ON KEY"),
        "子程序调用": (0.90, "CALL"),
    }
    for category in expected:
        assert category in found_categories

    # Then verify confidence and matched keyword per category.
    # Hits are indexed as (category, confidence, keyword).
    by_category = {hit[0]: (hit[1], hit[2]) for hit in hits}
    for category, (confidence, keyword) in expected.items():
        assert by_category[category][0] == confidence
        assert by_category[category][1] == keyword
|
||||
|
||||
|
||||
# ── 2. compute_confidence with hybrid (keyword + LLM) result ──
|
||||
|
||||
def test_compute_confidence_hybrid():
    """Check the hybrid path: a weak keyword hit plus an LLM result.

    The result must report method "hybrid", take category and confidence
    from the LLM result, and still carry the keyword matches.
    """
    # Per the original author's note, "WRITE AFTER" maps to "编辑输出" at
    # confidence 0.80, below the 0.90 keyword threshold (rule table not
    # visible here).
    result = compute_confidence(
        "WRITE AFTER ADVANCING 1 LINE.",
        llm_result={"category": "output_heavy", "confidence": 0.75},
    )

    expected_fields = {
        "method": "hybrid",
        "source": "llm",
        "category": "output_heavy",
        "confidence": 0.75,
    }
    for field, value in expected_fields.items():
        assert result[field] == value

    # The keyword matches must remain attached to the result.
    matches = result["matches"]
    assert len(matches) > 0
    assert any("WRITE AFTER" in str(match) for match in matches)
|
||||
|
||||
|
||||
def test_compute_confidence_keyword_high_confidence_overrides_llm():
    """Check that a strong keyword hit takes precedence over the LLM result.

    The result must report method "keyword" and source "l1", with the
    keyword's category and confidence rather than the LLM's.
    """
    # Per the original author's note, "EXEC SQL" maps to "DB操作" at
    # confidence 0.95, which meets the 0.90 keyword threshold.
    result = compute_confidence(
        "EXEC SQL SELECT * FROM TABLE",
        llm_result={"category": "something_else", "confidence": 0.50},
    )

    expected_fields = {
        "method": "keyword",
        "source": "l1",
        "category": "DB操作",
        "confidence": 0.95,
    }
    for field, value in expected_fields.items():
        assert result[field] == value
|
||||
|
||||
|
||||
# ── 3. compute_confidence boundaries: 0.0, 0.69, 0.70, 0.71, 1.0 ──
|
||||
|
||||
def test_confidence_boundary_zero():
    """Check the lower bound: no keyword hit and no LLM result.

    Expected: category "unknown", confidence 0.0, method "none", no matches.
    """
    cobol = " MOVE 1 TO A.\n ADD 1 TO B.\n STOP RUN."
    outcome = compute_confidence(cobol, llm_result=None)

    expected_fields = {
        "category": "unknown",
        "confidence": 0.0,
        "method": "none",
        "matches": [],
    }
    for field, value in expected_fields.items():
        assert outcome[field] == value
|
||||
|
||||
|
||||
def test_confidence_boundary_069():
    """Check an LLM confidence of 0.69, just below the 0.70 boundary.

    The LLM category and confidence must be returned, with method "hybrid".
    """
    outcome = compute_confidence(
        " MOVE 1 TO A.",
        llm_result={"category": "custom_category", "confidence": 0.69},
    )

    expected_fields = {
        "category": "custom_category",
        "confidence": 0.69,
        "method": "hybrid",
    }
    for field, value in expected_fields.items():
        assert outcome[field] == value
|
||||
|
||||
|
||||
def test_confidence_boundary_070():
    """Check an LLM confidence of exactly 0.70, on the boundary.

    The LLM category and confidence must be returned, with method "hybrid".
    """
    outcome = compute_confidence(
        " MOVE 1 TO A.",
        llm_result={"category": "custom_category", "confidence": 0.70},
    )

    expected_fields = {
        "category": "custom_category",
        "confidence": 0.70,
        "method": "hybrid",
    }
    for field, value in expected_fields.items():
        assert outcome[field] == value
|
||||
|
||||
|
||||
def test_confidence_boundary_071():
    """Check an LLM confidence of 0.71, just above the 0.70 boundary.

    The LLM category and confidence must be returned, with method "hybrid".
    """
    outcome = compute_confidence(
        " MOVE 1 TO A.",
        llm_result={"category": "custom_category", "confidence": 0.71},
    )

    expected_fields = {
        "category": "custom_category",
        "confidence": 0.71,
        "method": "hybrid",
    }
    for field, value in expected_fields.items():
        assert outcome[field] == value
|
||||
|
||||
|
||||
def test_confidence_boundary_max():
    """Check the upper bound: an LLM confidence of 1.0.

    The LLM category and confidence must be returned, with method "hybrid".
    """
    outcome = compute_confidence(
        " MOVE 1 TO A.",
        llm_result={"category": "perfect", "confidence": 1.0},
    )

    expected_fields = {
        "category": "perfect",
        "confidence": 1.0,
        "method": "hybrid",
    }
    for field, value in expected_fields.items():
        assert outcome[field] == value
|
||||
|
||||
|
||||
# ── 4. Keyword source text with mixed case, extra whitespace, inline comments ──
|
||||
|
||||
def test_detect_keyword_mixed_case_whitespace_comments():
    """Check keyword detection on mixed-case source with inline ``*>`` comments.

    The SQL, CALL and SORT categories must all be found, and the matched
    keywords must be reported in upper case.
    """
    source = "\n".join(
        [
            "",
            "IDENTIFICATION DIVISION.",
            "ExEc Sql",
            "SELECT * FROM TABLE",
            "END-EXEC. *> inline comment",
            "Call 'SUBPGM' *> some comment",
            "Sort On Key WS-KEY.",
            "",
        ]
    )
    hits = detect_keyword(source)

    # Categories must be detected despite the mixed-case spelling.
    found_categories = {hit[0] for hit in hits}
    for category in ("DB操作", "子程序调用", "SORT"):
        assert category in found_categories

    # Matched keywords are expected in upper case (presumably detect_keyword
    # normalizes the source's case — confirm against its implementation).
    found_keywords = {hit[2] for hit in hits}
    for keyword in ("EXEC SQL", "CALL", "SORT ON KEY"):
        assert keyword in found_keywords
|
||||
|
||||
|
||||
# ── 5. No keyword match and no LLM result → unknown ──
|
||||
|
||||
def test_detect_keyword_no_match():
    """Check that source without any known keyword yields no matches."""
    cobol = " MOVE 1 TO A.\n ADD 1 TO B.\n STOP RUN."
    hits = detect_keyword(cobol)
    assert len(hits) == 0
|
||||
|
||||
|
||||
def test_compute_confidence_no_match_no_llm():
    """Check the fallback result when there is no keyword hit and no LLM result.

    Expected: category "unknown", confidence 0.0, method "none",
    source "unknown", and an empty match list.
    """
    cobol = " MOVE 1 TO A.\n ADD 1 TO B.\n STOP RUN."
    outcome = compute_confidence(cobol, llm_result=None)

    expected_fields = {
        "category": "unknown",
        "confidence": 0.0,
        "method": "none",
        "source": "unknown",
        "matches": [],
    }
    for field, value in expected_fields.items():
        assert outcome[field] == value
|
||||
|
||||
|
||||
# ── Additional: verify L1_RULES via detect_keyword ──
|
||||
|
||||
def test_detect_keyword_all_rules():
    """Check that each representative keyword triggers its expected category.

    Every (keyword, category) pair is evaluated before asserting, so one
    failing rule does not hide failures in the pairs listed after it. The
    failure message lists every mismatch, one per line.
    """
    test_cases = [
        ("EXEC SQL", "DB操作"),
        ("CALL", "子程序调用"),
        ("IS INITIAL", "IS INITIAL"),
        ("SYSIN", "SYSIN"),
        ("ALPHABETIC", "编码转换"),
        ("DFHCOMMAREA", "online"),
        ("MAP", "online"),
        ("SORT ON KEY", "SORT"),
        ("MERGE ON KEY", "MERGE"),
        ("WRITE AFTER", "编辑输出"),
        ("WRITE BEFORE", "编辑输出"),
        ("ORGANIZATION IS", "文件编成"),
        ("ALTERNATE RECORD KEY", "替代索引"),
    ]

    failures = []
    for keyword, expected_category in test_cases:
        # Minimal one-line fixture: the keyword followed by a dummy operand.
        source = f" {keyword} DUMMY."
        results = detect_keyword(source)
        categories = {r[0] for r in results}
        if expected_category not in categories:
            failures.append(
                f"Keyword '{keyword}' should trigger category "
                f"'{expected_category}', got {categories}"
            )

    # Single assertion at the end so the report covers all broken rules.
    assert not failures, "\n".join(failures)
|
||||
Reference in New Issue
Block a user