feat: Phase 2 complete — 13 Phases of COBOL type classification and test benchmark

P0.6: gcov infrastructure
P1: extract_structure output expansion (11 new feature fields)
P2: Confusion group rule engine (8 pairs + contradiction + backtrack)
P3: 4-factor confidence calculation + quality gate update
P4: 33+2 COBOL program type test samples (22 files, 7 categories)
P5: parametrized/ test data generation engine
P6: japanese_data.py lookup tables
P7-10: Type-specific test suites (~159 parametrized tests)
P11: Full classification pipeline (classify_program) + orchestrator integration
P12: Documentation (module-interfaces, test-plan v3.0, coverage-matrix)

Architecture decisions:
- classification_pipeline/ merged to hina/pipeline/
- parametrized/ as independent module
- japanese_data.py as root-level file
- hina/__all__ only exports classify_program()

Co-Authored-By: Claude <noreply@anthropic.com>
2026-06-19 23:51:55 +08:00
parent 63b5284715
commit bc1d56d1a4
129 changed files with 19378 additions and 261 deletions
@@ -0,0 +1,205 @@
+"""Deep classifier tests: keyword detection, confidence boundaries, edge cases"""
+
+import sys, os
+sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
+
+from hina.classifier import detect_keyword, compute_confidence
+
+
+# ── 1. detect_keyword with SQL + SORT + CALL all present ──
+
+def test_detect_keyword_multiple_matches():
+    """A program with EXEC SQL, SORT ON KEY and CALL yields one match per keyword, each with its score."""
+    source = """
+       IDENTIFICATION DIVISION.
+       PROGRAM-ID. TESTPGM.
+       DATA DIVISION.
+       WORKING-STORAGE SECTION.
+       01 WS-A PIC X(100).
+       PROCEDURE DIVISION.
+           EXEC SQL
+               SELECT * FROM TABLE
+           END-EXEC.
+           SORT ON KEY WS-KEY.
+           CALL 'SUBPGM'.
+           STOP RUN.
+    """
+    matches = detect_keyword(source)
+
+    found_categories = {m[0] for m in matches}
+    assert "DB操作" in found_categories      # triggered by EXEC SQL
+    assert "SORT" in found_categories        # triggered by SORT ON KEY
+    assert "子程序调用" in found_categories   # triggered by CALL
+
+    # (confidence, matched keyword) recorded for each category
+    details = {m[0]: (m[1], m[2]) for m in matches}
+    sql_score, sql_keyword = details["DB操作"]
+    assert (sql_score, sql_keyword) == (0.95, "EXEC SQL")
+    sort_score, sort_keyword = details["SORT"]
+    assert (sort_score, sort_keyword) == (0.95, "SORT ON KEY")
+    call_score, call_keyword = details["子程序调用"]
+    assert (call_score, call_keyword) == (0.90, "CALL")
+
+
+# ── 2. compute_confidence with hybrid (keyword + LLM) result ──
+
+def test_compute_confidence_hybrid():
+    """A WRITE AFTER keyword hit plus an LLM verdict is reported as method=hybrid with the LLM's category."""
+    # NOTE(review): the WRITE AFTER rule is expected to score below the 0.90 keyword-only cutoff
+    cobol = "WRITE AFTER ADVANCING 1 LINE."
+    llm_verdict = {"category": "output_heavy", "confidence": 0.75}
+
+    outcome = compute_confidence(cobol, llm_result=llm_verdict)
+
+    # Category and confidence must be the LLM's, tagged as a hybrid decision
+    assert (outcome["method"], outcome["source"]) == ("hybrid", "llm")
+    assert (outcome["category"], outcome["confidence"]) == ("output_heavy", 0.75)
+    # The keyword evidence still travels with the result
+    attached = outcome["matches"]
+    assert len(attached) > 0
+    assert any("WRITE AFTER" in str(entry) for entry in attached)
+
+
+def test_compute_confidence_keyword_high_confidence_overrides_llm():
+    """An EXEC SQL keyword hit (0.95) takes precedence over a conflicting LLM verdict."""
+    cobol = "EXEC SQL SELECT * FROM TABLE"
+    conflicting = {"category": "something_else", "confidence": 0.50}
+
+    outcome = compute_confidence(cobol, llm_result=conflicting)
+
+    # Expected fields, checked in the order they are listed
+    wanted = (("method", "keyword"), ("source", "l1"),
+              ("category", "DB操作"), ("confidence", 0.95))
+    for field, value in wanted:
+        assert outcome[field] == value
+
+
+# ── 3. compute_confidence boundaries: 0.0, 0.69, 0.70, 0.71, 1.0 ──
+
+def test_confidence_boundary_zero():
+    """With neither a keyword hit nor an LLM verdict the result is unknown / 0.0 / none."""
+    plain_cobol = "       MOVE 1 TO A.\n       ADD 1 TO B.\n       STOP RUN."
+    outcome = compute_confidence(plain_cobol, llm_result=None)
+
+    assert (outcome["category"], outcome["method"]) == ("unknown", "none")
+    assert outcome["confidence"] == 0.0
+    # No keyword rule fired, so no matches are attached
+    assert outcome["matches"] == []
+
+
+def test_confidence_boundary_069():
+    """An LLM confidence of 0.69 (just under the 0.70 boundary) is returned unchanged as a hybrid result."""
+    outcome = compute_confidence(
+        "       MOVE 1 TO A.",
+        llm_result={"category": "custom_category", "confidence": 0.69},
+    )
+
+    observed = (outcome["category"], outcome["confidence"], outcome["method"])
+    assert observed == ("custom_category", 0.69, "hybrid")
+
+
+def test_confidence_boundary_070():
+    """An LLM confidence of exactly 0.70 (on the boundary) is returned unchanged as a hybrid result."""
+    outcome = compute_confidence(
+        "       MOVE 1 TO A.",
+        llm_result={"category": "custom_category", "confidence": 0.70},
+    )
+
+    observed = (outcome["category"], outcome["confidence"], outcome["method"])
+    assert observed == ("custom_category", 0.70, "hybrid")
+
+
+def test_confidence_boundary_071():
+    """An LLM confidence of 0.71 (just over the 0.70 boundary) is returned unchanged as a hybrid result."""
+    outcome = compute_confidence(
+        "       MOVE 1 TO A.",
+        llm_result={"category": "custom_category", "confidence": 0.71},
+    )
+
+    observed = (outcome["category"], outcome["confidence"], outcome["method"])
+    assert observed == ("custom_category", 0.71, "hybrid")
+
+
+def test_confidence_boundary_max():
+    """An LLM confidence of 1.0, the top of the range, is returned unchanged as a hybrid result."""
+    outcome = compute_confidence(
+        "       MOVE 1 TO A.",
+        llm_result={"category": "perfect", "confidence": 1.0},
+    )
+
+    observed = (outcome["category"], outcome["confidence"], outcome["method"])
+    assert observed == ("perfect", 1.0, "hybrid")
+
+
+# ── 4. Keyword source text with mixed case, extra whitespace, inline comments ──
+
+def test_detect_keyword_mixed_case_whitespace_comments():
+    """Mixed-case keywords followed by inline *> comments are still detected."""
+    source = """
+       IDENTIFICATION DIVISION.
+           ExEc Sql
+               SELECT * FROM TABLE
+           END-EXEC.   *> inline comment
+           Call 'SUBPGM'   *> some comment
+           Sort On Key WS-KEY.
+    """
+    hits = detect_keyword(source)
+
+    seen_categories = {hit[0] for hit in hits}
+    for category in ("DB操作", "子程序调用", "SORT"):
+        # each one comes from a mixed-case spelling in the source above
+        assert category in seen_categories
+
+    # Keywords are reported in upper case even though the source is not
+    # (presumably the detector normalises case before matching)
+    seen_keywords = {hit[2] for hit in hits}
+    for keyword in ("EXEC SQL", "CALL", "SORT ON KEY"):
+        assert keyword in seen_keywords
+
+
+# ── 5. No keyword match and no LLM result → unknown ──
+
+def test_detect_keyword_no_match():
+    """Plain MOVE / ADD / STOP RUN statements must not produce any keyword match."""
+    plain_cobol = "       MOVE 1 TO A.\n       ADD 1 TO B.\n       STOP RUN."
+    hits = detect_keyword(plain_cobol)
+    assert len(hits) == 0
+
+
+def test_compute_confidence_no_match_no_llm():
+    """Without keyword hits or an LLM verdict the result reports unknown / 0.0 / none and no matches."""
+    plain_cobol = "       MOVE 1 TO A.\n       ADD 1 TO B.\n       STOP RUN."
+    outcome = compute_confidence(plain_cobol, llm_result=None)
+
+    expected_fields = (("category", "unknown"), ("confidence", 0.0), ("method", "none"),
+                       ("source", "unknown"), ("matches", []))
+    for field, value in expected_fields:
+        # fields are checked in the order they are listed
+        assert outcome[field] == value
+
+
+# ── Additional: verify L1_RULES via detect_keyword ──
+
+def test_detect_keyword_all_rules():
+    """Every listed keyword, embedded in a one-line source, is detected under its expected category."""
+    rule_samples = (
+        ("EXEC SQL", "DB操作"),
+        ("CALL", "子程序调用"),
+        ("IS INITIAL", "IS INITIAL"),
+        ("SYSIN", "SYSIN"),
+        ("ALPHABETIC", "编码转换"),
+        ("DFHCOMMAREA", "online"),
+        ("MAP", "online"),
+        ("SORT ON KEY", "SORT"),
+        ("MERGE ON KEY", "MERGE"),
+        ("WRITE AFTER", "编辑输出"),
+        ("WRITE BEFORE", "编辑输出"),
+        ("ORGANIZATION IS", "文件编成"),
+        ("ALTERNATE RECORD KEY", "替代索引"),
+    )
+    for keyword, expected_category in rule_samples:
+        categories = {hit[0] for hit in detect_keyword(f"       {keyword}  DUMMY.")}
+        # the failure message names the keyword whose rule did not fire
+        assert expected_category in categories, (
+            f"Keyword '{keyword}' should trigger category '{expected_category}', got {categories}"
+        )