feat: Phase 2 complete — 13 Phases of COBOL type classification and test benchmark

P0.6: gcov infrastructure
P1: extract_structure output expansion (11 new feature fields)
P2: Confusion group rule engine (8 pairs + contradiction + backtrack)
P3: 4-factor confidence calculation + quality gate update
P4: 33+2 COBOL program type test samples (22 files, 7 categories)
P5: parametrized/ test data generation engine
P6: japanese_data.py lookup tables
P7-10: Type-specific test suites (~159 parametrized tests)
P11: Full classification pipeline (classify_program) + orchestrator integration
P12: Documentation (module-interfaces, test-plan v3.0, coverage-matrix)

Architecture decisions:
- classification_pipeline/ merged to hina/pipeline/
- parametrized/ as independent module
- japanese_data.py as root-level file
- hina/__all__ only exports classify_program()

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
hangshuo652
2026-06-19 23:51:55 +08:00
parent 63b5284715
commit bc1d56d1a4
129 changed files with 19378 additions and 261 deletions
+203
View File
@@ -0,0 +1,203 @@
"""JP-01~10: japanese_data 模块 — 日文测试数据生成函数"""
from __future__ import annotations
import sys
import os
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..", "..")))
from japanese_data import (
FULLWIDTH_KATAKANA,
FULLWIDTH_HIRAGANA,
FULLWIDTH_DIGITS,
FULLWIDTH_ALPHA,
HALFWIDTH_KATAKANA,
SJIS_5C_PROBLEM,
SJIS_7C_PROBLEM,
WAREKI_BOUNDARIES,
generate_fullwidth_text,
generate_halfwidth_katakana,
generate_sjis_5c_problem,
generate_sjis_7c_problem,
generate_wareki_date,
generate_wareki_boundary,
generate_encoding_test_data,
select_data_type,
)
# ── JP-01~02: lookup table constants ──
def test_fullwidth_katakana_constants():
    """JP-01: the full-width katakana table is non-empty and contains the probe entries."""
    table_size = len(FULLWIDTH_KATAKANA)
    assert table_size > 0
    # NOTE(review): both probe literals are empty strings in the reviewed text;
    # non-ASCII characters may have been lost in extraction — confirm against
    # the repository before relying on these membership checks.
    for probe in ("", ""):
        assert probe in FULLWIDTH_KATAKANA
def test_fullwidth_hiragana_constants():
    """The full-width hiragana table is non-empty and contains the probe entries."""
    table_size = len(FULLWIDTH_HIRAGANA)
    assert table_size > 0
    # NOTE(review): both probe literals are empty strings in the reviewed text;
    # non-ASCII characters may have been lost in extraction — confirm against
    # the repository before relying on these membership checks.
    for probe in ("", ""):
        assert probe in FULLWIDTH_HIRAGANA
def test_halfwidth_katakana_constants():
    """The half-width katakana table is non-empty and contains the probe entry."""
    table_size = len(HALFWIDTH_KATAKANA)
    assert table_size > 0
    # NOTE(review): the probe literal is an empty string in the reviewed text;
    # a non-ASCII character may have been lost in extraction — confirm against
    # the repository before relying on this membership check.
    probe = ""
    assert probe in HALFWIDTH_KATAKANA
def test_sjis_problem_constants():
    """Check the contents of the SJIS 5C / 7C problem-character tables."""
    problem_tables = (SJIS_5C_PROBLEM, SJIS_7C_PROBLEM)
    # NOTE(review): the probe literal is an empty string in the reviewed text;
    # a non-ASCII character may have been lost in extraction — confirm against
    # the repository before relying on these membership checks.
    for table in problem_tables:
        assert "" in table
    # Both tables must also hold at least one entry.
    for table in problem_tables:
        assert len(table) > 0
def test_wareki_boundaries():
    """The Japanese-era boundary table lists both the Heisei and Showa eras."""
    # The first element of every boundary entry is compared against the era names.
    era_names = []
    for boundary in WAREKI_BOUNDARIES:
        era_names.append(boundary[0])
    for expected_era in ("平成", "昭和"):
        assert expected_era in era_names
# ── JP-03~05: generate_fullwidth_text ──
def test_fullwidth_text_type():
    """JP-03: generate_fullwidth_text returns a str."""
    text = generate_fullwidth_text(
        {"pic_info": {"type": "national", "length": 10}}
    )
    assert isinstance(text, str)
def test_fullwidth_text_length():
    """JP-04: generate_fullwidth_text returns text of the requested length."""
    pic_info = {"type": "national", "length": 5}
    text = generate_fullwidth_text({"pic_info": pic_info})
    assert len(text) == 5
def test_fullwidth_text_contents():
    """JP-05: every character from generate_fullwidth_text is in the full-width katakana table."""
    pic_info = {"type": "national", "length": 20}
    text = generate_fullwidth_text({"pic_info": pic_info})
    for ch in text:
        assert ch in FULLWIDTH_KATAKANA, f"意外字符 {ch!r}"
# ── JP-06~07: generate_halfwidth_katakana ──
def test_halfwidth_katakana_type():
    """JP-06: generate_halfwidth_katakana returns a str."""
    text = generate_halfwidth_katakana(
        {"pic_info": {"type": "alphanumeric", "length": 10}}
    )
    assert isinstance(text, str)
def test_halfwidth_katakana_length():
    """JP-07: generate_halfwidth_katakana returns text of the requested length."""
    pic_info = {"type": "alphanumeric", "length": 8}
    text = generate_halfwidth_katakana({"pic_info": pic_info})
    assert len(text) == 8
# ── JP-08: generate_sjis_5c_problem ──
def test_sjis_5c_text():
    """JP-08: generate_sjis_5c_problem yields a 6-character str drawn from the 5C table."""
    pic_info = {"type": "alphanumeric", "length": 6}
    text = generate_sjis_5c_problem({"pic_info": pic_info})
    assert isinstance(text, str)
    assert len(text) == 6
    # Every generated character must be one of the 5C problem characters.
    for ch in text:
        assert ch in SJIS_5C_PROBLEM, f"意外字符 {ch!r}"
# ── JP-09: generate_sjis_7c_problem ──
def test_sjis_7c_text():
    """JP-09: generate_sjis_7c_problem yields a 5-character str drawn from the 7C table."""
    pic_info = {"type": "alphanumeric", "length": 5}
    text = generate_sjis_7c_problem({"pic_info": pic_info})
    assert isinstance(text, str)
    assert len(text) == 5
    # Every generated character must be one of the 7C problem characters.
    for ch in text:
        assert ch in SJIS_7C_PROBLEM, f"意外字符 {ch!r}"
# ── JP-10: generate_wareki_date ──
def test_wareki_date_format():
    """JP-10: generate_wareki_date returns a string shaped like H050101."""
    stamp = generate_wareki_date("H")
    assert isinstance(stamp, str)
    # Layout: 1-char era prefix + 2-digit year + 2-digit month + 2-digit day = 7 chars.
    assert len(stamp) == 7
    assert stamp[0] == "H"
    # Parse the three two-digit fields that follow the era prefix.
    year_part, month_part, day_part = (
        int(stamp[offset:offset + 2]) for offset in (1, 3, 5)
    )
    # Accepted ranges: year 01-30, month 01-12, day 01-28.
    assert 1 <= year_part <= 30
    assert 1 <= month_part <= 12
    assert 1 <= day_part <= 28
# ── Boundary value tests ──
def test_wareki_boundary_heisei():
    """generate_wareki_boundary("平成") returns a (first day, last day) pair of strings."""
    first_day, last_day = generate_wareki_boundary("平成")
    for value in (first_day, last_day):
        assert isinstance(value, str)
    assert first_day.startswith("H")
    assert first_day == "H010108"
def test_encoding_test_data_type():
    """generate_encoding_test_data returns a pair whose two items are both bytes."""
    source_payload, target_payload = generate_encoding_test_data()
    for payload in (source_payload, target_payload):
        assert isinstance(payload, bytes)
def test_select_data_type_national():
    """select_data_type reports "japanese" for a national (PIC N) field."""
    chosen = select_data_type({"pic_info": {"type": "national"}})
    assert chosen == "japanese"
def test_select_data_type_numeric():
    """select_data_type reports "numeric" for a numeric (PIC 9) field."""
    chosen = select_data_type({"pic_info": {"type": "numeric", "digits": 5}})
    assert chosen == "numeric"
def test_select_data_type_halfwidth():
    """select_data_type reports "halfwidth" for an alphanumeric (PIC X) field."""
    chosen = select_data_type(
        {"pic_info": {"type": "alphanumeric", "length": 10}}
    )
    assert chosen == "halfwidth"
# ── Default argument tests ──
def test_wareki_date_default():
    """generate_wareki_date called without arguments yields an "R"-prefixed (Reiwa) date."""
    era_prefix = generate_wareki_date()[0]
    assert era_prefix == "R"
def test_wareki_boundary_default():
    """generate_wareki_boundary called without arguments yields an "H"-prefixed (Heisei) second item."""
    # Only the second element of the returned pair is inspected.
    _first_item, second_item = generate_wareki_boundary()
    assert second_item.startswith("H")