feat: Phase 2 complete — 13 Phases of COBOL type classification and test benchmark
P0.6: gcov infrastructure
P1: extract_structure output expansion (11 new feature fields)
P2: Confusion group rule engine (8 pairs + contradiction + backtrack)
P3: 4-factor confidence calculation + quality gate update
P4: 33+2 COBOL program type test samples (22 files, 7 categories)
P5: parametrized/ test data generation engine
P6: japanese_data.py lookup tables
P7-10: Type-specific test suites (~159 parametrized tests)
P11: Full classification pipeline (classify_program) + orchestrator integration
P12: Documentation (module-interfaces, test-plan v3.0, coverage-matrix)

Architecture decisions:
- classification_pipeline/ merged to hina/pipeline/
- parametrized/ as independent module
- japanese_data.py as root-level file
- hina/__all__ only exports classify_program()

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,203 @@
|
||||
"""JP-01~10: japanese_data module — Japanese test-data generation functions."""
|
||||
from __future__ import annotations
|
||||
|
||||
import sys
|
||||
import os
|
||||
|
||||
# Put the directory two levels above this file at the front of sys.path so
# that the `japanese_data` import below resolves.
sys.path.insert(
    0,
    os.path.abspath(
        os.path.join(os.path.dirname(__file__), os.pardir, os.pardir)
    ),
)
|
||||
|
||||
from japanese_data import (
|
||||
FULLWIDTH_KATAKANA,
|
||||
FULLWIDTH_HIRAGANA,
|
||||
FULLWIDTH_DIGITS,
|
||||
FULLWIDTH_ALPHA,
|
||||
HALFWIDTH_KATAKANA,
|
||||
SJIS_5C_PROBLEM,
|
||||
SJIS_7C_PROBLEM,
|
||||
WAREKI_BOUNDARIES,
|
||||
generate_fullwidth_text,
|
||||
generate_halfwidth_katakana,
|
||||
generate_sjis_5c_problem,
|
||||
generate_sjis_7c_problem,
|
||||
generate_wareki_date,
|
||||
generate_wareki_boundary,
|
||||
generate_encoding_test_data,
|
||||
select_data_type,
|
||||
)
|
||||
|
||||
|
||||
# ── JP-01~02: lookup-table constants ──
|
||||
|
||||
|
||||
def test_fullwidth_katakana_constants():
    """JP-01: the full-width katakana table is non-empty and contains "ア" and "ン"."""
    assert len(FULLWIDTH_KATAKANA) > 0
    for kana in ("ア", "ン"):
        assert kana in FULLWIDTH_KATAKANA
|
||||
|
||||
|
||||
def test_fullwidth_hiragana_constants():
    """The full-width hiragana table is non-empty and contains "あ" and "ん"."""
    assert len(FULLWIDTH_HIRAGANA) > 0
    for kana in ("あ", "ん"):
        assert kana in FULLWIDTH_HIRAGANA
|
||||
|
||||
|
||||
def test_halfwidth_katakana_constants():
    """The half-width katakana table is non-empty and contains half-width "ｱ"."""
    assert len(HALFWIDTH_KATAKANA) > 0
    # U+FF71 is HALFWIDTH KATAKANA LETTER A. The previous check used the
    # full-width "ア" (U+30A2), which is a different code point and belongs to
    # the full-width table. Written as an escape so that Unicode normalisation
    # of this file cannot silently turn it back into the full-width form.
    # NOTE(review): assumes HALFWIDTH_KATAKANA holds half-width characters
    # directly — confirm against japanese_data.py.
    assert "\uff71" in HALFWIDTH_KATAKANA
|
||||
|
||||
|
||||
def test_sjis_problem_constants():
    """The SJIS 5C/7C problem-character tables hold the expected samples and are non-empty."""
    # NOTE(review): Python's shift_jis codec appears to encode "本" as
    # 0x96 0x7B (trail byte 0x7B, not 0x7C) — confirm that it really belongs
    # in the 7C table in japanese_data.py.
    samples = (("ソ", SJIS_5C_PROBLEM), ("本", SJIS_7C_PROBLEM))
    for sample, table in samples:
        assert sample in table
    for _, table in samples:
        assert len(table) > 0
|
||||
|
||||
|
||||
def test_wareki_boundaries():
    """The wareki boundary table has entries whose first item is "平成" and "昭和"."""
    era_names = [entry[0] for entry in WAREKI_BOUNDARIES]
    for expected_era in ("平成", "昭和"):
        assert expected_era in era_names
|
||||
|
||||
|
||||
# ── JP-03~05: generate_fullwidth_text ──
|
||||
|
||||
|
||||
def test_fullwidth_text_type():
    """JP-03: generate_fullwidth_text returns a str."""
    national_field = {"pic_info": {"type": "national", "length": 10}}
    assert isinstance(generate_fullwidth_text(national_field), str)
|
||||
|
||||
|
||||
def test_fullwidth_text_length():
    """JP-04: generate_fullwidth_text returns text of the requested length."""
    wanted = 5
    national_field = {"pic_info": {"type": "national", "length": wanted}}
    generated = generate_fullwidth_text(national_field)
    assert len(generated) == wanted
|
||||
|
||||
|
||||
def test_fullwidth_text_contents():
    """JP-05: every character of generate_fullwidth_text comes from the full-width katakana table."""
    national_field = {"pic_info": {"type": "national", "length": 20}}
    generated = generate_fullwidth_text(national_field)
    # Report the first character that is not in the table, if any.
    ch = next((c for c in generated if c not in FULLWIDTH_KATAKANA), None)
    assert ch is None, f"意外字符 {ch!r}"
|
||||
|
||||
|
||||
# ── JP-06~07: generate_halfwidth_katakana ──
|
||||
|
||||
|
||||
def test_halfwidth_katakana_type():
    """JP-06: generate_halfwidth_katakana returns a str."""
    alnum_field = {"pic_info": {"type": "alphanumeric", "length": 10}}
    assert isinstance(generate_halfwidth_katakana(alnum_field), str)
|
||||
|
||||
|
||||
def test_halfwidth_katakana_length():
    """JP-07: generate_halfwidth_katakana returns text of the requested length."""
    wanted = 8
    alnum_field = {"pic_info": {"type": "alphanumeric", "length": wanted}}
    generated = generate_halfwidth_katakana(alnum_field)
    assert len(generated) == wanted
|
||||
|
||||
|
||||
# ── JP-08: generate_sjis_5c_problem ──
|
||||
|
||||
|
||||
def test_sjis_5c_text():
    """JP-08: generate_sjis_5c_problem returns a 6-char str drawn from the 5C table."""
    alnum_field = {"pic_info": {"type": "alphanumeric", "length": 6}}
    generated = generate_sjis_5c_problem(alnum_field)
    assert isinstance(generated, str)
    assert len(generated) == 6
    # Report the first character that is not in the table, if any.
    ch = next((c for c in generated if c not in SJIS_5C_PROBLEM), None)
    assert ch is None, f"意外字符 {ch!r}"
|
||||
|
||||
|
||||
# ── JP-09: generate_sjis_7c_problem ──
|
||||
|
||||
|
||||
def test_sjis_7c_text():
    """JP-09: generate_sjis_7c_problem returns a 5-char str drawn from the 7C table."""
    alnum_field = {"pic_info": {"type": "alphanumeric", "length": 5}}
    generated = generate_sjis_7c_problem(alnum_field)
    assert isinstance(generated, str)
    assert len(generated) == 5
    # Report the first character that is not in the table, if any.
    ch = next((c for c in generated if c not in SJIS_7C_PROBLEM), None)
    assert ch is None, f"意外字符 {ch!r}"
|
||||
|
||||
|
||||
# ── JP-10: generate_wareki_date ──
|
||||
|
||||
|
||||
def test_wareki_date_format():
    """JP-10: generate_wareki_date("H") returns a value formatted like H050101."""
    wareki = generate_wareki_date("H")
    assert isinstance(wareki, str)
    # Layout: 1 era prefix + 2 year + 2 month + 2 day digits = 7 characters.
    assert len(wareki) == 7
    prefix, digits = wareki[0], wareki[1:]
    assert prefix == "H"
    # Accepted ranges: year 01-30, month 01-12, day 01-28.
    year_part, month_part, day_part = (
        int(digits[offset:offset + 2]) for offset in (0, 2, 4)
    )
    assert 1 <= year_part <= 30
    assert 1 <= month_part <= 12
    assert 1 <= day_part <= 28
|
||||
|
||||
|
||||
# ── Boundary-value, encoding-data and data-type selection tests ──
|
||||
|
||||
|
||||
def test_wareki_boundary_heisei():
    """generate_wareki_boundary("平成") returns a (first day, last day) pair of strings."""
    boundary = generate_wareki_boundary("平成")
    first_day, last_day = boundary
    assert isinstance(first_day, str)
    assert isinstance(last_day, str)
    assert first_day.startswith("H")
    # The first item is expected to be exactly H010108.
    assert first_day == "H010108"
|
||||
|
||||
|
||||
def test_encoding_test_data_type():
    """generate_encoding_test_data returns a two-item result whose items are both bytes."""
    source_bytes, target_bytes = generate_encoding_test_data()
    for payload in (source_bytes, target_bytes):
        assert isinstance(payload, bytes)
|
||||
|
||||
|
||||
def test_select_data_type_national():
    """select_data_type returns "japanese" for a national (PIC N) field."""
    detected = select_data_type({"pic_info": {"type": "national"}})
    assert detected == "japanese"
|
||||
|
||||
|
||||
def test_select_data_type_numeric():
    """select_data_type returns "numeric" for a numeric (PIC 9) field."""
    detected = select_data_type({"pic_info": {"type": "numeric", "digits": 5}})
    assert detected == "numeric"
|
||||
|
||||
|
||||
def test_select_data_type_halfwidth():
    """select_data_type returns "halfwidth" for an alphanumeric (PIC X) field."""
    detected = select_data_type(
        {"pic_info": {"type": "alphanumeric", "length": 10}}
    )
    assert detected == "halfwidth"
|
||||
|
||||
|
||||
# ── Default-argument tests ──
|
||||
|
||||
|
||||
def test_wareki_date_default():
    """generate_wareki_date() with no argument produces a value prefixed with "R" (Reiwa)."""
    era_prefix = generate_wareki_date()[0]
    assert era_prefix == "R"
|
||||
|
||||
|
||||
def test_wareki_boundary_default():
    """generate_wareki_boundary() with no argument defaults to the 平成 era."""
    # Named (start, end) to match test_wareki_boundary_heisei; the earlier
    # (prev, new) names suggested a different pair than the one returned.
    start, end = generate_wareki_boundary()
    assert end.startswith("H")
    # A prefix check on one item cannot tell the eras apart reliably, so pin
    # the default to the result of asking for 平成 explicitly.
    # NOTE(review): assumes generate_wareki_boundary is deterministic — confirm.
    assert (start, end) == tuple(generate_wareki_boundary("平成"))
|
||||
Reference in New Issue
Block a user