feat: Phase 2 complete — 13 Phases of COBOL type classification and test benchmark

P0.6: gcov infrastructure
P1: extract_structure output expansion (11 new feature fields)
P2: Confusion group rule engine (8 pairs + contradiction + backtrack)
P3: 4-factor confidence calculation + quality gate update
P4: 33+2 COBOL program type test samples (22 files, 7 categories)
P5: parametrized/ test data generation engine
P6: japanese_data.py lookup tables
P7-10: Type-specific test suites (~159 parametrized tests)
P11: Full classification pipeline (classify_program) + orchestrator integration
P12: Documentation (module-interfaces, test-plan v3.0, coverage-matrix)

Architecture decisions:
- classification_pipeline/ merged to hina/pipeline/
- parametrized/ as independent module
- japanese_data.py as root-level file
- hina/__all__ only exports classify_program()

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
hangshuo652
2026-06-19 23:51:55 +08:00
parent 63b5284715
commit bc1d56d1a4
129 changed files with 19378 additions and 261 deletions
+234
View File
@@ -0,0 +1,234 @@
"""Japanese test data generation lookup tables and helper functions.
Provides constants and generators for creating Japanese-language test data
including fullwidth/halfwidth characters, Shift-JIS encoding edge cases,
wareki (Japanese era) dates, and encoding round-trip test data.
"""
from __future__ import annotations
import random
from datetime import date, timedelta
# ── Lookup table constants ──
FULLWIDTH_KATAKANA = "アイウエオカキクケコサシスセソタチツテトナニヌネノハヒフヘホマミムメモヤユヨラリルレロワヲン"
FULLWIDTH_HIRAGANA = "あいうえおかきくけこさしすせそたちつてとなにぬねのはひふへほまみむめもやゆよらりるれろわをん"

# The three tables below consist of Unicode compatibility characters
# (Halfwidth and Fullwidth Forms block).  Any tool that applies NFKC
# normalisation silently folds such literals into ASCII / fullwidth kana,
# so they are built from code points instead of being typed literally.
# U+FF10..U+FF19: FULLWIDTH DIGIT ZERO..NINE
FULLWIDTH_DIGITS = "".join(chr(cp) for cp in range(0xFF10, 0xFF1A))
# U+FF21..U+FF3A: FULLWIDTH LATIN CAPITAL LETTER A..Z
FULLWIDTH_ALPHA = "".join(chr(cp) for cp in range(0xFF21, 0xFF3B))
# U+FF71..U+FF9C (halfwidth A..WA) followed by U+FF66 (WO) and U+FF9D (N),
# i.e. the same gojuon order as FULLWIDTH_KATAKANA.
HALFWIDTH_KATAKANA = (
    "".join(chr(cp) for cp in range(0xFF71, 0xFF9D)) + "\uFF66\uFF9D"
)

# Characters whose second Shift-JIS byte is 0x5C (ASCII backslash).
# Decoded from their byte pairs so the trailing byte is correct by
# construction.  Lead bytes correspond to: ソ 表 能 予 十 申
_SJIS_5C_LEAD_BYTES = (0x83, 0x95, 0x94, 0x97, 0x8F, 0x90)
SJIS_5C_PROBLEM = [
    bytes((lead, 0x5C)).decode("shift_jis") for lead in _SJIS_5C_LEAD_BYTES
]

# Characters whose second Shift-JIS byte is 0x7C (ASCII vertical bar).
# Lead bytes correspond to: ポ 芸 竹 倒 怖
_SJIS_7C_LEAD_BYTES = (0x83, 0x8C, 0x92, 0x93, 0x95)
SJIS_7C_PROBLEM = [
    bytes((lead, 0x7C)).decode("shift_jis") for lead in _SJIS_7C_LEAD_BYTES
]

# Era table, ordered newest → oldest.  Each row is
#   (era name, Gregorian year of era year 1, first day as wareki,
#    last day as Gregorian "YYYY/MM/DD", last day as wareki)
# The ongoing era has None for both "last day" fields.
# NOTE(review): the Meiji start uses the Gregorian equivalent of the
# retroactive era start (1868-01-25); Japan used a lunisolar calendar
# until 1873 — confirm this is the convention consumers expect.
WAREKI_BOUNDARIES = [
    ("令和", 2019, "R010501", None, None),
    ("平成", 1989, "H010108", "2019/04/30", "H310430"),
    ("昭和", 1926, "S011225", "1989/01/07", "S640107"),
    ("大正", 1912, "T010730", "1926/12/25", "T151225"),
    ("明治", 1868, "M010125", "1912/07/30", "M450730"),
]
# ── Helper: simulate COBOL PIC clause field for length ──
def _field_length(field: dict) -> int:
    """Return the storage length described by a PIC field definition dict.

    Lookup order:
      1. A non-empty ``field["pic_info"]``: its positive ``length`` if any,
         otherwise ``digits + decimal`` (which may be 0 — callers substitute
         their own default for non-positive results).
      2. A field-level ``length`` key.
      3. A field-level ``digits`` key, plus ``decimal``.
      4. A fallback of 10 when none of the above is present.

    A value stored as ``None`` is treated as 0 instead of raising
    ``TypeError`` on comparison or addition.
    """
    pic_info = field.get("pic_info")
    if pic_info:
        explicit = pic_info.get("length") or 0
        if explicit > 0:
            return explicit
        return (pic_info.get("digits") or 0) + (pic_info.get("decimal") or 0)
    if "length" in field:
        return field["length"] or 0
    if "digits" in field:
        return (field["digits"] or 0) + (field.get("decimal") or 0)
    return 10  # fallback when the dict carries no size information
# ── Generation functions ──
def generate_fullwidth_text(field: dict) -> str:
    """Build a random katakana string sized for the given field.

    The number of characters is ``_field_length(field)``; a non-positive
    length is replaced by 10.  Each position is drawn independently from
    FULLWIDTH_KATAKANA.
    """
    count = _field_length(field)
    if count <= 0:
        count = 10
    picked = [random.choice(FULLWIDTH_KATAKANA) for _ in range(count)]
    return "".join(picked)
def generate_halfwidth_katakana(field: dict) -> str:
    """Build a random string from HALFWIDTH_KATAKANA sized for the field.

    The number of characters is ``_field_length(field)``; a non-positive
    length is replaced by 10.  Each position is drawn independently from
    HALFWIDTH_KATAKANA.
    """
    count = _field_length(field)
    if count <= 0:
        count = 10
    picked = [random.choice(HALFWIDTH_KATAKANA) for _ in range(count)]
    return "".join(picked)
def generate_sjis_5c_problem(field: dict) -> str:
    """Build a string made only of entries from SJIS_5C_PROBLEM.

    The table is meant to hold characters whose second Shift-JIS byte is
    0x5C (the ASCII backslash), which naive byte-oriented code can mistake
    for an escape character.

    One entry is drawn per position; the number of draws is
    ``_field_length(field)``, or 6 when that length is not positive.
    """
    measured = _field_length(field)
    draws = measured if measured > 0 else 6
    return "".join(random.choice(SJIS_5C_PROBLEM) for _ in range(draws))
def generate_sjis_7c_problem(field: dict) -> str:
    """Build a string made only of entries from SJIS_7C_PROBLEM.

    The table is meant to hold characters whose second Shift-JIS byte is
    0x7C (the ASCII vertical bar), which naive byte-oriented code can
    mistake for a field separator.

    One entry is drawn per position; the number of draws is
    ``_field_length(field)``, or 5 when that length is not positive.
    """
    measured = _field_length(field)
    draws = measured if measured > 0 else 5
    return "".join(random.choice(SJIS_7C_PROBLEM) for _ in range(draws))
# First and last (era year, month, day) that generate_wareki_date may emit
# for each era letter.  The lower bound is the era's first day; the upper
# bound is the era's last day when the era is shorter than the 30-year
# sampling window (Taisho), otherwise the window's end (year 30, Dec 28).
# NOTE(review): the Meiji lower bound uses the Gregorian equivalent of the
# retroactive era start (1868-01-25) — confirm the expected convention.
_WAREKI_VALID_RANGE = {
    "R": ((1, 5, 1), (30, 12, 28)),
    "H": ((1, 1, 8), (30, 12, 28)),
    "S": ((1, 12, 25), (30, 12, 28)),
    "T": ((1, 7, 30), (15, 12, 25)),
    "M": ((1, 1, 25), (30, 12, 28)),
}


def generate_wareki_date(wareki_type: str = "R") -> str:
    """Generate a random Japanese era (wareki) date string.

    Args:
        wareki_type: Era prefix letter:
            "R" = Reiwa (令和), "H" = Heisei (平成),
            "S" = Showa (昭和), "T" = Taisho (大正),
            "M" = Meiji (明治).
            Any other value is treated as "R".

    Returns:
        A 7-character string ``<letter><YY><MM><DD>``, e.g. "R050101"
        (Reiwa 5, Jan 1).  Year, month and day are zero-padded to 2 digits.
        The date always lies inside the era: the year never exceeds the
        era's length (at most 30), and a draw that falls before the era's
        first day or after its last day is clamped to that boundary.
    """
    if wareki_type not in _WAREKI_VALID_RANGE:
        wareki_type = "R"
    earliest, latest = _WAREKI_VALID_RANGE[wareki_type]

    # Day is capped at 28 so every (month, day) pair is a real calendar day.
    drawn = (
        random.randint(earliest[0], latest[0]),
        random.randint(1, 12),
        random.randint(1, 28),
    )
    # Tuple comparison is chronological, so clamping keeps the result inside
    # the era without re-drawing.
    year, month, day = min(max(drawn, earliest), latest)
    return f"{wareki_type}{year:02d}{month:02d}{day:02d}"
def generate_wareki_boundary(era: str = "平成") -> tuple[str, str]:
    """Return the pair of wareki dates that straddle the start of an era.

    Args:
        era: Era name in Japanese: "令和", "平成", "昭和", "大正", "明治".

    Returns:
        Tuple of (last_day_of_previous_era, first_day_of_requested_era),
        e.g. for "平成": ("S640107", "H010108").  An unknown era, or an era
        with no predecessor in WAREKI_BOUNDARIES (the oldest row), falls
        back to the Heisei boundary.
    """
    # Row layout: (name, start_year, start_wareki, end_gregorian, end_wareki).
    # Sort newest → oldest so each row's predecessor is the row after it,
    # independent of how the table happens to be ordered.
    rows = sorted(WAREKI_BOUNDARIES, key=lambda row: row[1], reverse=True)
    boundaries = {
        newer[0]: (older[4], newer[2])
        for newer, older in zip(rows, rows[1:])
        if older[4] and newer[2]
    }
    if era not in boundaries:
        # Default to Heisei boundary
        era = "平成"
    return boundaries[era]
def generate_encoding_test_data(from_enc: str = "shift_jis", to_enc: str = "utf-8") -> tuple[bytes, bytes]:
    """Encode a fixed Japanese sample in two encodings for comparison.

    The sample is encoded with ``from_enc``, decoded again with ``from_enc``,
    and the decoded text is encoded with ``to_enc``.  Every step uses
    ``errors="replace"``, so unrepresentable characters are substituted
    rather than raising.

    Args:
        from_enc: Source encoding name (default: "shift_jis")
        to_enc: Target encoding name (default: "utf-8")

    Returns:
        Tuple of (source_bytes, target_bytes).
    """
    sample = "あいうえおアイウエオ亜唖娃阿"
    as_source = sample.encode(from_enc, errors="replace")
    round_tripped = as_source.decode(from_enc, errors="replace")
    as_target = round_tripped.encode(to_enc, errors="replace")
    return (as_source, as_target)
def generate_encoding_test_data_bytes(text: str = None, from_enc: str = "shift_jis", to_enc: str = "utf-8") -> tuple[bytes, bytes]:
    """Encode caller-supplied text in two encodings for comparison.

    The text is encoded with ``from_enc``, decoded again with ``from_enc``,
    and the decoded text is encoded with ``to_enc``.  Every step uses
    ``errors="replace"``.

    Args:
        text: Source text to encode; when None, a built-in Japanese test
            phrase is used
        from_enc: Source encoding name
        to_enc: Target encoding name

    Returns:
        Tuple of (source_bytes, target_bytes)
    """
    payload = "あいうえおアイウエオ亜唖娃阿" if text is None else text
    as_source = payload.encode(from_enc, errors="replace")
    as_target = as_source.decode(from_enc, errors="replace").encode(to_enc, errors="replace")
    return (as_source, as_target)
def select_data_type(field: dict) -> str:
    """Select the test-data category label for a field.

    Args:
        field: Field definition dict; ``field["pic_info"]`` may carry
            ``type`` and ``usage``.  A missing or ``None`` ``pic_info`` (or
            ``usage``) is treated as empty.

    Returns:
        "japanese"  when type is "national" or usage is NATIONAL,
        "numeric"   when type is "numeric", "numeric_edited" or
                    "numeric_float", or usage contains "COMP",
        "halfwidth" for everything else (alphanumeric, alphabetic, unknown).
        Usage is compared case-insensitively.
    """
    # `or {}` also covers a key that is present but set to None, which
    # `.get("pic_info", {})` would pass through.
    pic_info = field.get("pic_info") or {}
    pic_type = pic_info.get("type", "unknown")
    usage = (pic_info.get("usage") or "").upper()

    # PIC N (national) or USAGE NATIONAL → Japanese fullwidth
    if pic_type == "national" or usage == "NATIONAL":
        return "japanese"
    # PIC 9 or a COMP-family usage → numeric
    if pic_type in ("numeric", "numeric_edited", "numeric_float") or "COMP" in usage:
        return "numeric"
    # Alphanumeric, alphabetic and unrecognised types all get halfwidth data.
    return "halfwidth"