feat: Phase 2 complete — 13 Phases of COBOL type classification and test benchmark

P0.6: gcov infrastructure
P1: extract_structure output expansion (11 new feature fields)
P2: Confusion group rule engine (8 pairs + contradiction + backtrack)
P3: 4-factor confidence calculation + quality gate update
P4: 33+2 COBOL program type test samples (22 files, 7 categories)
P5: parametrized/ test data generation engine
P6: japanese_data.py lookup tables
P7-10: Type-specific test suites (~159 parametrized tests)
P11: Full classification pipeline (classify_program) + orchestrator integration
P12: Documentation (module-interfaces, test-plan v3.0, coverage-matrix)

Architecture decisions:
- classification_pipeline/ merged to hina/pipeline/
- parametrized/ as independent module
- japanese_data.py as root-level file
- hina/__all__ only exports classify_program()

Co-Authored-By: Claude <noreply@anthropic.com>
2026-06-19 23:51:55 +08:00
parent 63b5284715
commit bc1d56d1a4
129 changed files with 19378 additions and 261 deletions
@@ -0,0 +1,234 @@
+"""Japanese test data generation lookup tables and helper functions.
+
+Provides constants and generators for creating Japanese-language test data
+including fullwidth/halfwidth characters, Shift-JIS encoding edge cases,
+wareki (Japanese era) dates, and encoding round-trip test data.
+"""
+
+from __future__ import annotations
+
+import random
+from datetime import date, timedelta
+
+# ── Lookup table constants ──
+
+# Basic gojuon syllabary (46 characters each) as fullwidth katakana / hiragana.
+FULLWIDTH_KATAKANA = "アイウエオカキクケコサシスセソタチツテトナニヌネノハヒフヘホマミムメモヤユヨラリルレロワヲン"
+FULLWIDTH_HIRAGANA = "あいうえおかきくけこさしすせそたちつてとなにぬねのはひふへほまみむめもやゆよらりるれろわをん"
+# Fullwidth (zenkaku) forms of the ASCII digits and uppercase letters.
+FULLWIDTH_DIGITS = "０１２３４５６７８９"
+FULLWIDTH_ALPHA = "ＡＢＣＤＥＦＧＨＩＪＫＬＭＮＯＰＱＲＳＴＵＶＷＸＹＺ"
+# Halfwidth (hankaku) katakana, in the same order as FULLWIDTH_KATAKANA.
+HALFWIDTH_KATAKANA = "ｱｲｳｴｵｶｷｸｹｺｻｼｽｾｿﾀﾁﾂﾃﾄﾅﾆﾇﾈﾉﾊﾋﾌﾍﾎﾏﾐﾑﾒﾓﾔﾕﾖﾗﾘﾙﾚﾛﾜｦﾝ"
+
+# Characters whose second Shift-JIS byte is 0x5C (ASCII backslash):
+# ソ=83 5C, 噂=89 5C, 能=94 5C, 表=95 5C, 予=97 5C, 十=8F 5C.
+SJIS_5C_PROBLEM = ["ソ", "噂", "能", "表", "予", "十"]
+# Characters whose second Shift-JIS byte is 0x7C (ASCII vertical bar):
+# ポ=83 7C, 芸=8C 7C, 竹=92 7C, 怖=95 7C, 掃=91 7C.
+SJIS_7C_PROBLEM = ["ポ", "芸", "竹", "怖", "掃"]
+
+# Japanese era table, newest era first. Each row is
+#   (era name, Gregorian year of era year 1, first-day code,
+#    last day as "YYYY/MM/DD", last-day code)
+# where a code is <era letter><2-digit era year><MM><DD>. The current era has
+# no last day, so its final two columns are None.
+# The day an emperor died belongs to both the old and the new era, which is why
+# 1912/07/30 and 1926/12/25 appear as both an end and a start.
+# Meiji's first-day code is the proclamation date in the lunisolar calendar
+# then in use (Meiji 1-09-08, i.e. 1868-10-23 Gregorian).
+WAREKI_BOUNDARIES = [
+    ("令和", 2019, "R010501", None, None),
+    ("平成", 1989, "H010108", "2019/04/30", "H310430"),
+    ("昭和", 1926, "S011225", "1989/01/07", "S640107"),
+    ("大正", 1912, "T010730", "1926/12/25", "T151225"),
+    ("明治", 1868, "M010908", "1912/07/30", "M450730"),
+]
+
+
+# ── Helper: derive the length of a COBOL PIC-clause field ──
+
+
+def _field_length(field: dict) -> int:
+    """Return the length recorded for a PIC field definition dict.
+
+    Lookup order:
+
+    1. A non-empty ``field["pic_info"]``: its positive ``length`` if present,
+       otherwise ``digits + decimal`` (which may be 0 when neither is set).
+    2. ``field["length"]``.
+    3. ``field["digits"] + field["decimal"]``.
+    4. A fallback of 10 when none of the above is available.
+
+    A key that is present but set to ``None`` is treated the same as a
+    missing key, so partially populated definitions do not raise.
+    """
+    pic_info = field.get("pic_info")
+    if pic_info:
+        explicit = pic_info.get("length") or 0
+        if explicit > 0:
+            return explicit
+        # No usable explicit length: derive it from the digit counts.
+        return (pic_info.get("digits") or 0) + (pic_info.get("decimal") or 0)
+
+    # pic_info missing or empty: fall back to keys on the field itself.
+    if field.get("length") is not None:
+        return field["length"]
+    if field.get("digits") is not None:
+        return field["digits"] + (field.get("decimal") or 0)
+    return 10  # nothing to go on
+
+
+# ── Generation functions ──
+
+
+def generate_fullwidth_text(field: dict) -> str:
+    """Build random fullwidth katakana text sized for a PIC N field.
+
+    The character count is whatever ``_field_length(field)`` reports; a
+    non-positive length falls back to 10 characters. Every character is
+    drawn independently from ``FULLWIDTH_KATAKANA``.
+    """
+    char_count = _field_length(field)
+    if char_count <= 0:
+        char_count = 10
+    picked = [random.choice(FULLWIDTH_KATAKANA) for _ in range(char_count)]
+    return "".join(picked)
+
+
+def generate_halfwidth_katakana(field: dict) -> str:
+    """Build random halfwidth katakana text sized for a PIC X field.
+
+    The character count is whatever ``_field_length(field)`` reports; a
+    non-positive length falls back to 10 characters. Every character is
+    drawn independently from ``HALFWIDTH_KATAKANA``. Halfwidth katakana
+    occupy one byte each in Shift-JIS, so the character count is also the
+    Shift-JIS byte count.
+    """
+    requested = _field_length(field)
+    size = requested if requested > 0 else 10
+    return "".join(random.choice(HALFWIDTH_KATAKANA) for _ in range(size))
+
+
+def generate_sjis_5c_problem(field: dict) -> str:
+    """Build a random string from the Shift-JIS 0x5C problem characters.
+
+    Characters are drawn independently from ``SJIS_5C_PROBLEM``, the module's
+    table of characters whose second Shift-JIS byte is meant to be 0x5C
+    (backslash) and can therefore be mistaken for an escape character.
+
+    The result has ``_field_length(field)`` characters, or 6 when that
+    length is not positive.
+    """
+    target = _field_length(field)
+    if target <= 0:
+        target = 6
+    return "".join([random.choice(SJIS_5C_PROBLEM) for _ in range(target)])
+
+
+def generate_sjis_7c_problem(field: dict) -> str:
+    """Build a random string from the Shift-JIS 0x7C problem characters.
+
+    Characters are drawn independently from ``SJIS_7C_PROBLEM``, the module's
+    table of characters whose second Shift-JIS byte is meant to be 0x7C
+    (pipe) and can therefore be mistaken for a field separator.
+
+    The result has ``_field_length(field)`` characters, or 5 when that
+    length is not positive.
+    """
+    target = _field_length(field)
+    if target <= 0:
+        target = 5
+    return "".join([random.choice(SJIS_7C_PROBLEM) for _ in range(target)])
+
+
+def generate_wareki_date(wareki_type: str = "R") -> str:
+    """Generate a random, valid Japanese era (wareki) date string.
+
+    A calendar day is picked uniformly from the era's real date range and
+    then expressed in era notation, so the result never names a day that
+    does not exist in that era (e.g. Taisho 30, or Heisei 1 January 3).
+
+    Args:
+        wareki_type: Era prefix letter:
+            "R" = Reiwa (令和), "H" = Heisei (平成),
+            "S" = Showa (昭和), "T" = Taisho (大正),
+            "M" = Meiji (明治).
+            Any other value is treated as "R".
+
+    Returns:
+        Wareki date string formatted as e.g. "R050101" (Reiwa 5, Jan 1):
+        era letter, 2-digit zero-padded era year, 2-digit month, 2-digit day.
+    """
+    # letter -> (Gregorian year of era year 1, first day, last day).
+    # The day of an imperial succession by death belongs to both eras, so
+    # 1912-07-30 and 1926-12-25 each appear in two ranges.
+    # Meiji starts at 1873-01-01 (Meiji 6): Japan used a lunisolar calendar
+    # before then, so earlier Gregorian days do not map onto wareki
+    # month/day values.
+    era_map = {
+        "R": (2019, date(2019, 5, 1), None),  # current era: open-ended
+        "H": (1989, date(1989, 1, 8), date(2019, 4, 30)),
+        "S": (1926, date(1926, 12, 25), date(1989, 1, 7)),
+        "T": (1912, date(1912, 7, 30), date(1926, 12, 25)),
+        "M": (1868, date(1873, 1, 1), date(1912, 7, 30)),
+    }
+    if wareki_type not in era_map:
+        wareki_type = "R"
+
+    base_year, first_day, last_day = era_map[wareki_type]
+    if last_day is None:
+        # Open-ended era: stop at today (guarding against a clock set
+        # before the era began).
+        last_day = max(date.today(), first_day)
+
+    picked = date.fromordinal(
+        random.randint(first_day.toordinal(), last_day.toordinal())
+    )
+    era_year = picked.year - base_year + 1
+    return f"{wareki_type}{era_year:02d}{picked.month:02d}{picked.day:02d}"
+
+
+def generate_wareki_boundary(era: str = "平成") -> tuple[str, str]:
+    """Return the pair of wareki date codes on either side of an era change.
+
+    Args:
+        era: Name of the era that begins at the boundary:
+            "令和", "平成", "昭和" or "大正". Any other value, including
+            "明治" (which has no predecessor in ``WAREKI_BOUNDARIES``),
+            falls back to "平成".
+
+    Returns:
+        Tuple of (end_date_of_previous_era, start_date_of_new_era),
+        e.g. for "平成": ("S640107", "H010108").
+    """
+    boundaries = {}
+    # WAREKI_BOUNDARIES is ordered newest-first, so the era preceding any row
+    # is the row right after it. Pair each row with its successor row.
+    for newer, older in zip(WAREKI_BOUNDARIES, WAREKI_BOUNDARIES[1:]):
+        name = newer[0]
+        new_start = newer[2]  # first-day code of the era that begins
+        prev_end = older[4]  # last-day code of the era that ends
+        if prev_end and new_start:
+            boundaries[name] = (prev_end, new_start)
+
+    if era not in boundaries:
+        # Default to the Showa -> Heisei boundary.
+        era = "平成"
+    return boundaries[era]
+
+
+def generate_encoding_test_data(from_enc: str = "shift_jis", to_enc: str = "utf-8") -> tuple[bytes, bytes]:
+    """Produce the same fixed Japanese sample in two encodings.
+
+    The sample is encoded with ``from_enc``, decoded back with ``from_enc``,
+    and the decoded text is re-encoded with ``to_enc``. All three steps use
+    ``errors="replace"``, so characters an encoding cannot represent are
+    substituted rather than raising.
+
+    Args:
+        from_enc: Source encoding name (default: "shift_jis")
+        to_enc: Target encoding name (default: "utf-8")
+
+    Returns:
+        Tuple of (source_bytes, target_bytes) for comparison.
+    """
+    sample = "あいうえおアイウエオ亜唖娃阿"
+    encoded_source = sample.encode(from_enc, errors="replace")
+    round_tripped = encoded_source.decode(from_enc, errors="replace")
+    return encoded_source, round_tripped.encode(to_enc, errors="replace")
+
+
+def generate_encoding_test_data_bytes(text: str | None = None, from_enc: str = "shift_jis", to_enc: str = "utf-8") -> tuple[bytes, bytes]:
+    """Produce the given text in two encodings.
+
+    The text is encoded with ``from_enc``, decoded back with ``from_enc``,
+    and the decoded text is re-encoded with ``to_enc``. All three steps use
+    ``errors="replace"``: a character ``from_enc`` cannot represent becomes
+    "?" in the source bytes, and that substitution carries through to the
+    target bytes.
+
+    Args:
+        text: Source text to encode; ``None`` selects the built-in Japanese
+            test phrase. An empty string is used as-is.
+        from_enc: Source encoding name
+        to_enc: Target encoding name
+
+    Returns:
+        Tuple of (source_bytes, target_bytes)
+    """
+    if text is None:
+        text = "あいうえおアイウエオ亜唖娃阿"
+    source_bytes = text.encode(from_enc, errors="replace")
+    # Decode what was actually written so any substitution made while
+    # encoding is reflected in the target bytes as well.
+    decoded = source_bytes.decode(from_enc, errors="replace")
+    target_bytes = decoded.encode(to_enc, errors="replace")
+    return source_bytes, target_bytes
+
+
+def select_data_type(field: dict) -> str:
+    """Select the test-data category label for a field.
+
+    Args:
+        field: Field definition dict. Its optional 'pic_info' dict is read
+            for 'type' and 'usage'; a missing, empty or ``None`` 'pic_info'
+            is treated as carrying no information.
+
+    Returns:
+        "japanese" when the type is "national" or the usage is NATIONAL;
+        "numeric" when the type is "numeric", "numeric_edited" or
+        "numeric_float", or the usage contains "COMP";
+        "halfwidth" for everything else (alphanumeric, alphabetic, and
+        unknown types).
+    """
+    pic_info = field.get("pic_info") or {}
+    pic_type = pic_info.get("type", "unknown")
+    # Usage may be absent or None; compare case-insensitively.
+    usage = (pic_info.get("usage") or "").upper()
+
+    # PIC N (national) or USAGE NATIONAL -> Japanese fullwidth
+    if pic_type == "national" or usage == "NATIONAL":
+        return "japanese"
+
+    # PIC 9 or a COMP-family usage -> numeric
+    if pic_type in ("numeric", "numeric_edited", "numeric_float") or "COMP" in usage:
+        return "numeric"
+
+    # PIC X / PIC A and anything unrecognised -> halfwidth katakana candidate
+    return "halfwidth"