"""Japanese test data generation lookup tables and helper functions. Provides constants and generators for creating Japanese-language test data including fullwidth/halfwidth characters, Shift-JIS encoding edge cases, wareki (Japanese era) dates, and encoding round-trip test data. """ from __future__ import annotations import random from datetime import date, timedelta # ── Lookup table constants ── FULLWIDTH_KATAKANA = "アイウエオカキクケコサシスセソタチツテトナニヌネノハヒフヘホマミムメモヤユヨラリルレロワヲン" FULLWIDTH_HIRAGANA = "あいうえおかきくけこさしすせそたちつてとなにぬねのはひふへほまみむめもやゆよらりるれろわをん" FULLWIDTH_DIGITS = "0123456789" FULLWIDTH_ALPHA = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" HALFWIDTH_KATAKANA = "アイウエオカキクケコサシスセソタチツテトナニヌネノハヒフヘホマミムメモヤユヨラリルレロワヲン" # Shift-JIS 第2字节 0x5C 问题文字 SJIS_5C_PROBLEM = ["ソ", "噂", "能", "刈", "搾", "汚"] # Shift-JIS 第2字节 0x7C 问题文字 SJIS_7C_PROBLEM = ["本", "問", "王", "歓", "輸"] WAREKI_BOUNDARIES = [ ("令和", 2019, "R010501", None, None), ("平成", 1989, "H010108", "2019/04/30", "H310430"), ("昭和", 1926, "S611231", "1989/01/07", "S640107"), ("大正", 1912, "T011231", "1926/12/25", "T151225"), ("明治", 1868, "M451229", "1912/01/29", "M450129"), ] # ── Helper: simulate COBOL PIC clause field for length ── def _field_length(field: dict) -> int: """Get the storage length from a PIC field definition dict.""" if "pic_info" in field and field["pic_info"]: pi = field["pic_info"] if pi.get("length", 0) > 0: return pi["length"] dig = pi.get("digits", 0) dec = pi.get("decimal", 0) return dig + dec if "length" in field: return field["length"] if "digits" in field: d = field["digits"] dec = field.get("decimal", 0) return d + dec return 10 # fallback # ── Generation functions ── def generate_fullwidth_text(field: dict) -> str: """Generate fullwidth text filling a PIC N field. Returns a string of fullwidth katakana characters padded to the field length. Each PIC N character is 2 bytes (fullwidth), so the number of characters equals the field length. """ length = _field_length(field) if length <= 0: length = 10 chars = list(FULLWIDTH_KATAKANA) return "".join(random.choice(chars) for _ in range(length)) def generate_halfwidth_katakana(field: dict) -> str: """Generate halfwidth katakana filling a PIC X field. Returns a string of halfwidth katakana characters to fit the field byte length. Halfwidth katakana are single-byte in Shift-JIS, so the character count equals the field length. """ length = _field_length(field) if length <= 0: length = 10 chars = list(HALFWIDTH_KATAKANA) return "".join(random.choice(chars) for _ in range(length)) def generate_sjis_5c_problem(field: dict) -> str: """Generate a string containing Shift-JIS 0x5C problem characters. These characters have 0x5C (backslash) as their second byte in Shift-JIS, which can be misinterpreted as an escape character. """ length = _field_length(field) if length <= 0: length = 6 result = [] chars = list(SJIS_5C_PROBLEM) while len(result) < length: result.append(random.choice(chars)) return "".join(result) def generate_sjis_7c_problem(field: dict) -> str: """Generate a string containing Shift-JIS 0x7C problem characters. These characters have 0x7C (pipe) as their second byte in Shift-JIS, which can be misinterpreted as a field separator. """ length = _field_length(field) if length <= 0: length = 5 result = [] chars = list(SJIS_7C_PROBLEM) while len(result) < length: result.append(random.choice(chars)) return "".join(result) def generate_wareki_date(wareki_type: str = "R") -> str: """Generate a Japanese era (wareki) date string. Args: wareki_type: Era prefix letter: "R" = Reiwa (令和), "H" = Heisei (平成), "S" = Showa (昭和), "T" = Taisho (大正), "M" = Meiji (明治) Returns: Wareki date string formatted as e.g. "R050101" (Reiwa 5, Jan 1). The year part is zero-padded to 2 digits (e.g. 01, 05, 12). """ era_map = { "R": ("令和", 2019), "H": ("平成", 1989), "S": ("昭和", 1926), "T": ("大正", 1912), "M": ("明治", 1868), } if wareki_type not in era_map: wareki_type = "R" era_name, base_year = era_map[wareki_type] # Generate a random date within the era's range (assuming at least 30 years) year_offset = random.randint(1, 30) month = random.randint(1, 12) day = random.randint(1, 28) return f"{wareki_type}{year_offset:02d}{month:02d}{day:02d}" def generate_wareki_boundary(era: str = "平成") -> tuple[str, str]: """Generate a pair of wareki date strings representing an era boundary. Args: era: Era name in Japanese: "令和", "平成", "昭和", "大正", "明治" Returns: Tuple of (end_date_of_previous_era, start_date_of_new_era), e.g. for "平成": ("S640107", "H010108") """ boundaries = {name: (prev_end, new_start) for name, _, prev_end, _, new_start in WAREKI_BOUNDARIES if prev_end and new_start} if era not in boundaries: # Default to Heisei boundary era = "平成" return boundaries[era] def generate_encoding_test_data(from_enc: str = "shift_jis", to_enc: str = "utf-8") -> tuple[bytes, bytes]: """Generate encoding test data with round-trip verification. Creates a known string, encodes it to the source encoding, decodes to the target encoding, and returns both for comparison. Args: from_enc: Source encoding name (default: "shift_jis") to_enc: Target encoding name (default: "utf-8") Returns: Tuple of (source_bytes, target_bytes) for comparison. """ test_string = "あいうえおアイウエオ亜唖娃阿" source_bytes = test_string.encode(from_enc, errors="replace") decoded = source_bytes.decode(from_enc, errors="replace") target_bytes = decoded.encode(to_enc, errors="replace") return source_bytes, target_bytes def generate_encoding_test_data_bytes(text: str = None, from_enc: str = "shift_jis", to_enc: str = "utf-8") -> tuple[bytes, bytes]: """Generate encoding test data from explicit text. Args: text: Source text to encode; defaults to Japanese test phrase from_enc: Source encoding name to_enc: Target encoding name Returns: Tuple of (source_bytes, target_bytes) """ if text is None: text = "あいうえおアイウエオ亜唖娃阿" source_bytes = text.encode(from_enc, errors="replace") decoded = source_bytes.decode(from_enc, errors="replace") target_bytes = decoded.encode(to_enc, errors="replace") return source_bytes, target_bytes def select_data_type(field: dict) -> str: """Select the appropriate data type label for a field. Examines the field definition and returns a label indicating the kind of test data to generate. Args: field: Field definition dict with 'pic_info' containing 'type' and 'usage' Returns: One of: "japanese", "numeric", "halfwidth" """ pi = field.get("pic_info", {}) typ = pi.get("type", "unknown") usage = pi.get("usage", "").upper() if pi.get("usage") else "" # PIC N (national) or USAGE NATIONAL → Japanese fullwidth if typ == "national" or usage == "NATIONAL": return "japanese" # PIC 9 or numeric usage → numeric if typ in ("numeric", "numeric_edited", "numeric_float") or "COMP" in usage: return "numeric" # PIC X with DISPLAY usage → halfwidth katakana candidate if typ == "alphanumeric" or typ == "alphabetic": return "halfwidth" return "halfwidth"