feat: Phase 2 complete — 13 Phases of COBOL type classification and test benchmark

P0.6: gcov infrastructure
P1: extract_structure output expansion (11 new feature fields)
P2: Confusion group rule engine (8 pairs + contradiction + backtrack)
P3: 4-factor confidence calculation + quality gate update
P4: 33+2 COBOL program type test samples (22 files, 7 categories)
P5: parametrized/ test data generation engine
P6: japanese_data.py lookup tables
P7-10: Type-specific test suites (~159 parametrized tests)
P11: Full classification pipeline (classify_program) + orchestrator integration
P12: Documentation (module-interfaces, test-plan v3.0, coverage-matrix)

Architecture decisions:
- classification_pipeline/ merged to hina/pipeline/
- parametrized/ as independent module
- japanese_data.py as root-level file
- hina/__all__ only exports classify_program()

Co-Authored-By: Claude <noreply@anthropic.com>
2026-06-19 23:51:55 +08:00
parent 63b5284715
commit bc1d56d1a4
129 changed files with 19378 additions and 261 deletions
@@ -0,0 +1,9 @@
+# Names re-exported as the public API; each one is imported from a
+# submodule further down.
+__all__ = [
+    "generate_matching_data",
+    "generate_keybreak_data",
+    "generate_division_data",
+    "generate_zero_byte_file",
+    "generate_boundary_values",
+    "generate_minimal_records",
+    "generate_sorted_records",
+    "generate_duplicate_keys",
+]
+from .matching import (
+    generate_matching_data,
+    generate_keybreak_data,
+)
+from .division import generate_division_data
+from .common import (
+    generate_zero_byte_file,
+    generate_boundary_values,
+    generate_minimal_records,
+    generate_sorted_records,
+    generate_duplicate_keys,
+)
@@ -0,0 +1,275 @@
+"""通用测试数据生成工具函数模块。
+"""
+
+from __future__ import annotations
+
+import pathlib
+import re
+from typing import Any
+
+
+def generate_zero_byte_file(path: str) -> None:
+    """Create an empty (0-byte) file at ``path``.
+
+    Missing parent directories are created first.  A file that already
+    exists at ``path`` is truncated to zero length.
+
+    Parameters
+    ----------
+    path : str
+        Location of the empty file to create.
+    """
+    target = pathlib.Path(path)
+    target.parent.mkdir(parents=True, exist_ok=True)
+    # Opening in binary write mode creates or truncates the file; nothing
+    # is written, so it ends up empty.
+    with target.open("wb"):
+        pass
+
+
+def generate_minimal_records(fields: list[dict[str, Any]]) -> list[dict[str, Any]]:
+    """Build exactly one well-formed record for the given field definitions.
+
+    Parameters
+    ----------
+    fields : list[dict]
+        Field definitions.  Every key is optional:
+          - "name"    : field name; falls back to "FIELD_{i}", where i is the
+                        position of the definition in ``fields``
+          - "type"    : "numeric", "date", or anything else (treated as string)
+          - "length"  : repeat count of the string filler, default 10
+          - "default" : value used verbatim; takes precedence over "type"
+
+    Returns
+    -------
+    list[dict]
+        A one-element list holding the record.  An empty ``fields`` yields
+        ``[{}]``.
+    """
+    if not fields:
+        return [{}]
+
+    def placeholder(spec: dict[str, Any]) -> Any:
+        # An explicit default always wins over the type-derived filler.
+        if "default" in spec:
+            return spec["default"]
+        kind = spec.get("type", "string")
+        if kind == "numeric":
+            return 0
+        if kind == "date":
+            return "0001-01-01"
+        # Every other type is handled as a string of repeated "A".
+        return "A" * spec.get("length", 10)
+
+    return [
+        {
+            spec.get("name", f"FIELD_{pos}"): placeholder(spec)
+            for pos, spec in enumerate(fields)
+        }
+    ]
+
+
+# One picture symbol with an optional repeat count, e.g. "9(7)", "X" or "A(3)".
+# Matching the count together with its symbol keeps the digits inside the
+# parentheses from being mistaken for picture symbols.
+_PIC_SYMBOL = re.compile(r'([A-Z0-9])(?:\((\d+)\))?')
+
+
+def _count_positions(segment: str, symbols: str) -> int:
+    """Count the character positions that ``symbols`` occupy in a PIC segment."""
+    return sum(
+        int(repeat) if repeat else 1
+        for symbol, repeat in _PIC_SYMBOL.findall(segment)
+        if symbol in symbols
+    )
+
+
+def _parse_pic(pic: str) -> dict[str, Any]:
+    """Parse a COBOL PIC clause into its type, digit counts and sign.
+
+    Symbols may be written with a repeat count, repeated literally, or a mix
+    of both.  Examples:
+      - S9(7)V99    signed, 7 integer digits, 2 decimal digits
+      - 9(4)        unsigned, 4 integer digits
+      - 9(4)V9(2)   unsigned, 4 integer digits, 2 decimal digits
+      - 999         unsigned, 3 integer digits
+      - X(10)       string of 10 characters
+      - XXX         string of 3 characters
+
+    Parameters
+    ----------
+    pic : str
+        PIC string such as "S9(7)V99".  Surrounding whitespace and letter
+        case are ignored.
+
+    Returns
+    -------
+    dict
+        Keys "type" ("string", "numeric" or "unknown"), "digits", "decimal",
+        "signed" and "total_digits".  For strings "digits" holds the
+        character length and "total_digits" stays 0.
+    """
+    pic = pic.strip().upper()
+    result: dict[str, Any] = {
+        "type": "unknown",
+        "digits": 0,
+        "decimal": 0,
+        "signed": False,
+        "total_digits": 0,
+    }
+
+    # String pictures: every X / A / 9 position contributes to the length.
+    if pic.startswith(("X", "A")):
+        result["type"] = "string"
+        result["digits"] = _count_positions(pic, "XA9")
+        return result
+
+    # Numeric pictures: an optional leading S, then 9s split by the implied
+    # decimal point V.
+    if "9" in pic or pic.startswith("S"):
+        result["type"] = "numeric"
+        result["signed"] = pic.startswith("S")
+        body = pic[1:] if result["signed"] else pic
+
+        integer_part, _, fraction_part = body.partition("V")
+        result["digits"] = _count_positions(integer_part, "9")
+        result["decimal"] = _count_positions(fraction_part, "9")
+        result["total_digits"] = result["digits"] + result["decimal"]
+        return result
+
+    return result
+
+
+def generate_boundary_values(pic: str) -> dict[str, Any]:
+    """Derive boundary test values from a COBOL PIC clause.
+
+    Parameters
+    ----------
+    pic : str
+        PIC string such as "S9(7)V99".
+
+    Returns
+    -------
+    dict
+        {
+          "max": largest value of the picture,
+          "min": smallest value (negated max when signed, otherwise 0;
+                 for strings "A" padded with spaces),
+          "overflow": a value that no longer fits the picture,
+          "zero": zero value (spaces for strings),
+          "pic_info": the parsed PIC information,
+        }
+        Numeric pictures without decimal places yield ints; pictures with
+        decimal places yield floats.  All four values are None when the
+        picture is neither string nor numeric.
+    """
+    info = _parse_pic(pic)
+
+    if info["type"] == "string":
+        length = info["digits"]
+        return {
+            "max": "X" * length,
+            "min": "" if length == 0 else "A" + " " * (length - 1),
+            "overflow": "X" * (length + 1) if length > 0 else "X",
+            "zero": "" if length == 0 else " " * length,
+            "pic_info": info,
+        }
+
+    if info["type"] == "numeric":
+        decimal = info["decimal"]
+        total_digits = info.get("total_digits", info["digits"] + decimal)
+
+        # Unscaled boundaries: all nines, and a power of ten two digits
+        # longer than the picture.
+        largest = 10 ** total_digits - 1
+        beyond = 10 ** (total_digits + 1)
+
+        if decimal > 0:
+            scale = 10 ** decimal
+            max_val = largest / scale
+            overflow_val = beyond / scale
+        else:
+            # Keep integer pictures as exact ints.  Float division would
+            # round e.g. 9(18) up to 1e18, which is no longer an 18-digit
+            # value and therefore not a valid maximum.
+            max_val = largest
+            overflow_val = beyond
+
+        return {
+            "max": max_val,
+            "min": -max_val if info["signed"] else 0,
+            "overflow": overflow_val,
+            "zero": 0.0 if decimal > 0 else 0,
+            "pic_info": info,
+        }
+
+    return {
+        "max": None,
+        "min": None,
+        "overflow": None,
+        "zero": None,
+        "pic_info": info,
+    }
+
+
+def generate_sorted_records(
+    record_count: int = 10,
+    key_field: str = "KEY",
+) -> list[dict[str, Any]]:
+    """Generate records whose key values are in ascending order.
+
+    Parameters
+    ----------
+    record_count : int
+        Number of records to generate, default 10.
+    key_field : str
+        Name of the key field, default "KEY".
+
+    Returns
+    -------
+    list[dict]
+        Records with ``key_field`` ("KEY-0000", "KEY-0001", ...), "DATA" and
+        a 1-based "SEQ".
+
+    Raises
+    ------
+    ValueError
+        If ``record_count`` is smaller than 1.
+    """
+    if record_count < 1:
+        raise ValueError(f"record_count 必须 >= 1，收到 {record_count}")
+
+    # Keys are compared as strings, so every key needs the same width:
+    # with a fixed width of 4, "KEY-10000" would sort before "KEY-9999".
+    # The width only grows beyond 4 for more than 10000 records.
+    width = max(4, len(str(record_count - 1)))
+
+    return [
+        {
+            key_field: f"KEY-{i:0{width}d}",
+            "DATA": f"sorted_data_{i}",
+            "SEQ": i + 1,
+        }
+        for i in range(record_count)
+    ]
+
+
+def generate_duplicate_keys(
+    records: list[dict[str, Any]],
+    key_field: str = "KEY",
+) -> list[dict[str, Any]]:
+    """Return ``records`` followed by one same-key duplicate of each record.
+
+    Useful for exercising duplicate-key handling (e.g. SORT MERGE or
+    de-duplication checks).  The input list and its records are not modified.
+
+    Parameters
+    ----------
+    records : list[dict]
+        Original records.
+    key_field : str
+        Field that carries the duplicated key, default "KEY".
+
+    Returns
+    -------
+    list[dict]
+        The original records, then their duplicates in the same order.  Each
+        duplicate has "_DUP" appended to "DATA" and 10000 added to "SEQ".
+        A falsy ``records`` yields ``[]``.
+
+    Raises
+    ------
+    KeyError
+        If a record has no ``key_field`` entry.
+    """
+    if not records:
+        return []
+
+    clones = [
+        {
+            **original,
+            # Reading the key explicitly makes a record without it fail loudly.
+            key_field: original[key_field],
+            "DATA": original.get("DATA", "") + "_DUP",
+            "SEQ": original.get("SEQ", 0) + 10000,
+        }
+        for original in records
+    ]
+    return records + clones
@@ -0,0 +1,80 @@
+"""分割系测试数据生成模块。
+
+提供 generate_division_data() 用于生成按比例/规则分割到多个文件的测试数据，
+模拟 COBOL SORT 或 OUTPUT 分件场景。
+"""
+
+from __future__ import annotations
+
+import math
+from typing import Any
+
+
+def generate_division_data(
+    division_type: int = 50,
+    record_count: int = 50,
+) -> list[list[dict[str, Any]]]:
+    """Generate records split across one or more output files.
+
+    Parameters
+    ----------
+    division_type : int
+        How the records are divided:
+          - 50   two files, 50% each
+          - 25   four files, 25% each
+          - 100  a single file holding everything
+    record_count : int
+        Total number of records distributed over the files.
+
+    Returns
+    -------
+    list[list[dict]]
+        One record list per file.  Every record has "KEY", "DATA",
+        "FILE_NO" (1-based) and "SEQ" (1-based, continuous across files).
+        Each file but the last receives the rounded-down share; the last
+        file receives whatever remains.
+
+    Raises
+    ------
+    ValueError
+        If ``division_type`` is not 50, 25 or 100, or ``record_count`` < 1.
+    """
+    if division_type not in (50, 25, 100):
+        raise ValueError(f"division_type 必须为 50 / 25 / 100，收到 {division_type}")
+    if record_count < 1:
+        raise ValueError(f"record_count 必须 >= 1，收到 {record_count}")
+
+    file_total = {100: 1, 50: 2, 25: 4}[division_type]
+    # Rounded-down share of a single file; identical for every file because
+    # all supported splits are even.
+    per_file = max(0, int(math.floor(record_count * (1 / file_total))))
+
+    files: list[list[dict[str, Any]]] = []
+    assigned = 0
+    for file_no in range(1, file_total + 1):
+        remaining = record_count - assigned
+        # The last file absorbs the remainder; earlier files never take more
+        # than what is still left.
+        size = remaining if file_no == file_total else min(per_file, remaining)
+
+        first_seq = assigned + 1
+        files.append([
+            {
+                "KEY": f"DIV-{seq:04d}",
+                "DATA": f"div_data_{seq}",
+                "FILE_NO": file_no,
+                "SEQ": seq,
+            }
+            for seq in range(first_seq, first_seq + size)
+        ])
+        assigned += size
+
+    return files
@@ -0,0 +1,194 @@
+"""匹配系测试数据生成模块。
+
+提供两种生成器：
+  - generate_matching_data()  — 生成主/从匹配测试数据
+  - generate_keybreak_data()  — 生成 KEY 切中断测试数据
+"""
+
+from __future__ import annotations
+
+import random
+from typing import Any
+
+
+def _scaled_count(total: int, ratio: float) -> int:
+    """Return ``total * ratio`` rounded down, tolerating binary float error."""
+    # int(100 * 0.29) is 28 because the product is 28.999999999999996; the
+    # small epsilon restores the intended 29 without rounding genuine
+    # fractions (e.g. 2.5) upwards.
+    return min(total, int(total * ratio + 1e-9))
+
+
+def _matched_sub(main_index: int, sub_index: int) -> dict[str, Any]:
+    """Build a sub-file record carrying the key of main record ``main_index``."""
+    return {
+        "KEY": f"MAIN-{main_index:04d}",
+        "DATA": f"sub_data_{sub_index}",
+        "SEQ": sub_index + 1,
+    }
+
+
+def _unmatched_sub(number: int, seq: int) -> dict[str, Any]:
+    """Build a sub-file record whose key exists in no main record."""
+    return {
+        "KEY": f"UNMATCHED-SUB-{number:04d}",
+        "DATA": f"sub_unmatched_{number}",
+        "SEQ": seq,
+    }
+
+
+def generate_matching_data(
+    matching_type: str = "1:1",
+    record_count_r01: int = 10,
+    record_count_r02: int = 10,
+    key_match_ratio: float = 1.0,
+) -> tuple[list[dict[str, Any]], list[dict[str, Any]]]:
+    """Generate main-file / sub-file test data for key matching.
+
+    The main file always holds unique keys "MAIN-0000", "MAIN-0001", ...
+    The sub file always holds exactly ``record_count_r02`` records; those
+    that do not match carry "UNMATCHED-SUB-nnnn" keys.
+
+    Parameters
+    ----------
+    matching_type : str
+        Matching mode:
+          - "1:1"  the first ``min(r01, r02) * ratio`` sub records each match
+                   a distinct main record
+          - "1:N"  the first ``r01 * ratio`` main records each receive
+                   ``r02 // matched_mains`` (at least 1) sub records; sub
+                   records left over after that are unmatched
+          - "N:1"  the first ``r02 * ratio`` sub records cycle through the
+                   main keys; with an empty main file nothing can match
+    record_count_r01 : int
+        Number of main-file (R01) records.
+    record_count_r02 : int
+        Number of sub-file (R02) records.
+    key_match_ratio : float
+        Share of matching keys, between 0.0 (none) and 1.0 (all).  Products
+        are rounded down.
+
+    Returns
+    -------
+    tuple[list[dict], list[dict]]
+        (main records, sub records); every record has "KEY", "DATA", "SEQ".
+
+    Raises
+    ------
+    ValueError
+        For an unknown ``matching_type``, a ratio outside 0.0-1.0 or a
+        negative record count.
+    """
+    if matching_type not in ("1:1", "1:N", "N:1"):
+        raise ValueError(f"不支持的 matching_type：{matching_type!r}，应为 '1:1' / '1:N' / 'N:1'")
+    if not 0.0 <= key_match_ratio <= 1.0:
+        raise ValueError(f"key_match_ratio 必须在 0.0~1.0 之间，收到 {key_match_ratio}")
+    if record_count_r01 < 0 or record_count_r02 < 0:
+        raise ValueError("记录数不能为负数")
+
+    main_records: list[dict[str, Any]] = [
+        {
+            "KEY": f"MAIN-{i:04d}",
+            "DATA": f"main_data_{i}",
+            "SEQ": i + 1,
+        }
+        for i in range(record_count_r01)
+    ]
+    sub_records: list[dict[str, Any]] = []
+
+    if matching_type == "1:1":
+        match_count = _scaled_count(
+            min(record_count_r01, record_count_r02), key_match_ratio
+        )
+        for i in range(record_count_r02):
+            if i < match_count:
+                sub_records.append(_matched_sub(i, i))
+            else:
+                number = i - match_count
+                # Unmatched SEQ values continue after the main file's range.
+                sub_records.append(
+                    _unmatched_sub(number, record_count_r01 + number + 1)
+                )
+
+    elif matching_type == "1:N":
+        match_count = _scaled_count(record_count_r01, key_match_ratio)
+        per_main = max(1, record_count_r02 // max(1, match_count))
+        # Matched sub records come first, never exceeding the sub-file size.
+        matched_total = min(match_count * per_main, record_count_r02)
+        sub_records.extend(
+            _matched_sub(idx // per_main, idx) for idx in range(matched_total)
+        )
+        # Everything still missing to reach record_count_r02 is unmatched.
+        sub_records.extend(
+            _unmatched_sub(number, matched_total + number + 1)
+            for number in range(record_count_r02 - matched_total)
+        )
+
+    else:  # "N:1"
+        # Without main records there is no key to match against.
+        match_count = (
+            _scaled_count(record_count_r02, key_match_ratio)
+            if record_count_r01 > 0
+            else 0
+        )
+        for i in range(record_count_r02):
+            if i < match_count:
+                sub_records.append(_matched_sub(i % record_count_r01, i))
+            else:
+                sub_records.append(_unmatched_sub(i - match_count, i + 1))
+
+    return main_records, sub_records
+
+
+def generate_keybreak_data(
+    group_count: int = 3,
+    records_per_group: int = 2,
+    sum_type: str = "accumulate",
+) -> list[dict[str, Any]]:
+    """Generate key-break test data: the KEY changes from group to group.
+
+    All records of a group share one KEY.  The first 26 groups are labelled
+    "KEY-A" .. "KEY-Z"; later groups use their 0-based index ("KEY-26", ...).
+    Note that in ASCII order the numeric labels sort before the letter ones.
+
+    Parameters
+    ----------
+    group_count : int
+        Number of groups, default 3.
+    records_per_group : int
+        Records in each group, default 2.
+    sum_type : str
+        Shape of the FIELD value:
+          - "accumulate"  group number * 100 plus the 1-based position in
+                          the group
+          - "aggregate"   group number * 100 for every record of the group
+          - "mark"        "MARK-" followed by the group's KEY label
+
+    Returns
+    -------
+    list[dict]
+        Records with "KEY", "FIELD", "GROUP" (1-based) and "SEQ" (1-based,
+        continuous over all groups).
+
+    Raises
+    ------
+    ValueError
+        If a count is smaller than 1 or ``sum_type`` is unknown.
+    """
+    if group_count < 1:
+        raise ValueError(f"group_count 必须 >= 1，收到 {group_count}")
+    if records_per_group < 1:
+        raise ValueError(f"records_per_group 必须 >= 1，收到 {records_per_group}")
+    if sum_type not in ("accumulate", "aggregate", "mark"):
+        raise ValueError(f"不支持的 sum_type：{sum_type!r}")
+
+    records: list[dict[str, Any]] = []
+    seq = 0
+
+    for g in range(group_count):
+        # One label per group, shared by KEY and the mark value so both stay
+        # readable beyond 26 groups (chr(65 + g) would leave the alphabet).
+        label = chr(65 + g) if g < 26 else str(g)
+        group_key = f"KEY-{label}"
+        for r in range(records_per_group):
+            seq += 1
+            field_val: Any
+            if sum_type == "accumulate":
+                field_val = (g + 1) * 100 + r + 1
+            elif sum_type == "aggregate":
+                field_val = (g + 1) * 100
+            else:  # mark
+                field_val = f"MARK-{label}"
+
+            records.append({
+                "KEY": group_key,
+                "FIELD": field_val,
+                "GROUP": g + 1,
+                "SEQ": seq,
+            })
+
+    return records