# cobol-java-v3/parametrized/division.py

"""分割系测试数据生成模块。

提供 generate_division_data() 用于生成按比例/规则分割到多个文件的测试数据，
模拟 COBOL SORT 或 OUTPUT 分件场景。
"""

from __future__ import annotations

import math
from typing import Any


def generate_division_data(
    division_type: int = 50,
    record_count: int = 50,
) -> list[list[dict[str, Any]]]:
    """Generate test records split across one or more output files.

    The records are numbered consecutively from 1 to ``record_count`` and
    handed out to the files in order, so file 1 holds the lowest sequence
    numbers and the last file holds the highest.

    Every file except the last receives ``record_count // n_files`` records
    (floor of its nominal share). The last file receives whatever is left,
    so when ``record_count`` is not divisible by the number of files the
    last file is larger than the others, and leading files may be empty
    (e.g. 7 records in quarter mode are split 1 / 1 / 1 / 4).

    Parameters
    ----------
    division_type : int
        Split mode:
          - 50   halves   -> 2 files, nominally 50% each
          - 25   quarters -> 4 files, nominally 25% each
          - 100  no split -> 1 file holding every record
    record_count : int
        Total number of records to distribute over the files.

    Returns
    -------
    list[list[dict]]
        One inner list per file: ``[[file 1 records], [file 2 records], ...]``.
        Each record is a dict with the keys ``KEY`` (``"DIV-"`` plus the
        sequence number zero-padded to 4 digits), ``DATA``
        (``"div_data_<seq>"``), ``FILE_NO`` (1-based file index) and ``SEQ``
        (1-based sequence number, continuous across files).

    Raises
    ------
    ValueError
        If ``division_type`` is not 50, 25 or 100, or if ``record_count``
        is less than 1.
    """
    if division_type not in (50, 25, 100):
        raise ValueError(f"division_type 必须为 50 / 25 / 100，收到 {division_type}")
    if record_count < 1:
        raise ValueError(f"record_count 必须 >= 1，收到 {record_count}")

    # Number of output files for each supported split mode.
    n_files = {100: 1, 50: 2, 25: 4}[division_type]

    # Exact integer floor of one file's nominal share; avoids going through
    # floating point for what is purely an integer split.
    base_share = record_count // n_files

    result: list[list[dict[str, Any]]] = []
    allocated = 0

    for file_no in range(1, n_files + 1):
        # The last file absorbs the remainder so the total always matches
        # record_count exactly.
        if file_no == n_files:
            file_count = record_count - allocated
        else:
            file_count = base_share

        result.append([
            {
                "KEY": f"DIV-{seq:04d}",
                "DATA": f"div_data_{seq}",
                "FILE_NO": file_no,
                "SEQ": seq,
            }
            for seq in range(allocated + 1, allocated + file_count + 1)
        ])
        allocated += file_count

    return result