bc1d56d1a4
P0.6: gcov infrastructure P1: extract_structure output expansion (11 new feature fields) P2: Confusion group rule engine (8 pairs + contradiction + backtrack) P3: 4-factor confidence calculation + quality gate update P4: 33+2 COBOL program type test samples (22 files, 7 categories) P5: parametrized/ test data generation engine P6: japanese_data.py lookup tables P7-10: Type-specific test suites (~159 parametrized tests) P11: Full classification pipeline (classify_program) + orchestrator integration P12: Documentation (module-interfaces, test-plan v3.0, coverage-matrix) Architecture decisions: - classification_pipeline/ merged to hina/pipeline/ - parametrized/ as independent module - japanese_data.py as root-level file - hina/__all__ only exports classify_program() Co-Authored-By: Claude <noreply@anthropic.com>
81 lines
2.4 KiB
Python
81 lines
2.4 KiB
Python
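"""Test-data generation module for file-division (split) scenarios.

Provides generate_division_data(), which generates test data whose records are
split across multiple files by ratio/rule, simulating COBOL SORT or OUTPUT
file-splitting scenarios.
"""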
|
|
|
|
from __future__ import annotations
|
|
|
|
import math
|
|
from typing import Any
|
|
|
|
|
|
def generate_division_data(
    division_type: int = 50,
    record_count: int = 50,
) -> list[list[dict[str, Any]]]:
    """Generate test records split across one or more output files.

    The requested number of records is distributed over the files implied by
    ``division_type``. Every file except the last receives the floor of its
    equal share; the last file receives whatever is left, so the total always
    equals ``record_count``.

    Parameters
    ----------
    division_type : int
        How the records are divided:
        - 50   split in half      -> 2 files, 50% each
        - 25   split in quarters  -> 4 files, 25% each
        - 100  no split           -> 1 file, 100%
    record_count : int
        Total number of records to distribute across the files.

    Returns
    -------
    list[list[dict]]
        Records grouped by file: ``[[file 1 records], [file 2 records], ...]``.
        Each record has the keys ``KEY``, ``DATA``, ``FILE_NO`` (1-based file
        number) and ``SEQ`` (1-based sequence number that keeps counting
        across files).

    Raises
    ------
    ValueError
        If ``division_type`` is not 50, 25 or 100, or if ``record_count`` is
        less than 1.
    """
    if division_type not in (50, 25, 100):
        raise ValueError(f"division_type 必须为 50 / 25 / 100,收到 {division_type}")
    if record_count < 1:
        raise ValueError(f"record_count 必须 >= 1,收到 {record_count}")

    # (division_type, number of output files); each file gets an equal ratio.
    layouts = ((100, 1), (50, 2), (25, 4))
    file_total = next(
        (count for kind, count in layouts if division_type == kind),
        None,
    )
    if file_total is None:
        raise AssertionError("unreachable")
    share = 1.0 / file_total

    # All files but the last take the floored share; the last one absorbs the
    # remainder so no record is lost to rounding.
    sizes = [math.floor(record_count * share) for _ in range(file_total - 1)]
    sizes.append(record_count - sum(sizes))

    groups: list[list[dict[str, Any]]] = []
    emitted = 0  # number of records already placed in earlier files
    for file_number, size in enumerate(sizes, start=1):
        groups.append(
            [
                {
                    "KEY": f"DIV-{seq:04d}",
                    "DATA": f"div_data_{seq}",
                    "FILE_NO": file_number,
                    "SEQ": seq,
                }
                for seq in range(emitted + 1, emitted + size + 1)
            ]
        )
        emitted += size

    return groups
|