feat: Phase 2 complete — 13 Phases of COBOL type classification and test benchmark
P0.6: gcov infrastructure P1: extract_structure output expansion (11 new feature fields) P2: Confusion group rule engine (8 pairs + contradiction + backtrack) P3: 4-factor confidence calculation + quality gate update P4: 33+2 COBOL program type test samples (22 files, 7 categories) P5: parametrized/ test data generation engine P6: japanese_data.py lookup tables P7-10: Type-specific test suites (~159 parametrized tests) P11: Full classification pipeline (classify_program) + orchestrator integration P12: Documentation (module-interfaces, test-plan v3.0, coverage-matrix) Architecture decisions: - classification_pipeline/ merged to hina/pipeline/ - parametrized/ as independent module - japanese_data.py as root-level file - hina/__all__ only exports classify_program() Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,80 @@
|
||||
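"""Test-data generation module for division (file-splitting) scenarios.

Provides generate_division_data(), which generates test data that is split
across multiple files by ratio/rule, simulating COBOL SORT or OUTPUT
file-splitting scenarios.
"""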
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import math
|
||||
from typing import Any
|
||||
|
||||
|
||||
def generate_division_data(
    division_type: int = 50,
    record_count: int = 50,
) -> list[list[dict[str, Any]]]:
    """Generate division test data: records distributed over several files.

    The records are numbered 1..``record_count`` and handed out to the files
    in order, so the sequence numbers are contiguous across the whole result.

    Parameters
    ----------
    division_type : int
        How the records are divided:
        - 50   split in half   -> 2 files, 50% each
        - 25   split in four   -> 4 files, 25% each
        - 100  no split        -> 1 file, 100%
    record_count : int
        Total number of records to distribute over the files.

    Returns
    -------
    list[list[dict]]
        One inner list per file: ``[[file 1 records], [file 2 records], ...]``.
        Every record is a dict with the keys ``KEY`` (``"DIV-"`` + zero-padded
        sequence number), ``DATA`` (``"div_data_"`` + sequence number),
        ``FILE_NO`` (1-based file index) and ``SEQ`` (1-based sequence number).

    Raises
    ------
    ValueError
        If ``division_type`` is not 50, 25 or 100, or ``record_count`` < 1.

    Notes
    -----
    When ``record_count`` is not divisible by the number of files, every file
    except the last receives the floored equal share and the last file takes
    all remaining records (e.g. 7 records in four files -> 1, 1, 1, 4).
    """
    # Supported division types mapped to the number of output files.
    files_by_type = {50: 2, 25: 4, 100: 1}

    # Membership is tested against a tuple (equality based) so that any
    # unsupported value, hashable or not, is reported as ValueError.
    if division_type not in tuple(files_by_type):
        raise ValueError(f"division_type 必须为 50 / 25 / 100,收到 {division_type}")
    if record_count < 1:
        raise ValueError(f"record_count 必须 >= 1,收到 {record_count}")

    n_files = files_by_type[division_type]

    # Exact integer share for every file but the last; the last file absorbs
    # the remainder so the sizes always sum to record_count.
    share = record_count // n_files
    sizes = [share] * (n_files - 1)
    sizes.append(record_count - share * (n_files - 1))

    result: list[list[dict[str, Any]]] = []
    first_seq = 1  # sequence number of the first record in the current file
    for file_no, size in enumerate(sizes, start=1):
        result.append(
            [
                {
                    "KEY": f"DIV-{seq:04d}",
                    "DATA": f"div_data_{seq}",
                    "FILE_NO": file_no,
                    "SEQ": seq,
                }
                for seq in range(first_seq, first_seq + size)
            ]
        )
        first_seq += size

    return result
|
||||
Reference in New Issue
Block a user