feat: Phase 2 complete — 13 Phases of COBOL type classification and test benchmark
P0.6: gcov infrastructure
P1: extract_structure output expansion (11 new feature fields)
P2: Confusion group rule engine (8 pairs + contradiction + backtrack)
P3: 4-factor confidence calculation + quality gate update
P4: 33+2 COBOL program type test samples (22 files, 7 categories)
P5: parametrized/ test data generation engine
P6: japanese_data.py lookup tables
P7-10: Type-specific test suites (~159 parametrized tests)
P11: Full classification pipeline (classify_program) + orchestrator integration
P12: Documentation (module-interfaces, test-plan v3.0, coverage-matrix)

Architecture decisions:
- classification_pipeline/ merged to hina/pipeline/
- parametrized/ as independent module
- japanese_data.py as root-level file
- hina/__all__ only exports classify_program()

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,9 @@
|
||||
# Public API of the test-data generation package. Every name listed here is
# re-exported from one of the sibling modules imported below.
__all__ = [
    "generate_matching_data",
    "generate_keybreak_data",
    "generate_division_data",
    "generate_zero_byte_file",
    "generate_boundary_values",
    "generate_minimal_records",
    "generate_sorted_records",
    "generate_duplicate_keys",
]

from .matching import generate_matching_data, generate_keybreak_data
from .division import generate_division_data
from .common import (
    generate_zero_byte_file,
    generate_boundary_values,
    generate_minimal_records,
    generate_sorted_records,
    generate_duplicate_keys,
)
|
||||
@@ -0,0 +1,275 @@
|
||||
"""通用测试数据生成工具函数模块。
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import pathlib
|
||||
import re
|
||||
from typing import Any
|
||||
|
||||
|
||||
def generate_zero_byte_file(path: str) -> None:
    """Create an empty (zero-byte) file at *path*.

    Missing parent directories are created first. If the file already
    exists, its content is truncated to zero bytes.

    Parameters
    ----------
    path : str
        Location of the empty file to create.
    """
    target = pathlib.Path(path)
    target.parent.mkdir(parents=True, exist_ok=True)
    # Opening in binary write mode creates or truncates the file; nothing
    # needs to be written to end up with zero bytes.
    with target.open("wb"):
        pass
|
||||
|
||||
|
||||
def generate_minimal_records(fields: list[dict[str, Any]]) -> list[dict[str, Any]]:
    """Build the smallest useful data set: exactly one well-formed record.

    Parameters
    ----------
    fields : list[dict]
        Field definitions. Every key of a definition is optional:

        - "name"    : field name, defaults to ``"FIELD_{i}"`` (``i`` is the
          position of the definition in ``fields``)
        - "type"    : ``"numeric"`` / ``"string"`` / ``"date"``; anything
          else is handled like ``"string"`` (the default)
        - "length"  : length used for string values, defaults to 10
        - "default" : explicit value; when present it wins over the
          type-based value

    Returns
    -------
    list[dict]
        A one-element list. The record maps each field name to its value;
        an empty ``fields`` yields ``[{}]``.
    """
    single: dict[str, Any] = {}
    for index, spec in enumerate(fields or ()):
        field_name = spec.get("name", f"FIELD_{index}")
        if "default" in spec:
            single[field_name] = spec["default"]
            continue

        kind = spec.get("type", "string")
        if kind == "numeric":
            value: Any = 0
        elif kind == "date":
            value = "0001-01-01"
        else:
            # Strings (and unrecognised types) are filled with "A".
            value = "A" * spec.get("length", 10)
        single[field_name] = value

    return [single]
|
||||
|
||||
|
||||
def _count_pic_symbols(segment: str, symbols: str) -> int:
    """Return how many character positions *segment* declares for *symbols*.

    Both the repeated form (``999``) and the counted form (``9(3)``) are
    understood, and they may be mixed (``99(3)`` counts as 4). Characters
    other than the requested symbols are ignored.
    """
    pattern = rf"[{symbols}](?:\((\d+)\))?"
    return sum(
        1 if m.group(1) is None else int(m.group(1))
        for m in re.finditer(pattern, segment)
    )


def _parse_pic(pic: str) -> dict[str, Any]:
    """Parse a COBOL PIC clause into type, digit counts and sign information.

    Recognised forms (case-insensitive, surrounding blanks ignored):

    - ``S9(7)V99``   signed, 7 integer digits, 2 decimal digits
    - ``9(4)``       unsigned, 4 integer digits
    - ``S9(3)``      signed, 3 integer digits
    - ``9(4)V9(2)``  unsigned, 4 integer digits, 2 decimal digits
    - ``9(7)V99``    unsigned, 7 integer digits, 2 decimal digits
    - ``999``, ``S99V9``  repeated-symbol spellings of the above
    - ``X(10)``, ``XXX``, ``A(5)``  character data

    Parameters
    ----------
    pic : str
        PIC string, e.g. ``"S9(7)V99"``.

    Returns
    -------
    dict
        Keys ``type`` (``"string"`` / ``"numeric"`` / ``"unknown"``),
        ``digits`` (integer digits, or the length for strings), ``decimal``,
        ``signed`` and ``total_digits`` (``digits + decimal``; left at 0 for
        strings and unknown pictures).
    """
    pic = pic.strip().upper()
    result: dict[str, Any] = {
        "type": "unknown",
        "digits": 0,
        "decimal": 0,
        "signed": False,
        "total_digits": 0,
    }

    # Character data: the length is the number of X / A positions.
    if pic.startswith(("X", "A")):
        result["type"] = "string"
        result["digits"] = _count_pic_symbols(pic, "XA")
        return result

    # Numeric data: an optional leading S, then 9s split by the implied
    # decimal point V.
    if "9" in pic or pic.startswith("S"):
        result["type"] = "numeric"
        signed = pic.startswith("S")
        result["signed"] = signed
        body = pic[1:] if signed else pic
        integer_part, _, decimal_part = body.partition("V")
        result["digits"] = _count_pic_symbols(integer_part, "9")
        result["decimal"] = _count_pic_symbols(decimal_part, "9")
        result["total_digits"] = result["digits"] + result["decimal"]
        return result

    return result


def generate_boundary_values(pic: str) -> dict[str, Any]:
    """Derive maximum, minimum, overflow and zero values from a PIC clause.

    Parameters
    ----------
    pic : str
        COBOL PIC string, e.g. ``"S9(7)V99"``.

    Returns
    -------
    dict
        ``{"max", "min", "overflow", "zero", "pic_info"}``:

        - string pictures: ``max`` is all ``"X"``, ``min`` is ``"A"`` padded
          with blanks, ``overflow`` is one character too long, ``zero`` is
          all blanks;
        - numeric pictures: ``max`` is all nines, ``min`` is ``-max`` when
          signed and ``0`` otherwise, ``overflow`` is a value with more
          digits than the picture holds. Pictures without decimals yield
          ``int`` values (exact for any digit count); pictures with decimals
          yield ``float`` values;
        - unrecognised pictures: every value is ``None``.

        ``pic_info`` is the parsed picture description.
    """
    info = _parse_pic(pic)

    if info["type"] == "string":
        length = info["digits"]
        return {
            "max": "X" * length,
            "min": "A" + " " * (length - 1) if length else "",
            "overflow": "X" * (length + 1),
            "zero": " " * length,
            "pic_info": info,
        }

    if info["type"] == "numeric":
        digits = info["digits"]
        decimal = info["decimal"]
        total_digits = info.get("total_digits", digits + decimal)

        # Work on the unscaled integers first so integer pictures never pass
        # through float arithmetic (a float cannot hold 18 nines exactly).
        max_val: Any = 10 ** total_digits - 1
        # NOTE(review): the overflow value is 10 ** (total_digits + 1), i.e.
        # ten times the smallest out-of-range value; kept as in the original.
        overflow_val: Any = 10 ** (total_digits + 1)
        zero: Any = 0
        if decimal > 0:
            factor = 10 ** decimal
            max_val = max_val / factor
            overflow_val = overflow_val / factor
            zero = 0.0

        return {
            "max": max_val,
            "min": -max_val if info["signed"] else 0,
            "overflow": overflow_val,
            "zero": zero,
            "pic_info": info,
        }

    return {
        "max": None,
        "min": None,
        "overflow": None,
        "zero": None,
        "pic_info": info,
    }
|
||||
|
||||
|
||||
def generate_sorted_records(
    record_count: int = 10,
    key_field: str = "KEY",
) -> list[dict[str, Any]]:
    """Generate records whose keys are in ascending order.

    Keys have the form ``KEY-<n>`` with ``n`` zero-padded to at least four
    digits. The padding widens automatically when ``record_count`` exceeds
    10000 so that the keys stay in ascending *string* order (a fixed width of
    four would place ``KEY-10000`` before ``KEY-9999``).

    Parameters
    ----------
    record_count : int
        Number of records to generate, default 10.
    key_field : str
        Name of the key field, default ``"KEY"``.

    Returns
    -------
    list[dict]
        Records with ``key_field``, ``"DATA"`` and ``"SEQ"`` (1-based)
        entries, ordered by ``key_field``.

    Raises
    ------
    ValueError
        If ``record_count`` is smaller than 1.
    """
    if record_count < 1:
        raise ValueError(f"record_count 必须 >= 1,收到 {record_count}")

    # Wide enough for the largest index, never narrower than the usual 4.
    width = max(4, len(str(record_count - 1)))
    return [
        {
            key_field: f"KEY-{i:0{width}d}",
            "DATA": f"sorted_data_{i}",
            "SEQ": i + 1,
        }
        for i in range(record_count)
    ]
|
||||
|
||||
|
||||
def generate_duplicate_keys(
    records: list[dict[str, Any]],
    key_field: str = "KEY",
) -> list[dict[str, Any]]:
    """Return *records* followed by one same-key copy of every record.

    Useful for exercising duplicate-key handling (e.g. SORT / MERGE or
    de-duplication checks). The input list and its records are not modified.

    Parameters
    ----------
    records : list[dict]
        Original records. Each one must contain ``key_field``.
    key_field : str
        Name of the field that carries the duplicated key, default ``"KEY"``.

    Returns
    -------
    list[dict]
        The original records followed by their copies. A copy keeps the key,
        gets ``"_DUP"`` appended to ``"DATA"`` (empty string when absent) and
        has ``"SEQ"`` raised by 10000 (from 0 when absent). An empty input
        yields an empty list.

    Raises
    ------
    KeyError
        If a record lacks ``key_field``.
    """
    if not records:
        return []

    copies: list[dict[str, Any]] = [
        {
            **original,
            # Indexing (rather than .get) keeps the KeyError for records
            # that lack the key field.
            key_field: original[key_field],
            "DATA": original.get("DATA", "") + "_DUP",
            "SEQ": original.get("SEQ", 0) + 10000,
        }
        for original in records
    ]
    return records + copies
|
||||
@@ -0,0 +1,80 @@
|
||||
"""分割系测试数据生成模块。
|
||||
|
||||
提供 generate_division_data() 用于生成按比例/规则分割到多个文件的测试数据,
|
||||
模拟 COBOL SORT 或 OUTPUT 分件场景。
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import math
|
||||
from typing import Any
|
||||
|
||||
|
||||
def generate_division_data(
    division_type: int = 50,
    record_count: int = 50,
) -> list[list[dict[str, Any]]]:
    """Generate records split across several output files.

    Parameters
    ----------
    division_type : int
        How the records are divided:

        - 50   two files, half of the records each
        - 25   four files, a quarter of the records each
        - 100  a single file holding everything
    record_count : int
        Total number of records distributed over the files.

    Returns
    -------
    list[list[dict]]
        One list of records per file. Each record has ``"KEY"``, ``"DATA"``,
        ``"FILE_NO"`` (1-based file number) and ``"SEQ"`` (1-based running
        number across all files). Every file but the last receives the
        rounded-down share; the last file receives whatever remains.

    Raises
    ------
    ValueError
        If ``division_type`` is not 50, 25 or 100, or ``record_count`` is
        smaller than 1.
    """
    if division_type not in (50, 25, 100):
        raise ValueError(f"division_type 必须为 50 / 25 / 100,收到 {division_type}")
    if record_count < 1:
        raise ValueError(f"record_count 必须 >= 1,收到 {record_count}")

    n_files = 1 if division_type == 100 else 2 if division_type == 50 else 4

    # Rounded-down equal share for the leading files; the final file absorbs
    # the remainder so that the counts always add up to record_count.
    share = int(math.floor(record_count * (1.0 / n_files)))
    leading = n_files - 1
    counts = [share] * leading + [record_count - share * leading]

    groups: list[list[dict[str, Any]]] = []
    first_seq = 1
    for file_no, count in enumerate(counts, start=1):
        groups.append([
            {
                "KEY": f"DIV-{seq:04d}",
                "DATA": f"div_data_{seq}",
                "FILE_NO": file_no,
                "SEQ": seq,
            }
            for seq in range(first_seq, first_seq + count)
        ])
        first_seq += count

    return groups
|
||||
@@ -0,0 +1,194 @@
|
||||
"""匹配系测试数据生成模块。
|
||||
|
||||
提供两种生成器:
|
||||
- generate_matching_data() — 生成主/从匹配测试数据
|
||||
- generate_keybreak_data() — 生成 KEY 切中断测试数据
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import random
|
||||
from typing import Any
|
||||
|
||||
|
||||
def _matched_sub_record(main_index: int, sub_index: int) -> dict[str, Any]:
    """Build a sub-file record carrying the key of main record *main_index*."""
    return {
        "KEY": f"MAIN-{main_index:04d}",
        "DATA": f"sub_data_{sub_index}",
        "SEQ": sub_index + 1,
    }


def _unmatched_sub_record(number: int, seq: int) -> dict[str, Any]:
    """Build a sub-file record whose key exists in no main record."""
    return {
        "KEY": f"UNMATCHED-SUB-{number:04d}",
        "DATA": f"sub_unmatched_{number}",
        "SEQ": seq,
    }


def _one_to_many_sub_records(
    main_count: int,
    sub_count: int,
    key_match_ratio: float,
) -> list[dict[str, Any]]:
    """Build the sub file for the "1:N" mode (several subs per main key)."""
    match_count = int(main_count * key_match_ratio)
    every_main_matches = 0 < match_count == main_count
    base, extra = divmod(sub_count, match_count) if match_count else (0, 0)

    subs: list[dict[str, Any]] = []
    unmatched = 0
    for main_index in range(main_count):
        if main_index < match_count:
            if every_main_matches:
                # Nothing is meant to stay unmatched: spread the remainder
                # over the first mains instead of leaving it keyless.
                quota = base + (1 if main_index < extra else 0)
            else:
                quota = max(1, base)
            for _ in range(min(quota, sub_count - len(subs))):
                subs.append(_matched_sub_record(main_index, len(subs)))
        elif len(subs) < sub_count:
            # A main without partner contributes one foreign-key sub record.
            subs.append(_unmatched_sub_record(unmatched, len(subs) + 1))
            unmatched += 1

    # Fill whatever capacity is left with unmatched sub records.
    while len(subs) < sub_count:
        subs.append(_unmatched_sub_record(unmatched, len(subs) + 1))
        unmatched += 1
    return subs


def generate_matching_data(
    matching_type: str = "1:1",
    record_count_r01: int = 10,
    record_count_r02: int = 10,
    key_match_ratio: float = 1.0,
) -> tuple[list[dict[str, Any]], list[dict[str, Any]]]:
    """Generate main/sub file pairs for record-matching tests.

    Main records always carry the unique keys ``MAIN-0000``, ``MAIN-0001``,
    ... Sub records either reuse a main key (matched) or carry an
    ``UNMATCHED-SUB-nnnn`` key.

    Parameters
    ----------
    matching_type : str
        Matching mode:

        - "1:1"  the first ``int(min(r01, r02) * ratio)`` sub records match
          one main record each
        - "1:N"  the first ``int(r01 * ratio)`` main records are matched by
          several sub records each; when every main record is matched, all
          sub records are distributed over the main keys
        - "N:1"  the first ``int(r02 * ratio)`` sub records cycle through the
          main keys
    record_count_r01 : int
        Number of main-file (R01) records.
    record_count_r02 : int
        Number of sub-file (R02) records.
    key_match_ratio : float
        Share of matching keys between 0.0 (nothing matches) and 1.0
        (everything matches).

    Returns
    -------
    tuple[list[dict], list[dict]]
        ``(main records, sub records)``; every record has ``"KEY"``,
        ``"DATA"`` and ``"SEQ"``.

    Raises
    ------
    ValueError
        For an unsupported ``matching_type``, a ratio outside 0.0..1.0 or a
        negative record count.
    """
    if matching_type not in ("1:1", "1:N", "N:1"):
        raise ValueError(f"不支持的 matching_type:{matching_type!r},应为 '1:1' / '1:N' / 'N:1'")
    if not 0.0 <= key_match_ratio <= 1.0:
        raise ValueError(f"key_match_ratio 必须在 0.0~1.0 之间,收到 {key_match_ratio}")
    if record_count_r01 < 0 or record_count_r02 < 0:
        raise ValueError("记录数不能为负数")

    main_records: list[dict[str, Any]] = [
        {"KEY": f"MAIN-{i:04d}", "DATA": f"main_data_{i}", "SEQ": i + 1}
        for i in range(record_count_r01)
    ]

    if matching_type == "1:N":
        return main_records, _one_to_many_sub_records(
            record_count_r01, record_count_r02, key_match_ratio
        )

    if matching_type == "1:1":
        match_count = int(min(record_count_r01, record_count_r02) * key_match_ratio)
        # Unmatched sequence numbers continue after the main-file range.
        unmatched_seq_base = record_count_r01
    else:  # "N:1"
        # Without main records there is no key a sub record could match.
        match_count = int(record_count_r02 * key_match_ratio) if record_count_r01 else 0
        unmatched_seq_base = match_count

    sub_records = [
        _matched_sub_record(i if matching_type == "1:1" else i % record_count_r01, i)
        for i in range(match_count)
    ]
    sub_records.extend(
        _unmatched_sub_record(n, unmatched_seq_base + n + 1)
        for n in range(record_count_r02 - match_count)
    )
    return main_records, sub_records
|
||||
|
||||
|
||||
def generate_keybreak_data(
    group_count: int = 3,
    records_per_group: int = 2,
    sum_type: str = "accumulate",
) -> list[dict[str, Any]]:
    """Generate key-break (control-break) test data.

    All records of a group share one key and the key changes from group to
    group, which is what triggers the break logic under test. Groups 1-26
    are labelled ``A``..``Z``; later groups use their zero-based number
    (``26``, ``27``, ...). The same label is used for the key and for the
    "mark" field value.

    Parameters
    ----------
    group_count : int
        Number of groups, default 3.
    records_per_group : int
        Records per group, default 2.
    sum_type : str
        How ``"FIELD"`` is filled:

        - "accumulate"  increasing value within the group
          (``group * 100 + position``)
        - "aggregate"   same value for the whole group (``group * 100``)
        - "mark"        fixed marker string ``MARK-<label>``

    Returns
    -------
    list[dict]
        Records with ``"KEY"``, ``"FIELD"``, ``"GROUP"`` (1-based) and
        ``"SEQ"`` (1-based running number).

    Raises
    ------
    ValueError
        If a count is smaller than 1 or ``sum_type`` is not supported.
    """
    if group_count < 1:
        raise ValueError(f"group_count 必须 >= 1,收到 {group_count}")
    if records_per_group < 1:
        raise ValueError(f"records_per_group 必须 >= 1,收到 {records_per_group}")
    if sum_type not in ("accumulate", "aggregate", "mark"):
        raise ValueError(f"不支持的 sum_type:{sum_type!r}")

    records: list[dict[str, Any]] = []
    seq = 0

    for g in range(group_count):
        # One label per group, shared by key and marker so that groups
        # beyond "Z" do not produce punctuation such as "MARK-[".
        label = chr(ord("A") + g) if g < 26 else str(g)
        group_key = f"KEY-{label}"  # KEY-A, KEY-B, ...
        for r in range(records_per_group):
            seq += 1
            field_val: Any
            if sum_type == "accumulate":
                field_val = (g + 1) * 100 + r + 1
            elif sum_type == "aggregate":
                field_val = (g + 1) * 100
            else:  # mark
                field_val = f"MARK-{label}"

            records.append({
                "KEY": group_key,
                "FIELD": field_val,
                "GROUP": g + 1,
                "SEQ": seq,
            })

    return records
|
||||
Reference in New Issue
Block a user