feat: Phase 2 complete — 13 Phases of COBOL type classification and test benchmark
P0.6: gcov infrastructure
P1: extract_structure output expansion (11 new feature fields)
P2: Confusion group rule engine (8 pairs + contradiction + backtrack)
P3: 4-factor confidence calculation + quality gate update
P4: 33+2 COBOL program type test samples (22 files, 7 categories)
P5: parametrized/ test data generation engine
P6: japanese_data.py lookup tables
P7-10: Type-specific test suites (~159 parametrized tests)
P11: Full classification pipeline (classify_program) + orchestrator integration
P12: Documentation (module-interfaces, test-plan v3.0, coverage-matrix)

Architecture decisions:
- classification_pipeline/ merged into hina/pipeline/
- parametrized/ as independent module
- japanese_data.py as root-level file
- hina/__all__ only exports classify_program()

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
@@ -1,3 +1,29 @@
|
||||
"""COBOL 数据模型 — 所有模块共享的契约
|
||||
|
||||
本包定义了全系统共用的数据类。所有模块的输入/输出必须使用这些类。
|
||||
修改本包需通知所有开发者。
|
||||
|
||||
导入方式:
|
||||
from data import Field, FieldTree # 字段树
|
||||
from data import TestCase, TestSuite, SparkConfig # 测试数据
|
||||
from data import FieldResult, VerificationRun # 对比结果
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from .field_tree import Field, FieldTree
|
||||
from .test_case import TestCase, TestSuite, SparkConfig
|
||||
from .diff_result import FieldResult, VerificationRun
|
||||
|
||||
__all__ = [
    # ═══ Field tree ── shared by cobol_testgen / comparator / agents ═══
    "Field",            # dataclass — a single field definition
    "FieldTree",        # dataclass — COPYBOOK field tree
    # ═══ Test data ── shared by cobol_testgen / runners ═══
    "TestCase",         # dataclass — a single test case
    "TestSuite",        # dataclass — test suite (including Spark configuration)
    "SparkConfig",      # dataclass — Spark run parameters
    # ═══ Comparison results ── shared by comparator / report / orchestrator ═══
    "FieldResult",      # dataclass — comparison result for a single field
    "VerificationRun",  # dataclass — full result of one pipeline run
]
|
||||
|
||||
+59
-9
@@ -1,3 +1,11 @@
|
||||
"""管道运行结果模型 — 对比结果 + 全管道运行记录
|
||||
|
||||
使用例:
|
||||
fr = FieldResult(field_name="TX-AMOUNT", status="MISMATCH",
|
||||
cobol_value="1500000", java_value="1499999.99")
|
||||
vr = VerificationRun(program="BILL-CALC", runner="native")
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
from dataclasses import dataclass, field
|
||||
from datetime import datetime
|
||||
@@ -6,6 +14,21 @@ from typing import Optional
|
||||
|
||||
@dataclass
|
||||
class FieldResult:
|
||||
"""单个字段的 COBOL ↔ Java 对比结果。
|
||||
|
||||
────────── 字段说明 ──────────
|
||||
field_name — 字段名
|
||||
status — 对比状态:
|
||||
PASS = 完全一致
|
||||
TOLERATED = 在容忍度范围内
|
||||
MISMATCH = 不一致
|
||||
NOT_SET = 缺失侧
|
||||
cobol_value — COBOL 侧原始值(字符串)
|
||||
java_value — Java 侧原始值(字符串)
|
||||
tolerance_applied — 本次使用的实际容忍度
|
||||
rounding_detected — 检测到的舍入类型
|
||||
suggestion — LLM 自动诊断建议文本
|
||||
"""
|
||||
field_name: str = ""
|
||||
status: str = "PASS"
|
||||
cobol_value: str = ""
|
||||
@@ -17,6 +40,33 @@ class FieldResult:
|
||||
|
||||
@dataclass
|
||||
class VerificationRun:
|
||||
"""单次管道运行的完整记录 — 由 orchestrator.run_pipeline() 返回。
|
||||
|
||||
────────── 字段说明 ──────────
|
||||
program — 程序名
|
||||
timestamp — 时间戳(自动: YYYYMMDD-HHMMSS)
|
||||
status — 整体状态: PASS / MISMATCH / BLOCKED / ERROR / FATAL
|
||||
exit_code — 0=通过 1=不匹配 2=阻塞 3=错误 4=致命
|
||||
duration_s — 总耗时秒
|
||||
fields_matched — 一致字段数
|
||||
fields_mismatched — 不一致字段数
|
||||
coverage_target — 覆盖率目标: "" / "boundary" / "all-paths"
|
||||
field_results — 字段对比结果列表
|
||||
runner — native / spark
|
||||
branch_rate — 分支覆盖率(静态分析)
|
||||
paragraph_rate — 段落覆盖率(静态分析)
|
||||
decision_rate — 决策点覆盖率
|
||||
hina_type — HINA 分类类型
|
||||
hina_confidence — HINA 确信度
|
||||
quality_score — 质量评分 (0~1)
|
||||
quality_warn — 质量警告
|
||||
heal_retry — 自愈重试次数
|
||||
simple_retry — 朴素重试次数
|
||||
total_retry — 总重试次数
|
||||
llm_cost — LLM 累计成本 USD
|
||||
report_path — 报告输出路径
|
||||
debug — 调试信息(不兼容保证)
|
||||
"""
|
||||
program: str = ""
|
||||
timestamp: str = ""
|
||||
status: str = "PASS"
|
||||
@@ -28,15 +78,15 @@ class VerificationRun:
|
||||
field_results: list[FieldResult] = field(default_factory=list)
|
||||
runner: str = "native"
|
||||
branch_rate: float = 0.0
|
||||
paragraph_rate: float = 0.0 # 段落覆盖率
|
||||
decision_rate: float = 0.0 # 决策点覆盖率
|
||||
hina_type: str = "" # HINA 类型
|
||||
hina_confidence: float = 0.0 # HINA 确信度
|
||||
quality_score: float = 0.0 # 质量评分
|
||||
quality_warn: str = "" # 质量警告信息
|
||||
heal_retry: int = 0 # 自愈重试次数
|
||||
simple_retry: int = 0 # 朴素重试次数
|
||||
total_retry: int = 0 # 总重试次数
|
||||
paragraph_rate: float = 0.0
|
||||
decision_rate: float = 0.0
|
||||
hina_type: str = ""
|
||||
hina_confidence: float = 0.0
|
||||
quality_score: float = 0.0
|
||||
quality_warn: str = ""
|
||||
heal_retry: int = 0
|
||||
simple_retry: int = 0
|
||||
total_retry: int = 0
|
||||
llm_cost: float = 0.0
|
||||
report_path: str = ""
|
||||
debug: dict = field(default_factory=dict)
|
||||
|
||||
@@ -1,3 +1,11 @@
|
||||
"""字段树模型 — COPYBOOK 解析后的字段结构
|
||||
|
||||
使用例:
|
||||
field = Field(name="TX-AMOUNT", level=5, pic="S9(7)V99", usage="COMP-3")
|
||||
tree = FieldTree(fields=[field], copybook_name="TXCPY")
|
||||
flat = tree.flatten() # → {"TX-AMOUNT": field}
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Optional
|
||||
@@ -5,6 +13,25 @@ from typing import Optional
|
||||
|
||||
@dataclass
|
||||
class Field:
|
||||
"""单个字段定义(对应 COBOL DATA DIVISION 中的一行 01/05/10/77/88 级条目)。
|
||||
|
||||
────────── 字段说明 ──────────
|
||||
name — 字段名(大写,如 WS-AMOUNT)
|
||||
level — 层级号(01~49 / 77 / 88)
|
||||
pic — PIC 字符串(如 "S9(7)V99", "X(10)", "9(4)")
|
||||
usage — 存储类型: DISPLAY / COMP / COMP-3 / COMP-5 / BINARY / PACKED-DECIMAL
|
||||
offset — 在记录中的偏移量(字节)
|
||||
length — 字段长度(字节)
|
||||
decimal — 小数位数(从 PIC 解析)
|
||||
signed — 是否带符号(PIC 以 S 开头)
|
||||
sign_separate — 符号是否独立存储(SIGN IS LEADING/TRAILING SEPARATE)
|
||||
occurs — OCCURS 出现次数(None 表示非表列)
|
||||
occurs_max— OCCURS DEPENDING ON 的最大值
|
||||
redefines — 重定义的父字段名(如 "WS-BLOCK" 表示 REDEFINES WS-BLOCK)
|
||||
redefines_variant — REDEFINES 变体标识
|
||||
conditions— 88-level 条件列表: [{"name": "WS-APPROVED", "value": "'A'"}, ...]
|
||||
children — 子字段列表(层级嵌套时使用)
|
||||
"""
|
||||
name: str
|
||||
level: int
|
||||
pic: str
|
||||
@@ -24,11 +51,22 @@ class Field:
|
||||
|
||||
@dataclass
|
||||
class FieldTree:
|
||||
"""COPYBOOK 解析结果 —— 包含所有顶层字段(递归展开子字段)。
|
||||
|
||||
────────── 字段说明 ──────────
|
||||
fields — 顶层字段列表(01 级,不含子字段嵌入)
|
||||
copybook_name — 源 COPYBOOK 文件名
|
||||
sha256 — 源码的 SHA256 哈希
|
||||
"""
|
||||
fields: list[Field] = field(default_factory=list)
|
||||
copybook_name: str = ""
|
||||
sha256: str = ""
|
||||
|
||||
def flatten(self) -> dict[str, Field]:
|
||||
"""展平为 {字段名 → Field} 字典(递归展开 children)。
|
||||
|
||||
注意: 同名子字段会覆盖父字段,使用 get_by_name 可自动处理。
|
||||
"""
|
||||
result = {}
|
||||
def _walk(ff):
|
||||
for f in ff:
|
||||
@@ -38,6 +76,7 @@ class FieldTree:
|
||||
return result
|
||||
|
||||
def get_by_name(self, name: str) -> Optional[Field]:
    """Look up a field by its name.

    The lookup goes through ``flatten()``, which is documented to expand
    ``children`` recursively, so nested fields are found as well. When a
    name occurs more than once, the result is whichever entry
    ``flatten()`` keeps for that name.

    Args:
        name: Field name to search for.

    Returns:
        The matching ``Field``, or ``None`` if no field has that name.
    """
    by_name = self.flatten()
    return by_name.get(name)
|
||||
|
||||
@classmethod
|
||||
@@ -45,6 +84,7 @@ class FieldTree:
|
||||
return cls(fields=fields, copybook_name=name)
|
||||
|
||||
|
||||
# ── Module-level assertions (make sure the dataclass structure is correct) ──
# Runs at import time: builds one sample Field from keyword arguments and checks
# that the name and decimal values read back unchanged.
# NOTE(review): `assert` statements are skipped when Python runs with -O, so
# this check is not enforced in optimized runs — confirm that is acceptable.
_f = Field(name="BR-AMT", level=5, pic="S9(7)V99", usage="COMP-3", offset=0, length=5, decimal=2, signed=True)
assert _f.name == "BR-AMT"
assert _f.decimal == 2
|
||||
|
||||
@@ -1,3 +1,10 @@
|
||||
"""测试数据模型 — 测试用例 + 测试套件 + Spark 配置
|
||||
|
||||
使用例:
|
||||
tc = TestCase(id="TC-001", fields={"TX-AMOUNT": 1500000})
|
||||
suite = TestSuite(test_cases=[tc], spark_config=SparkConfig(num_records=1000))
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
from dataclasses import dataclass, field
|
||||
from typing import Optional
|
||||
@@ -5,6 +12,14 @@ from typing import Optional
|
||||
|
||||
@dataclass
|
||||
class SparkConfig:
|
||||
"""Spark 测试数据生成配置。
|
||||
|
||||
────────── 字段说明 ──────────
|
||||
num_records — 生成的记录数
|
||||
replication — 复制策略: "key_varied" / "exact_copy"
|
||||
key_field — 键字段名(key_varied 用)
|
||||
edge_cases — 边缘 case: ["null","max","min","empty"]
|
||||
"""
|
||||
num_records: int = 100
|
||||
replication: str = "key_varied"
|
||||
key_field: str = ""
|
||||
@@ -13,6 +28,13 @@ class SparkConfig:
|
||||
|
||||
@dataclass
|
||||
class TestCase:
|
||||
"""单条测试用例 — 一条待验证的字段值组合。
|
||||
|
||||
────────── 字段说明 ──────────
|
||||
id — 用例 ID(如 "TC-001")
|
||||
fields — {字段名: 值}
|
||||
coverage_targets — 覆盖的决策点 ID 列表
|
||||
"""
|
||||
id: str
|
||||
fields: dict = field(default_factory=dict)
|
||||
coverage_targets: list[str] = field(default_factory=list)
|
||||
@@ -20,6 +42,13 @@ class TestCase:
|
||||
|
||||
@dataclass
class TestSuite:
    """Test suite: a list of test cases plus an optional Spark configuration.

    Attributes:
        schema: Optional field schema; defaults to None.
        test_cases: List of test cases; defaults to a new empty list.
        spark_config: Spark test-data generation settings; None means
            non-Spark mode.
    """

    # The "Test" prefix matches pytest's default class-collection pattern, and
    # a class with a generated __init__ makes pytest emit a
    # PytestCollectionWarning in every test module that imports it. Opting out
    # keeps this data model from being mistaken for a test class. The
    # attribute is deliberately unannotated so it is not a dataclass field.
    __test__ = False

    schema: Optional[dict] = None
    test_cases: list[TestCase] = field(default_factory=list)
    spark_config: Optional[SparkConfig] = None
|
||||
|
||||
Reference in New Issue
Block a user