feat: Phase 2 complete — 13 Phases of COBOL type classification and test benchmark

P0.6: gcov infrastructure
P1: extract_structure output expansion (11 new feature fields)
P2: Confusion group rule engine (8 pairs + contradiction + backtrack)
P3: 4-factor confidence calculation + quality gate update
P4: 33+2 COBOL program type test samples (22 files, 7 categories)
P5: parametrized/ test data generation engine
P6: japanese_data.py lookup tables
P7-10: Type-specific test suites (~159 parametrized tests)
P11: Full classification pipeline (classify_program) + orchestrator integration
P12: Documentation (module-interfaces, test-plan v3.0, coverage-matrix)

Architecture decisions:
- classification_pipeline/ merged to hina/pipeline/
- parametrized/ as independent module
- japanese_data.py as root-level file
- hina/__all__ only exports classify_program()

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
hangshuo652
2026-06-19 23:51:55 +08:00
parent 63b5284715
commit bc1d56d1a4
129 changed files with 19378 additions and 261 deletions
+40
View File
@@ -1,3 +1,11 @@
"""字段树模型 — COPYBOOK 解析后的字段结构
使用例:
field = Field(name="TX-AMOUNT", level=5, pic="S9(7)V99", usage="COMP-3")
tree = FieldTree(fields=[field], copybook_name="TXCPY")
flat = tree.flatten() # → {"TX-AMOUNT": field}
"""
from __future__ import annotations
from dataclasses import dataclass, field
from typing import Optional
@@ -5,6 +13,25 @@ from typing import Optional
@dataclass
class Field:
"""单个字段定义(对应 COBOL DATA DIVISION 中的一行 01/05/10/77/88 级条目)。
────────── 字段说明 ──────────
name — 字段名(大写,如 WS-AMOUNT)
level — 层级号(01~49 / 77 / 88)
pic — PIC 字符串(如 "S9(7)V99", "X(10)", "9(4)")
usage — 存储类型: DISPLAY / COMP / COMP-3 / COMP-5 / BINARY / PACKED-DECIMAL
offset — 在记录中的偏移量(字节)
length — 字段长度(字节)
decimal — 小数位数(从 PIC 解析)
signed — 是否带符号(PIC 以 S 开头)
sign_separate — 符号是否独立存储(SIGN IS LEADING/TRAILING SEPARATE)
occurs — OCCURS 出现次数(None 表示非表列)
occurs_max— OCCURS DEPENDING ON 的最大值
redefines — 重定义的父字段名(如 "WS-BLOCK" 表示 REDEFINES WS-BLOCK)
redefines_variant — REDEFINES 变体标识
conditions— 88-level 条件列表: [{"name": "WS-APPROVED", "value": "'A'"}, ...]
children — 子字段列表(层级嵌套时使用)
"""
name: str
level: int
pic: str
@@ -24,11 +51,22 @@ class Field:
@dataclass
class FieldTree:
"""COPYBOOK 解析结果 —— 包含所有顶层字段(递归展开子字段)。
────────── 字段说明 ──────────
fields — 顶层字段列表(01 级,不含子字段嵌入)
copybook_name — 源 COPYBOOK 文件名
sha256 — 源码的 SHA256 哈希
"""
fields: list[Field] = field(default_factory=list)
copybook_name: str = ""
sha256: str = ""
def flatten(self) -> dict[str, Field]:
"""展平为 {字段名 → Field} 字典(递归展开 children)。
注意: 同名子字段会覆盖父字段;get_by_name 内部直接调用 flatten(),因此同样受此覆盖影响,并不会自动消歧。
"""
result = {}
def _walk(ff):
for f in ff:
@@ -38,6 +76,7 @@ class FieldTree:
return result
def get_by_name(self, name: str) -> Optional[Field]:
"""按字段名查找(递归搜索所有层级)。"""
return self.flatten().get(name)
@classmethod
@@ -45,6 +84,7 @@ class FieldTree:
return cls(fields=fields, copybook_name=name)
# -- Module-level sanity check: build one sample Field and confirm the dataclass keeps the values it was given --
# NOTE(review): these are plain `assert` statements, so they are skipped when Python runs with -O.
_f = Field(name="BR-AMT", level=5, pic="S9(7)V99", usage="COMP-3", offset=0, length=5, decimal=2, signed=True)
assert _f.name == "BR-AMT"
assert _f.decimal == 2