feat: Phase 2 complete — 13 Phases of COBOL type classification and test benchmark

P0.6: gcov infrastructure
P1: extract_structure output expansion (11 new feature fields)
P2: Confusion group rule engine (8 pairs + contradiction + backtrack)
P3: 4-factor confidence calculation + quality gate update
P4: 33+2 COBOL program type test samples (22 files, 7 categories)
P5: parametrized/ test data generation engine
P6: japanese_data.py lookup tables
P7-10: Type-specific test suites (~159 parametrized tests)
P11: Full classification pipeline (classify_program) + orchestrator integration
P12: Documentation (module-interfaces, test-plan v3.0, coverage-matrix)

Architecture decisions:
- classification_pipeline/ merged into hina/pipeline/
- parametrized/ as independent module
- japanese_data.py as root-level file
- hina/__all__ only exports classify_program()

Co-Authored-By: Claude <noreply@anthropic.com>
2026-06-19 23:51:55 +08:00
parent 63b5284715
commit bc1d56d1a4
129 changed files with 19378 additions and 261 deletions
@@ -1,3 +1,11 @@
+"""字段树模型 — COPYBOOK 解析后的字段结构
+
+使用例:
+  field = Field(name="TX-AMOUNT", level=5, pic="S9(7)V99", usage="COMP-3")
+  tree  = FieldTree(fields=[field], copybook_name="TXCPY")
+  flat  = tree.flatten()  # → {"TX-AMOUNT": field}
+"""
+
 from __future__ import annotations
 from dataclasses import dataclass, field
 from typing import Optional
@@ -5,6 +13,25 @@ from typing import Optional

@dataclass
 class Field:
+    """单个字段定义（对应 COBOL DATA DIVISION 中的一行 01/05/10/77/88 级条目）。
+
+    ────────── 字段说明 ──────────
+    name      — 字段名（大写，如 WS-AMOUNT）
+    level     — 层级号（01~49 / 77 / 88）
+    pic       — PIC 字符串（如 "S9(7)V99", "X(10)", "9(4)"）
+    usage     — 存储类型: DISPLAY / COMP / COMP-3 / COMP-5 / BINARY / PACKED-DECIMAL
+    offset    — 在记录中的偏移量（字节）
+    length    — 字段长度（字节）
+    decimal   — 小数位数（从 PIC 解析）
+    signed    — 是否带符号（PIC 以 S 开头）
+    sign_separate — 符号是否独立存储（SIGN IS LEADING/TRAILING SEPARATE）
+    occurs    — OCCURS 出现次数（None 表示非表列）
+    occurs_max— OCCURS DEPENDING ON 的最大值
+    redefines — 重定义的父字段名（如 "WS-BLOCK" 表示 REDEFINES WS-BLOCK）
+    redefines_variant — REDEFINES 变体标识
+    conditions— 88-level 条件列表: [{"name": "WS-APPROVED", "value": "'A'"}, ...]
+    children  — 子字段列表（层级嵌套时使用）
+    """
    name: str
    level: int
    pic: str
@@ -24,11 +51,22 @@ class Field:

@dataclass
 class FieldTree:
+    """COPYBOOK 解析结果 —— 保存顶层字段列表；子字段通过各 Field 的 children 嵌套，可用 flatten() 递归展开。
+
+    ────────── 字段说明 ──────────
+    fields        — 顶层字段列表（01 级，不含子字段嵌入）
+    copybook_name — 源 COPYBOOK 文件名
+    sha256        — 源码的 SHA256 哈希
+    """
    fields: list[Field] = field(default_factory=list)
    copybook_name: str = ""
    sha256: str = ""

    def flatten(self) -> dict[str, Field]:
+        """展平为 {字段名 → Field} 字典（递归展开 children）。
+
+        注意: 结果以字段名为键，同名字段在字典中只会保留一个条目；get_by_name 基于本方法实现，同样受此限制。
+        """
        result = {}
        def _walk(ff):
            for f in ff:
@@ -38,6 +76,7 @@ class FieldTree:
        return result

    def get_by_name(self, name: str) -> Optional[Field]:
+        """按字段名查找（递归搜索所有层级）。"""
        return self.flatten().get(name)

    @classmethod
@@ -45,6 +84,7 @@ class FieldTree:
        return cls(fields=fields, copybook_name=name)


+# ── 模块级断言（确保 dataclass 结构正确） ──
 _f = Field(name="BR-AMT", level=5, pic="S9(7)V99", usage="COMP-3", offset=0, length=5, decimal=2, signed=True)
 assert _f.name == "BR-AMT"
 assert _f.decimal == 2