feat: Phase 2 complete — 13 Phases of COBOL type classification and test benchmark

P0.6: gcov infrastructure
P1: extract_structure output expansion (11 new feature fields)
P2: Confusion group rule engine (8 pairs + contradiction + backtrack)
P3: 4-factor confidence calculation + quality gate update
P4: 33+2 COBOL program type test samples (22 files, 7 categories)
P5: parametrized/ test data generation engine
P6: japanese_data.py lookup tables
P7-10: Type-specific test suites (~159 parametrized tests)
P11: Full classification pipeline (classify_program) + orchestrator integration
P12: Documentation (module-interfaces, test-plan v3.0, coverage-matrix)

Architecture decisions:
- classification_pipeline/ merged to hina/pipeline/
- parametrized/ as independent module
- japanese_data.py as root-level file
- hina/__all__ only exports classify_program()

Co-Authored-By: Claude <noreply@anthropic.com>
2026-06-19 23:51:55 +08:00
parent 63b5284715
commit bc1d56d1a4
129 changed files with 19378 additions and 261 deletions
@@ -1,4 +1,12 @@
-"""COBOL Test Data Generator — 模块化版入口"""
+"""COBOL Test Data Generator — modular entry point.
+
+Public API:
+  extract_structure()    — parse COBOL control flow → dict
+  generate_data()        — generate test data → list[dict]
+  incremental_supplement — add data incrementally (diff-based) → list[dict]
+  check_coverage()       — coverage report → dict
+"""
+from __future__ import annotations

 import sys
 import re
@@ -11,14 +19,25 @@ from pathlib import Path
 CONFIG = {}

 from .read import preprocess, extract_data_division, extract_procedure_division
-from .read import resolve_copybooks, parse_data_division, parse_file_section, scan_open_statements
+from .read import resolve_copybooks, parse_data_division, parse_file_section, scan_open_statements, parse_file_control
 from .core import build_branch_tree, classify_field_roles, _init_child_names
-from .cond import parse_single_condition, is_field
+from .cond import parse_single_condition, is_field, collect_leaves
 from .design import enum_paths, generate_records, _filter_stop
 from .output import output_json, output_input_files
-from .coverage import run_coverage, generate_coverage_index
+from .coverage import run_coverage, generate_coverage_index, check_coverage
+from japanese_data import generate_fullwidth_text, generate_halfwidth_katakana, generate_wareki_date

 logger = logging.getLogger(__name__)
+__all__ = [  # explicit public API of this module
+    "extract_structure",
+    "generate_data",
+    "incremental_supplement",
+    "check_coverage",
+    "CONFIG",
+    "generate_fullwidth_text",
+    "generate_halfwidth_katakana",
+    "generate_wareki_date",
+]


 # ── OCCURS 展开 ──
@@ -118,7 +137,7 @@ def main():
    fh = logging.FileHandler(log_path, encoding="utf-8", mode="w")
    fh.setLevel(logging.DEBUG)
    fh.setFormatter(logging.Formatter(
-        "%(asctime)s [%(levelname)s] %(name)s: %(message)s"
+        "%(asctime)s [%(levelname)s] %(name)s: %(message)s"
    ))
    sh = logging.StreamHandler()
    sh.setLevel(logging.INFO)
@@ -353,7 +372,7 @@ def extract_structure(cobol_source: str) -> dict:
    file_sec = parse_file_section(preprocessed)
    open_dir = scan_open_statements(proc_div) if proc_div else {}

-    from .models import BrIf, BrEval, BrSeq
+    from .models import BrIf, BrEval, BrSeq, BrPerform, Assign, CondAnd, CondOr

    decision_points = []
    total_branches = 0
@@ -395,19 +414,219 @@ def extract_structure(cobol_source: str) -> dict:
        if m:
            paragraphs.add(m.group(1))

+    # ── New field: select_files ──
+    select_files = parse_file_control(preprocessed)
+
+    # ── New field: open_directions_detail (alias of open_directions) ──
+    open_directions_detail = open_dir
+
+    # ── New fields: has_divide / has_inspect / has_string ──
+    source_upper = cobol_source.upper()  # upper-case once; shared by the three keyword probes
+    has_divide = re.search(r'\bDIVIDE\b', source_upper) is not None
+    has_inspect = re.search(r'\bINSPECT\b', source_upper) is not None
+    has_string = re.search(r'\bSTRING\b', source_upper) is not None
+
+    # ── New field: divide_constants (numeric token directly following DIVIDE) ──
+    divide_constants = []
+    if has_divide and proc_div:
+        for literal in re.findall(r'\bDIVIDE\s+([\d.]+)\b', proc_div, re.IGNORECASE):
+            try:
+                divide_constants.append(float(literal))
+            except ValueError:
+                pass  # token such as "1.2.3" is not a valid float; ignore it
+
+    # ── New field: perform_patterns ──
+    def _iter_performs(node):
+        """Yield each BrPerform under *node* in pre-order (outer PERFORM first)."""
+        if isinstance(node, BrPerform):
+            yield node
+            yield from _iter_performs(node.body_seq)
+        elif isinstance(node, BrIf):
+            yield from _iter_performs(node.true_seq)
+            yield from _iter_performs(node.false_seq)
+        elif isinstance(node, BrEval):
+            for _when, when_seq in node.when_list:
+                yield from _iter_performs(when_seq)
+            yield from _iter_performs(node.other_seq)
+        elif isinstance(node, BrSeq):
+            for child in node.children:
+                yield from _iter_performs(child)
+
+    perform_patterns = [
+        {
+            "type": perf.perf_type,
+            "target": perf.target,
+            "condition": perf.condition,
+            "times": perf.times,
+            "varying_var": perf.varying_var,
+        }
+        for perf in (_iter_performs(branch_tree) if branch_tree else ())
+    ]
+
+    # ── New field: main_loop ──
+    # The first PERFORM (depth-first, pre-order) whose body holds a 'read_into'
+    # Assign is reported as the main loop; main_loop stays None if there is none.
+    def _contains_read(item):
+        """Return whether *item* is a 'read_into' Assign or a BrSeq containing one."""
+        if isinstance(item, Assign):
+            return item.source_info.get('type') == 'read_into'
+        if isinstance(item, BrSeq):
+            return any(_contains_read(child) for child in item.children)
+        return False
+
+    def _first_read_file(item):
+        """Return the 'file' of the first 'read_into' Assign under *item*, else None.
+
+        While scanning a BrSeq, reads with an empty file name are skipped.
+        """
+        if isinstance(item, Assign):
+            if item.source_info.get('type') == 'read_into':
+                return item.source_info.get('file', '')
+            return None
+        if isinstance(item, BrSeq):
+            for child in item.children:
+                found = _first_read_file(child)
+                if found:
+                    return found
+        return None
+
+    def _locate_main_loop(node):
+        """Return the main-loop descriptor found under *node*, or None."""
+        if isinstance(node, BrPerform):
+            if _contains_read(node.body_seq):
+                return {
+                    "type": node.perf_type,
+                    "read_file": _first_read_file(node.body_seq),
+                    # NOTE(review): always False in this block — AT END is not detected here.
+                    "has_at_end": False,
+                }
+            return _locate_main_loop(node.body_seq)
+        if isinstance(node, BrIf):
+            branches = (node.true_seq, node.false_seq)
+        elif isinstance(node, BrEval):
+            branches = [seq for _when, seq in node.when_list] + [node.other_seq]
+        elif isinstance(node, BrSeq):
+            branches = node.children
+        else:
+            return None
+        # Branches are tried in order; the first descriptor found wins.
+        for branch in branches:
+            hit = _locate_main_loop(branch)
+            if hit is not None:
+                return hit
+        return None
+
+    main_loop = _locate_main_loop(branch_tree) if branch_tree else None
+
+    # ── New field: if_types ──
+    if_types = {"total": 0, "comparison": 0, "equality": 0, "compound": 0, "nested_depth": 0}
+
+    def _walk_if_types(node, depth=0):
+        """Tally IF nodes under *node*; *depth* rises one per tree level (BrSeq included)."""
+        if isinstance(node, BrIf):
+            if_types["total"] += 1
+            if_types["nested_depth"] = max(depth, if_types["nested_depth"])
+            cond = node.cond_tree
+            if cond:
+                cond_leaves = collect_leaves(cond)
+                if isinstance(cond, (CondAnd, CondOr)):  # AND/OR root → compound condition
+                    if_types["compound"] += 1
+                for leaf in cond_leaves:
+                    if leaf.op in ('>', '<', '>=', '<='):
+                        if_types["comparison"] += 1
+                    elif leaf.op in ('=', '<>'):
+                        if_types["equality"] += 1
+            children = (node.true_seq, node.false_seq)
+        elif isinstance(node, BrEval):
+            children = [seq for _when, seq in node.when_list] + [node.other_seq]
+        elif isinstance(node, BrPerform):
+            children = (node.body_seq,)
+        elif isinstance(node, BrSeq):
+            children = node.children
+        else:
+            return
+        for child in children:
+            _walk_if_types(child, depth + 1)
+
+    if branch_tree:
+        _walk_if_types(branch_tree)
+
+    # ── New field: variable_patterns ──
+    # Each flag is paired with the regexes (matched case-insensitively) that
+    # switch it on when any of them is found in a field name.
+    name_rules = (
+        ("has_prev_key", (r'\bWS-PREV[-_]',)),
+        ("has_accumulator", (r'[-_]CNT\b', r'[-_]ACCUM\b')),
+        ("has_error_flag", (r'[-_]ERR\b', r'[-_]ERROR[-_]')),
+        ("has_switch", (r'[-_]SW\b', r'[-_]FLAG\b')),
+        ("has_index", (r'[-_]IDX\b', r'[-_]INDX\b', r'[-_]SUB\b')),
+        ("has_save_area", (r'[-_]SAVE[-_]', r'[-_]HOLD[-_]')),
+        ("has_counter", (r'[-_]CNT\b', r'[-_]COUNT\b')),
+    )
+    variable_patterns = {flag: False for flag, _ in name_rules}
+    variable_patterns["has_work"] = False
+
+    for f in fields_dict:
+        name = f.get('name', '')
+        for flag, regexes in name_rules:
+            if any(re.search(rx, name, re.IGNORECASE) for rx in regexes):
+                variable_patterns[flag] = True
+
+        # has_work: a name qualifies only if it starts with 'WS-' (case-sensitive
+        # check), contains none of the excluded keywords below, and then matches
+        # one of the work-area suffix/prefix patterns.
+        if not name.startswith('WS-'):
+            continue
+        if re.search(r'(?:CNT|ERR|SW|IDX|INDX|SUB|SAVE|HOLD|PREV|ACCUM)', name, re.IGNORECASE):
+            continue
+        if re.search(r'[-_]W\b|[-_]WORK\b|[-_]WK\b|^WS-W[0O]\w', name, re.IGNORECASE):
+            variable_patterns["has_work"] = True
+
+    # ── New field: open_pattern ──
+    # "open-close-open" when some CLOSE keyword sits between two OPEN keywords
+    # in proc_div; otherwise the default "sequential" is kept.
+    # NOTE(review): this is a purely textual keyword match — confirm proc_div
+    # has comments and literals stripped before it reaches this point.
+    open_pattern = "sequential"
+    if proc_div:
+        proc_upper = proc_div.upper()
+        open_positions = [m.start() for m in re.finditer(r'\bOPEN\b', proc_upper)]
+        close_positions = [m.start() for m in re.finditer(r'\bCLOSE\b', proc_upper)]
+        if open_positions and close_positions:
+            # finditer yields matches left to right, so both lists are ascending:
+            # an OPEN < CLOSE < OPEN triple exists exactly when some CLOSE lies
+            # strictly between the first and the last OPEN. One linear pass
+            # replaces the former triple-nested scan.
+            first_open = open_positions[0]
+            last_open = open_positions[-1]
+            if any(first_open < cpos < last_open for cpos in close_positions):
+                open_pattern = "open-close-open"
+
    return {
-        "paragraphs": sorted(paragraphs) if paragraphs else [],
-        "decision_points": decision_points,
-        "branch_tree": branch_tree,
-        "file_count": len(file_sec) if file_sec else 0,
-        "open_directions": open_dir,
-        "has_search_all": any('SEARCH' in str(dp.get('label', '')) for dp in decision_points),
-        "has_evaluate": any(dp['kind'] == 'EVALUATE' for dp in decision_points),
-        "has_call": 'CALL' in cobol_source.upper(),
-        "has_break": any('KEY' in str(dp.get('label', '')).upper() for dp in decision_points),
-        "total_branches": total_branches,
-        "total_paragraphs": len(paragraphs),
-        "branch_tree_obj": branch_tree,
+        "paragraphs": sorted(paragraphs) if paragraphs else [],
+        "decision_points": decision_points,
+        "branch_tree": branch_tree,
+        "file_count": len(file_sec) if file_sec else 0,
+        "open_directions": open_dir,
+        "has_search_all": any('SEARCH' in str(dp.get('label', '')) for dp in decision_points),
+        "has_evaluate": any(dp['kind'] == 'EVALUATE' for dp in decision_points),
+        "has_call": 'CALL' in cobol_source.upper(),
+        "has_break": any('KEY' in str(dp.get('label', '')).upper() for dp in decision_points),
+        "total_branches": total_branches,
+        "total_paragraphs": len(paragraphs),
+        "branch_tree_obj": branch_tree,
+        # ── Newly added structural feature fields ──
+        "select_files": select_files,
+        "open_directions_detail": open_directions_detail,
+        "has_divide": has_divide,
+        "divide_constants": divide_constants,
+        "has_inspect": has_inspect,
+        "has_string": has_string,
+        "perform_patterns": perform_patterns,
+        "main_loop": main_loop,
+        "if_types": if_types,
+        "variable_patterns": variable_patterns,
+        "open_pattern": open_pattern,
    }