feat: Phase 2 complete — 13 Phases of COBOL type classification and test benchmark

P0.6: gcov infrastructure
P1: extract_structure output expansion (11 new feature fields)
P2: Confusion group rule engine (8 pairs + contradiction + backtrack)
P3: 4-factor confidence calculation + quality gate update
P4: 33+2 COBOL program type test samples (22 files, 7 categories)
P5: parametrized/ test data generation engine
P6: japanese_data.py lookup tables
P7-10: Type-specific test suites (~159 parametrized tests)
P11: Full classification pipeline (classify_program) + orchestrator integration
P12: Documentation (module-interfaces, test-plan v3.0, coverage-matrix)

Architecture decisions:
- classification_pipeline/ merged to hina/pipeline/
- parametrized/ as independent module
- japanese_data.py as root-level file
- hina/__all__ only exports classify_program()

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
hangshuo652
2026-06-19 23:51:55 +08:00
parent 63b5284715
commit bc1d56d1a4
129 changed files with 19378 additions and 261 deletions
+237 -18
View File
@@ -1,4 +1,12 @@
"""COBOL Test Data Generator — 模块化版入口"""
"""COBOL Test Data Generator — 模块化版入口
from __future__ import annotations
公开 API:
extract_structure() — 解析 COBOL 控制流 → dict
generate_data() — 生成测试数据 → list[dict]
incremental_supplement — 差分补充数据 → list[dict]
check_coverage() — 覆盖率报告 → dict
"""
import sys
import re
@@ -11,14 +19,25 @@ from pathlib import Path
CONFIG = {}
from .read import preprocess, extract_data_division, extract_procedure_division
from .read import resolve_copybooks, parse_data_division, parse_file_section, scan_open_statements
from .read import resolve_copybooks, parse_data_division, parse_file_section, scan_open_statements, parse_file_control
from .core import build_branch_tree, classify_field_roles, _init_child_names
from .cond import parse_single_condition, is_field
from .cond import parse_single_condition, is_field, collect_leaves
from .design import enum_paths, generate_records, _filter_stop
from .output import output_json, output_input_files
from .coverage import run_coverage, generate_coverage_index
from .coverage import run_coverage, generate_coverage_index, check_coverage
from japanese_data import generate_fullwidth_text, generate_halfwidth_katakana, generate_wareki_date
logger = logging.getLogger(__name__)
n__all__ = [
"extract_structure",
"generate_data",
"incremental_supplement",
"check_coverage",
"CONFIG",
"generate_fullwidth_text",
"generate_halfwidth_katakana",
"generate_wareki_date",
]
# ── OCCURS 展开 ──
@@ -118,7 +137,7 @@ def main():
fh = logging.FileHandler(log_path, encoding="utf-8", mode="w")
fh.setLevel(logging.DEBUG)
fh.setFormatter(logging.Formatter(
"%(asctime)s [%(levelname)s] %(name)s: %(message)s"
"%(asctime)s [%(levelname)s] %(name)s: %(message)s"
))
sh = logging.StreamHandler()
sh.setLevel(logging.INFO)
@@ -353,7 +372,7 @@ def extract_structure(cobol_source: str) -> dict:
file_sec = parse_file_section(preprocessed)
open_dir = scan_open_statements(proc_div) if proc_div else {}
from .models import BrIf, BrEval, BrSeq
from .models import BrIf, BrEval, BrSeq, BrPerform, Assign, CondAnd, CondOr
decision_points = []
total_branches = 0
@@ -395,19 +414,219 @@ def extract_structure(cobol_source: str) -> dict:
if m:
paragraphs.add(m.group(1))
# ── 新增字段: select_files ──
select_files = parse_file_control(preprocessed)
# ── 新增字段: open_directions_detail (与 open_directions 一致) ──
open_directions_detail = open_dir
# ── 新增字段: has_divide / has_inspect / has_string ──
has_divide = bool(re.search(r'\bDIVIDE\b', cobol_source.upper()))
has_inspect = bool(re.search(r'\bINSPECT\b', cobol_source.upper()))
has_string = bool(re.search(r'\bSTRING\b', cobol_source.upper()))
# ── 新增字段: divide_constants ──
divide_constants = []
if has_divide and proc_div:
for dm in re.finditer(r'\bDIVIDE\s+([\d.]+)\b', proc_div, re.IGNORECASE):
val = dm.group(1)
try:
divide_constants.append(float(val))
except ValueError:
pass
# ── 新增字段: perform_patterns ──
perform_patterns = []
def _walk_performs(node):
if isinstance(node, BrPerform):
entry = {
"type": node.perf_type,
"target": node.target,
"condition": node.condition,
"times": node.times,
"varying_var": node.varying_var,
}
perform_patterns.append(entry)
_walk_performs(node.body_seq)
elif isinstance(node, BrIf):
_walk_performs(node.true_seq)
_walk_performs(node.false_seq)
elif isinstance(node, BrEval):
for _, seq in node.when_list:
_walk_performs(seq)
_walk_performs(node.other_seq)
elif isinstance(node, BrSeq):
for c in node.children:
_walk_performs(c)
if branch_tree:
_walk_performs(branch_tree)
# ── 新增字段: main_loop ──
main_loop = None
def _find_main_loop(node, depth=0):
nonlocal main_loop
if main_loop is not None:
return
if isinstance(node, BrPerform):
if _perform_has_read(node):
main_loop = {
"type": node.perf_type,
"read_file": _perform_read_file(node),
"has_at_end": False,
}
return
_find_main_loop(node.body_seq, depth + 1)
elif isinstance(node, BrIf):
_find_main_loop(node.true_seq, depth + 1)
_find_main_loop(node.false_seq, depth + 1)
elif isinstance(node, BrEval):
for _, seq in node.when_list:
_find_main_loop(seq, depth + 1)
_find_main_loop(node.other_seq, depth + 1)
elif isinstance(node, BrSeq):
for c in node.children:
_find_main_loop(c, depth + 1)
def _perform_has_read(perf_node):
def _walk_seq(seq):
if isinstance(seq, Assign):
if seq.source_info.get('type') == 'read_into':
return True
elif isinstance(seq, BrSeq):
for ch in seq.children:
if _walk_seq(ch):
return True
return False
return _walk_seq(perf_node.body_seq)
def _perform_read_file(perf_node):
def _walk_seq(seq):
if isinstance(seq, Assign):
if seq.source_info.get('type') == 'read_into':
return seq.source_info.get('file', '')
elif isinstance(seq, BrSeq):
for ch in seq.children:
result = _walk_seq(ch)
if result:
return result
return None
return _walk_seq(perf_node.body_seq)
if branch_tree:
_find_main_loop(branch_tree)
# ── 新增字段: if_types ──
if_types = {"total": 0, "comparison": 0, "equality": 0, "compound": 0, "nested_depth": 0}
def _walk_if_types(node, depth=0):
if isinstance(node, BrIf):
if_types["total"] += 1
if_types["nested_depth"] = max(if_types["nested_depth"], depth)
ct = node.cond_tree
if ct:
leaves = collect_leaves(ct)
# Check compound: cond_tree is CondAnd or CondOr (not just CondLeaf)
if isinstance(ct, (CondAnd, CondOr)):
if_types["compound"] += 1
for leaf in leaves:
if leaf.op in ('>', '<', '>=', '<='):
if_types["comparison"] += 1
elif leaf.op in ('=', '<>'):
if_types["equality"] += 1
_walk_if_types(node.true_seq, depth + 1)
_walk_if_types(node.false_seq, depth + 1)
elif isinstance(node, BrEval):
for _, seq in node.when_list:
_walk_if_types(seq, depth + 1)
_walk_if_types(node.other_seq, depth + 1)
elif isinstance(node, BrPerform):
_walk_if_types(node.body_seq, depth + 1)
elif isinstance(node, BrSeq):
for c in node.children:
_walk_if_types(c, depth + 1)
if branch_tree:
_walk_if_types(branch_tree)
# ── 新增字段: variable_patterns ──
variable_patterns = {
"has_prev_key": False,
"has_accumulator": False,
"has_error_flag": False,
"has_switch": False,
"has_index": False,
"has_save_area": False,
"has_counter": False,
"has_work": False,
}
for f in fields_dict:
name = f.get('name', '')
if re.search(r'\bWS-PREV[-_]', name, re.IGNORECASE):
variable_patterns["has_prev_key"] = True
if re.search(r'[-_]CNT\b', name, re.IGNORECASE) or re.search(r'[-_]ACCUM\b', name, re.IGNORECASE):
variable_patterns["has_accumulator"] = True
if re.search(r'[-_]ERR\b', name, re.IGNORECASE) or re.search(r'[-_]ERROR[-_]', name, re.IGNORECASE):
variable_patterns["has_error_flag"] = True
if re.search(r'[-_]SW\b', name, re.IGNORECASE) or re.search(r'[-_]FLAG\b', name, re.IGNORECASE):
variable_patterns["has_switch"] = True
if re.search(r'[-_]IDX\b', name, re.IGNORECASE) or re.search(r'[-_]INDX\b', name, re.IGNORECASE) or re.search(r'[-_]SUB\b', name, re.IGNORECASE):
variable_patterns["has_index"] = True
if re.search(r'[-_]SAVE[-_]', name, re.IGNORECASE) or re.search(r'[-_]HOLD[-_]', name, re.IGNORECASE):
variable_patterns["has_save_area"] = True
if re.search(r'[-_]CNT\b', name, re.IGNORECASE) or re.search(r'[-_]COUNT\b', name, re.IGNORECASE):
variable_patterns["has_counter"] = True
if name.startswith('WS-') and not re.search(r'(?:CNT|ERR|SW|IDX|INDX|SUB|SAVE|HOLD|PREV|ACCUM)', name, re.IGNORECASE):
if re.search(r'[-_]W\b|[-_]WORK\b|[-_]WK\b|^WS-W[0O]\w', name, re.IGNORECASE):
variable_patterns["has_work"] = True
# ── 新增字段: open_pattern ──
open_pattern = "sequential"
if proc_div:
proc_upper = proc_div.upper()
open_positions = [m.start() for m in re.finditer(r'\bOPEN\b', proc_upper)]
close_positions = [m.start() for m in re.finditer(r'\bCLOSE\b', proc_upper)]
if open_positions and close_positions:
# Check OPEN ... CLOSE ... OPEN sequence
for i, opos in enumerate(open_positions):
for cpos in close_positions:
if cpos > opos:
for opos2 in open_positions:
if opos2 > cpos:
open_pattern = "open-close-open"
break
if open_pattern == "open-close-open":
break
if open_pattern == "open-close-open":
break
return {
"paragraphs": sorted(paragraphs) if paragraphs else [],
"decision_points": decision_points,
"branch_tree": branch_tree,
"file_count": len(file_sec) if file_sec else 0,
"open_directions": open_dir,
"has_search_all": any('SEARCH' in str(dp.get('label', '')) for dp in decision_points),
"has_evaluate": any(dp['kind'] == 'EVALUATE' for dp in decision_points),
"has_call": 'CALL' in cobol_source.upper(),
"has_break": any('KEY' in str(dp.get('label', '')).upper() for dp in decision_points),
"total_branches": total_branches,
"total_paragraphs": len(paragraphs),
"branch_tree_obj": branch_tree,
"paragraphs": sorted(paragraphs) if paragraphs else [],
"decision_points": decision_points,
"branch_tree": branch_tree,
"file_count": len(file_sec) if file_sec else 0,
"open_directions": open_dir,
"has_search_all": any('SEARCH' in str(dp.get('label', '')) for dp in decision_points),
"has_evaluate": any(dp['kind'] == 'EVALUATE' for dp in decision_points),
"has_call": 'CALL' in cobol_source.upper(),
"has_break": any('KEY' in str(dp.get('label', '')).upper() for dp in decision_points),
"total_branches": total_branches,
"total_paragraphs": len(paragraphs),
"branch_tree_obj": branch_tree,
# ── 新增 8 类结构特征 ──
"select_files": select_files,
"open_directions_detail": open_directions_detail,
"has_divide": has_divide,
"divide_constants": divide_constants,
"has_inspect": has_inspect,
"has_string": has_string,
"perform_patterns": perform_patterns,
"main_loop": main_loop,
"if_types": if_types,
"variable_patterns": variable_patterns,
"open_pattern": open_pattern,
}