feat: Phase 2 complete — 13 Phases of COBOL type classification and test benchmark
P0.6: gcov infrastructure
P1: extract_structure output expansion (11 new feature fields)
P2: Confusion group rule engine (8 pairs + contradiction + backtrack)
P3: 4-factor confidence calculation + quality gate update
P4: 33+2 COBOL program type test samples (22 files, 7 categories)
P5: parametrized/ test data generation engine
P6: japanese_data.py lookup tables
P7-10: Type-specific test suites (~159 parametrized tests)
P11: Full classification pipeline (classify_program) + orchestrator integration
P12: Documentation (module-interfaces, test-plan v3.0, coverage-matrix)

Architecture decisions:
- classification_pipeline/ merged to hina/pipeline/
- parametrized/ as independent module
- japanese_data.py as root-level file
- hina/__all__ only exports classify_program()

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
+237
-18
@@ -1,4 +1,12 @@
|
||||
"""COBOL Test Data Generator — modular entry point.

Public API:
    extract_structure()     — parse COBOL control flow → dict
    generate_data()         — generate test data → list[dict]
    incremental_supplement  — differential (incremental) data supplement → list[dict]
    check_coverage()        — coverage report → dict
"""

# The future import must be the first statement after the module docstring;
# placed inside the docstring it is plain text and has no effect.
from __future__ import annotations

import sys
import re
|
||||
@@ -11,14 +19,25 @@ from pathlib import Path
|
||||
CONFIG = {}
|
||||
|
||||
from .read import preprocess, extract_data_division, extract_procedure_division
|
||||
from .read import resolve_copybooks, parse_data_division, parse_file_section, scan_open_statements
|
||||
from .read import resolve_copybooks, parse_data_division, parse_file_section, scan_open_statements, parse_file_control
|
||||
from .core import build_branch_tree, classify_field_roles, _init_child_names
|
||||
from .cond import parse_single_condition, is_field
|
||||
from .cond import parse_single_condition, is_field, collect_leaves
|
||||
from .design import enum_paths, generate_records, _filter_stop
|
||||
from .output import output_json, output_input_files
|
||||
from .coverage import run_coverage, generate_coverage_index
|
||||
from .coverage import run_coverage, generate_coverage_index, check_coverage
|
||||
from japanese_data import generate_fullwidth_text, generate_halfwidth_katakana, generate_wareki_date
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
# Declared public API of this module: the names exported by a star-import.
# (The previous spelling ``n__all__`` defined an unrelated variable, so no
# ``__all__`` existed and the export list was silently ignored.)
__all__ = [
    "extract_structure",
    "generate_data",
    "incremental_supplement",
    "check_coverage",
    "CONFIG",
    "generate_fullwidth_text",
    "generate_halfwidth_katakana",
    "generate_wareki_date",
]
|
||||
|
||||
|
||||
# ── OCCURS expansion ──
|
||||
@@ -118,7 +137,7 @@ def main():
|
||||
fh = logging.FileHandler(log_path, encoding="utf-8", mode="w")
|
||||
fh.setLevel(logging.DEBUG)
|
||||
fh.setFormatter(logging.Formatter(
|
||||
"%(asctime)s [%(levelname)s] %(name)s: %(message)s"
|
||||
"%(asctime)s [%(levelname)s] %(name)s: %(message)s"
|
||||
))
|
||||
sh = logging.StreamHandler()
|
||||
sh.setLevel(logging.INFO)
|
||||
@@ -353,7 +372,7 @@ def extract_structure(cobol_source: str) -> dict:
|
||||
file_sec = parse_file_section(preprocessed)
|
||||
open_dir = scan_open_statements(proc_div) if proc_div else {}
|
||||
|
||||
from .models import BrIf, BrEval, BrSeq
|
||||
from .models import BrIf, BrEval, BrSeq, BrPerform, Assign, CondAnd, CondOr
|
||||
|
||||
decision_points = []
|
||||
total_branches = 0
|
||||
@@ -395,19 +414,219 @@ def extract_structure(cobol_source: str) -> dict:
|
||||
if m:
|
||||
paragraphs.add(m.group(1))
|
||||
|
||||
# ── New field: select_files ──
|
||||
select_files = parse_file_control(preprocessed)
|
||||
|
||||
# ── New field: open_directions_detail (identical to open_directions) ──
|
||||
open_directions_detail = open_dir
|
||||
|
||||
# ── New fields: has_divide / has_inspect / has_string ──
|
||||
has_divide = bool(re.search(r'\bDIVIDE\b', cobol_source.upper()))
|
||||
has_inspect = bool(re.search(r'\bINSPECT\b', cobol_source.upper()))
|
||||
has_string = bool(re.search(r'\bSTRING\b', cobol_source.upper()))
|
||||
|
||||
# ── New field: divide_constants ──
|
||||
divide_constants = []
|
||||
if has_divide and proc_div:
|
||||
for dm in re.finditer(r'\bDIVIDE\s+([\d.]+)\b', proc_div, re.IGNORECASE):
|
||||
val = dm.group(1)
|
||||
try:
|
||||
divide_constants.append(float(val))
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
# ── New field: perform_patterns ──
|
||||
perform_patterns = []
|
||||
|
||||
def _walk_performs(node):
    """Record every PERFORM node reachable from *node* in ``perform_patterns``.

    The walk is depth-first and pre-order: a ``BrPerform`` is appended before
    its body is visited. IF branches, EVALUATE arms (WHEN sequences followed
    by the OTHER sequence) and sequence children are descended into; any other
    node kind is ignored.
    """
    if isinstance(node, BrPerform):
        perform_patterns.append({
            "type": node.perf_type,
            "target": node.target,
            "condition": node.condition,
            "times": node.times,
            "varying_var": node.varying_var,
        })
        successors = [node.body_seq]
    elif isinstance(node, BrIf):
        successors = [node.true_seq, node.false_seq]
    elif isinstance(node, BrEval):
        successors = [branch for _, branch in node.when_list]
        successors.append(node.other_seq)
    elif isinstance(node, BrSeq):
        successors = node.children
    else:
        # Leaf or unknown node kind: nothing to record, nothing to descend into.
        return

    for successor in successors:
        _walk_performs(successor)
|
||||
|
||||
if branch_tree:
|
||||
_walk_performs(branch_tree)
|
||||
|
||||
# ── New field: main_loop ──
|
||||
main_loop = None
|
||||
|
||||
def _find_main_loop(node, depth=0):
|
||||
nonlocal main_loop
|
||||
if main_loop is not None:
|
||||
return
|
||||
if isinstance(node, BrPerform):
|
||||
if _perform_has_read(node):
|
||||
main_loop = {
|
||||
"type": node.perf_type,
|
||||
"read_file": _perform_read_file(node),
|
||||
"has_at_end": False,
|
||||
}
|
||||
return
|
||||
_find_main_loop(node.body_seq, depth + 1)
|
||||
elif isinstance(node, BrIf):
|
||||
_find_main_loop(node.true_seq, depth + 1)
|
||||
_find_main_loop(node.false_seq, depth + 1)
|
||||
elif isinstance(node, BrEval):
|
||||
for _, seq in node.when_list:
|
||||
_find_main_loop(seq, depth + 1)
|
||||
_find_main_loop(node.other_seq, depth + 1)
|
||||
elif isinstance(node, BrSeq):
|
||||
for c in node.children:
|
||||
_find_main_loop(c, depth + 1)
|
||||
|
||||
def _perform_has_read(perf_node):
    """Return True if the PERFORM body contains a ``read_into`` assignment.

    Only ``Assign`` nodes and (recursively) the children of ``BrSeq`` nodes are
    inspected; other node kinds are not descended into.
    """
    def _contains_read(item):
        if isinstance(item, Assign):
            return item.source_info.get('type') == 'read_into'
        if isinstance(item, BrSeq):
            return any(_contains_read(child) for child in item.children)
        return False

    return _contains_read(perf_node.body_seq)
|
||||
|
||||
def _perform_read_file(perf_node):
    """Return the file of the first ``read_into`` assignment in the PERFORM body.

    Only ``Assign`` nodes and (recursively) the children of ``BrSeq`` nodes are
    inspected. Inside a sequence, falsy results (``None`` or an empty file
    name) are skipped in favour of later children. Returns ``None`` when
    nothing is found; a body that is itself a ``read_into`` assignment without
    a ``'file'`` entry yields ``''``.
    """
    def _first_read_file(item):
        if isinstance(item, Assign):
            info = item.source_info
            if info.get('type') == 'read_into':
                return info.get('file', '')
            return None
        if isinstance(item, BrSeq):
            # map() is lazy, so the search stops at the first truthy hit.
            hits = (hit for hit in map(_first_read_file, item.children) if hit)
            return next(hits, None)
        return None

    return _first_read_file(perf_node.body_seq)
|
||||
|
||||
if branch_tree:
|
||||
_find_main_loop(branch_tree)
|
||||
|
||||
# ── New field: if_types ──
|
||||
if_types = {"total": 0, "comparison": 0, "equality": 0, "compound": 0, "nested_depth": 0}
|
||||
|
||||
def _walk_if_types(node, depth=0):
    """Accumulate IF statistics for the subtree rooted at *node* into ``if_types``.

    For every ``BrIf``: bump ``total``, raise ``nested_depth`` to the current
    ``depth`` if larger, count the IF as ``compound`` when its condition tree
    is a ``CondAnd``/``CondOr``, and classify each condition leaf by operator
    (``comparison`` for > < >= <=, ``equality`` for = <>).

    ``depth`` grows by one on every descent, including through sequence,
    EVALUATE and PERFORM nodes, so ``nested_depth`` is the tree depth of the
    deepest IF rather than a count of enclosing IFs.
    """
    if isinstance(node, BrIf):
        if_types["total"] += 1
        if depth > if_types["nested_depth"]:
            if_types["nested_depth"] = depth

        cond = node.cond_tree
        if cond:
            cond_leaves = collect_leaves(cond)
            # Compound means the root of the condition is AND/OR, not a single leaf.
            if isinstance(cond, (CondAnd, CondOr)):
                if_types["compound"] += 1
            for leaf in cond_leaves:
                operator = leaf.op
                if operator in ('>', '<', '>=', '<='):
                    if_types["comparison"] += 1
                elif operator in ('=', '<>'):
                    if_types["equality"] += 1

        successors = [node.true_seq, node.false_seq]
    elif isinstance(node, BrEval):
        successors = [branch for _, branch in node.when_list]
        successors.append(node.other_seq)
    elif isinstance(node, BrPerform):
        successors = [node.body_seq]
    elif isinstance(node, BrSeq):
        successors = node.children
    else:
        return

    next_depth = depth + 1
    for successor in successors:
        _walk_if_types(successor, next_depth)
|
||||
|
||||
if branch_tree:
|
||||
_walk_if_types(branch_tree)
|
||||
|
||||
# ── New field: variable_patterns ──
|
||||
variable_patterns = {
|
||||
"has_prev_key": False,
|
||||
"has_accumulator": False,
|
||||
"has_error_flag": False,
|
||||
"has_switch": False,
|
||||
"has_index": False,
|
||||
"has_save_area": False,
|
||||
"has_counter": False,
|
||||
"has_work": False,
|
||||
}
|
||||
for f in fields_dict:
|
||||
name = f.get('name', '')
|
||||
if re.search(r'\bWS-PREV[-_]', name, re.IGNORECASE):
|
||||
variable_patterns["has_prev_key"] = True
|
||||
if re.search(r'[-_]CNT\b', name, re.IGNORECASE) or re.search(r'[-_]ACCUM\b', name, re.IGNORECASE):
|
||||
variable_patterns["has_accumulator"] = True
|
||||
if re.search(r'[-_]ERR\b', name, re.IGNORECASE) or re.search(r'[-_]ERROR[-_]', name, re.IGNORECASE):
|
||||
variable_patterns["has_error_flag"] = True
|
||||
if re.search(r'[-_]SW\b', name, re.IGNORECASE) or re.search(r'[-_]FLAG\b', name, re.IGNORECASE):
|
||||
variable_patterns["has_switch"] = True
|
||||
if re.search(r'[-_]IDX\b', name, re.IGNORECASE) or re.search(r'[-_]INDX\b', name, re.IGNORECASE) or re.search(r'[-_]SUB\b', name, re.IGNORECASE):
|
||||
variable_patterns["has_index"] = True
|
||||
if re.search(r'[-_]SAVE[-_]', name, re.IGNORECASE) or re.search(r'[-_]HOLD[-_]', name, re.IGNORECASE):
|
||||
variable_patterns["has_save_area"] = True
|
||||
if re.search(r'[-_]CNT\b', name, re.IGNORECASE) or re.search(r'[-_]COUNT\b', name, re.IGNORECASE):
|
||||
variable_patterns["has_counter"] = True
|
||||
if name.startswith('WS-') and not re.search(r'(?:CNT|ERR|SW|IDX|INDX|SUB|SAVE|HOLD|PREV|ACCUM)', name, re.IGNORECASE):
|
||||
if re.search(r'[-_]W\b|[-_]WORK\b|[-_]WK\b|^WS-W[0O]\w', name, re.IGNORECASE):
|
||||
variable_patterns["has_work"] = True
|
||||
|
||||
# ── New field: open_pattern ──
|
||||
open_pattern = "sequential"
|
||||
if proc_div:
|
||||
proc_upper = proc_div.upper()
|
||||
open_positions = [m.start() for m in re.finditer(r'\bOPEN\b', proc_upper)]
|
||||
close_positions = [m.start() for m in re.finditer(r'\bCLOSE\b', proc_upper)]
|
||||
if open_positions and close_positions:
|
||||
# Check OPEN ... CLOSE ... OPEN sequence
|
||||
for i, opos in enumerate(open_positions):
|
||||
for cpos in close_positions:
|
||||
if cpos > opos:
|
||||
for opos2 in open_positions:
|
||||
if opos2 > cpos:
|
||||
open_pattern = "open-close-open"
|
||||
break
|
||||
if open_pattern == "open-close-open":
|
||||
break
|
||||
if open_pattern == "open-close-open":
|
||||
break
|
||||
|
||||
return {
|
||||
"paragraphs": sorted(paragraphs) if paragraphs else [],
|
||||
"decision_points": decision_points,
|
||||
"branch_tree": branch_tree,
|
||||
"file_count": len(file_sec) if file_sec else 0,
|
||||
"open_directions": open_dir,
|
||||
"has_search_all": any('SEARCH' in str(dp.get('label', '')) for dp in decision_points),
|
||||
"has_evaluate": any(dp['kind'] == 'EVALUATE' for dp in decision_points),
|
||||
"has_call": 'CALL' in cobol_source.upper(),
|
||||
"has_break": any('KEY' in str(dp.get('label', '')).upper() for dp in decision_points),
|
||||
"total_branches": total_branches,
|
||||
"total_paragraphs": len(paragraphs),
|
||||
"branch_tree_obj": branch_tree,
|
||||
"paragraphs": sorted(paragraphs) if paragraphs else [],
|
||||
"decision_points": decision_points,
|
||||
"branch_tree": branch_tree,
|
||||
"file_count": len(file_sec) if file_sec else 0,
|
||||
"open_directions": open_dir,
|
||||
"has_search_all": any('SEARCH' in str(dp.get('label', '')) for dp in decision_points),
|
||||
"has_evaluate": any(dp['kind'] == 'EVALUATE' for dp in decision_points),
|
||||
"has_call": 'CALL' in cobol_source.upper(),
|
||||
"has_break": any('KEY' in str(dp.get('label', '')).upper() for dp in decision_points),
|
||||
"total_branches": total_branches,
|
||||
"total_paragraphs": len(paragraphs),
|
||||
"branch_tree_obj": branch_tree,
|
||||
# ── Newly added structural feature fields (11 keys) ──
|
||||
"select_files": select_files,
|
||||
"open_directions_detail": open_directions_detail,
|
||||
"has_divide": has_divide,
|
||||
"divide_constants": divide_constants,
|
||||
"has_inspect": has_inspect,
|
||||
"has_string": has_string,
|
||||
"perform_patterns": perform_patterns,
|
||||
"main_loop": main_loop,
|
||||
"if_types": if_types,
|
||||
"variable_patterns": variable_patterns,
|
||||
"open_pattern": open_pattern,
|
||||
}
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user