From 0730045e278a009ec20f4db8c8499403f350e325 Mon Sep 17 00:00:00 2001 From: hangshuo652 Date: Mon, 8 Jun 2026 21:07:16 +0800 Subject: [PATCH] add cobol_testgen module --- cobol_testgen/.gitignore | 4 + cobol_testgen/__init__.py | 312 ++++ cobol_testgen/__main__.py | 4 + cobol_testgen/agents.py | 308 ++++ cobol_testgen/cond.py | 258 +++ cobol_testgen/core.py | 1465 +++++++++++++++++ cobol_testgen/coverage.py | 1121 +++++++++++++ cobol_testgen/design.py | 775 +++++++++ cobol_testgen/grammar.lark | 35 + cobol_testgen/models.py | 151 ++ cobol_testgen/output.py | 118 ++ cobol_testgen/prompts/parse_proc_division.txt | 596 +++++++ cobol_testgen/read.py | 439 +++++ 13 files changed, 5586 insertions(+) create mode 100644 cobol_testgen/.gitignore create mode 100644 cobol_testgen/__init__.py create mode 100644 cobol_testgen/__main__.py create mode 100644 cobol_testgen/agents.py create mode 100644 cobol_testgen/cond.py create mode 100644 cobol_testgen/core.py create mode 100644 cobol_testgen/coverage.py create mode 100644 cobol_testgen/design.py create mode 100644 cobol_testgen/grammar.lark create mode 100644 cobol_testgen/models.py create mode 100644 cobol_testgen/output.py create mode 100644 cobol_testgen/prompts/parse_proc_division.txt create mode 100644 cobol_testgen/read.py diff --git a/cobol_testgen/.gitignore b/cobol_testgen/.gitignore new file mode 100644 index 0000000..f162b54 --- /dev/null +++ b/cobol_testgen/.gitignore @@ -0,0 +1,4 @@ +__pycache__/ +.pytest_cache/ +*.pyc +test_output/ diff --git a/cobol_testgen/__init__.py b/cobol_testgen/__init__.py new file mode 100644 index 0000000..90b13b3 --- /dev/null +++ b/cobol_testgen/__init__.py @@ -0,0 +1,312 @@ +"""COBOL Test Data Generator — 模块化版入口""" + +import sys +import logging +from datetime import datetime +from pathlib import Path + +# ── 配置(必须放在本地模块导入之前,避免循环导入) ── + +CONFIG = { + "proc_parser": "rule", # "rule" | "ai" + "llm_generator": False, # True=启用LLM路径生成; False=纯规则引擎 +} + +from .read import preprocess, 
extract_data_division, extract_procedure_division +from .read import resolve_copybooks, parse_data_division, parse_file_section, scan_open_statements +from .core import build_branch_tree, classify_field_roles, _init_child_names +from .cond import parse_single_condition, is_field +from .design import enum_paths, generate_records, _filter_stop +from .output import output_json, output_input_files +from . import agents +from .coverage import run_coverage, generate_coverage_index + +logger = logging.getLogger(__name__) + + +# ── OCCURS 展开 ── + + +def _add_subscript(name, occ): + """追加或扩展下标:WS-CELL → WS-CELL(1), WS-CELL(1) → WS-CELL(1,2)""" + if name.endswith(')'): + return name[:-1] + f',{occ})' + return name + f'({occ})' + + +def expand_occurs(fields): + """展开 OCCURS 字段为下标副本。递归处理嵌套 OCCURS。""" + result = [] + i = 0 + while i < len(fields): + f = fields[i] + if f.get('occurs', 0) > 0 and not f.get('is_88'): + children = [] + j = i + 1 + while j < len(fields): + child = fields[j] + if child.get('is_88'): + children.append(child) + j += 1 + continue + if child['level'] <= f['level'] or child.get('level') == 77: + break + children.append(child) + j += 1 + + if children: + group = dict(f) + group['occurs'] = 0 + result.append(group) + for occ in range(1, f['occurs'] + 1): + for child in children: + copy = dict(child) + if child.get('occurs', 0) == 0: + copy['occurs'] = 0 + copy['occurs_depending'] = f.get('occurs_depending') + if child.get('is_88'): + parent = child.get('parent') or f['name'] + copy['parent'] = _add_subscript(parent, occ) + copy['name'] = _add_subscript(child['name'], occ) + else: + copy['name'] = _add_subscript(child['name'], occ) + result.append(copy) + else: + for occ in range(1, f['occurs'] + 1): + copy = dict(f) + copy['name'] = _add_subscript(f['name'], occ) + copy['occurs'] = 0 + copy['occurs_depending'] = f.get('occurs_depending') + result.append(copy) + + i = j + else: + result.append(f) + i += 1 + + if any(f.get('occurs', 0) > 0 for f in result): + 
def main():
    """Run the generator from the command line.

    Arguments are read from ``sys.argv[1:]``. Every argument that is an
    existing directory is taken as the output directory (the last one wins),
    every argument with a ``.cbl``/``.cob``/``.cpy`` suffix (any case) is a
    COBOL source, and anything else is reported and skipped. Without an
    explicit directory, output goes next to the first COBOL file.

    For each source the DATA and PROCEDURE divisions are parsed, branch paths
    are enumerated, and the results are handed to ``run_coverage``,
    ``generate_records``, ``output_json`` and ``output_input_files``. Once all
    sources are processed ``generate_coverage_index`` is called with the
    collected coverage results.

    Exits with status 1 on a usage error or when no COBOL file was given.
    """
    if len(sys.argv) < 2:
        print("用法: python -m cobol_testgen [cobol文件2 ...] [输出目录]")
        sys.exit(1)

    args = sys.argv[1:]

    # Split the arguments into COBOL sources and the output directory.
    cobol_files = []
    outdir = None
    for a in args:
        p = Path(a)
        if p.is_dir():
            outdir = p
        elif p.suffix.upper() in ('.CBL', '.COB', '.CPY'):
            cobol_files.append(p)
        else:
            print(f"警告:跳过未知参数 {a}")
    if not cobol_files:
        print("错误:未找到任何 COBOL 文件")
        sys.exit(1)
    if outdir is None:
        outdir = cobol_files[0].parent

    # Configure the root logger: DEBUG and above to a timestamped file,
    # INFO and above to the console.
    outdir.mkdir(parents=True, exist_ok=True)
    log_path = outdir / f"cobol_testgen_{datetime.now():%Y%m%d_%H%M%S}.log"
    fh = logging.FileHandler(log_path, encoding="utf-8", mode="w")
    fh.setLevel(logging.DEBUG)
    fh.setFormatter(logging.Formatter(
        "%(asctime)s [%(levelname)s] %(name)s: %(message)s"
    ))
    sh = logging.StreamHandler()
    sh.setLevel(logging.INFO)
    sh.setFormatter(logging.Formatter("%(message)s"))
    root_logger = logging.getLogger()
    root_logger.setLevel(logging.DEBUG)
    root_logger.addHandler(fh)
    root_logger.addHandler(sh)

    programs = []

    for filepath in cobol_files:
        if not filepath.exists():
            logger.error(f"错误:文件不存在 {filepath}")
            continue

        source = filepath.read_text(encoding='utf-8')
        source = resolve_copybooks(source, str(filepath.parent))
        preprocessed = preprocess(source)
        file_sec = parse_file_section(preprocessed)

        # Parse the DATA DIVISION.
        data_div = extract_data_division(preprocessed)
        if not data_div:
            logger.error(f"错误:{filepath.name} 中没有 DATA DIVISION。")
            continue

        data_fields = parse_data_division(data_div)
        if not data_fields:
            logger.error(f"错误:{filepath.name} 中没有找到含 PIC 的字段。")
            continue

        # Convert FieldDef objects into plain dicts.
        fields_dict = []
        parent_pic = {}
        filler_counter = 0
        for f in data_fields:
            pi = f.pic_info
            name = f.name
            if name == 'FILLER':
                # Keep the first FILLER as-is and number the following ones
                # so that every entry has a distinct name.
                filler_counter += 1
                if filler_counter > 1:
                    name = f'FILLER_{filler_counter}'
            entry = {
                'name': name,
                'level': f.level,
                'pic': f.pic,
                'pic_info': {
                    'type': pi.type if pi else 'unknown',
                    'digits': pi.digits if pi else 0,
                    'decimal': pi.decimal if pi else 0,
                    'length': pi.length if pi else 0,
                    'signed': pi.signed if pi else False,
                },
                'value': f.value,
                'values': f.values,
                'section': f.section,
                'is_filler': f.is_filler,
                'redefines': f.redefines,
                'usage': f.usage,
                'occurs': f.occurs_count,
                'occurs_depending': f.occurs_depending,
            }
            if f.is_88:
                entry['is_88'] = True
                entry['parent'] = f.parent
                # An 88-level has no PIC of its own: copy the parent's
                # pic_info so a value can be generated for it.
                if f.parent and f.parent in parent_pic:
                    entry['pic_info'] = dict(parent_pic[f.parent])
            else:
                parent_pic[name] = entry['pic_info']
            fields_dict.append(entry)

        fields_dict = expand_occurs(fields_dict)

        # Build the FD -> fields and field -> FD mappings.
        fd_fields = {}
        field_to_fd = {}
        if file_sec:
            for fd_name, rec_names in file_sec.items():
                fds = []
                seen = set()
                for rec in rec_names:
                    if rec not in seen:
                        fds.append(rec)
                        seen.add(rec)
                    for child in _init_child_names(rec, fields_dict):
                        if child not in seen:
                            fds.append(child)
                            seen.add(child)
                fd_fields[fd_name] = fds
                for child in fds:
                    field_to_fd[child] = fd_name

        logger.info(f"\n========== {filepath.name} ==========")
        logger.info(f"\n字段列表:")
        logger.info(f"{'层级':<6} {'名称':<25} {'PIC':<15} {'类型':<12} {'长度':<5}")
        logger.info("-" * 65)
        for f in fields_dict:
            pi = f['pic_info']
            t = pi.get('type', '?')
            # Numeric width when there is one, otherwise the declared length.
            width = pi.get('digits', 0) + pi.get('decimal', 0) or pi.get('length', 0)
            pic_display = str(f.get('pic', '')) if f.get('pic') else ('88-level' if f.get('is_88') else '')
            logger.info(f"{f['level']:<6} {f['name']:<25} {pic_display:<15} {t:<12} {width:<5}")

        # Parse the PROCEDURE DIVISION.
        proc_div = extract_procedure_division(preprocessed)
        assignments = {}

        if proc_div:
            if CONFIG["proc_parser"] == "ai":
                try:
                    result = agents.parse_proc_division_ai(proc_div, fields_dict)
                    branch_tree, assignments = result
                except NotImplementedError:
                    # The AI parser is unavailable: use the rule-based parser.
                    branch_tree, assignments = build_branch_tree(proc_div, fields_dict)
            else:
                branch_tree, assignments = build_branch_tree(proc_div, fields_dict)

            roles = classify_field_roles(branch_tree, assignments, fields_dict,
                                         source=preprocessed, proc_text=proc_div)
            logger.info(f"\n字段角色(输入/输出/出入/未用):")
            for f in fields_dict:
                if f.get('is_88'):
                    continue
                logger.info(f" {f['name']:<30} {roles.get(f['name'], '?')}")

            branch_paths_with_assigns = enum_paths(branch_tree, fields_dict)
            branch_paths_with_assigns = [
                (_filter_stop(c), a) for c, a in branch_paths_with_assigns
            ]

        # Resolve the OPEN direction of each file.
        open_dir = scan_open_statements(proc_div) if proc_div else {}

        if proc_div:
            logger.info(f"\n分支路径数:{len(branch_paths_with_assigns)}")
            for i, (path_cons, _path_assign) in enumerate(branch_paths_with_assigns):
                descs = []
                for c in path_cons:
                    if len(c) == 4:
                        field, op, val, want = c
                        if op == 'not_in':
                            descs.append(f"{field} not in {val}")
                        else:
                            descs.append(f"{field} {op} {val} ({'T' if want else 'F'})")
                logger.debug(f" 路径 {i + 1}: {', '.join(descs)}")
        else:
            logger.warning("\n没有找到 PROCEDURE DIVISION。")
            # Parse an empty body so branch_tree is always bound for
            # run_coverage() below. It used to be unbound on this path
            # (UnboundLocalError), or stale from the previous source file.
            branch_tree, assignments = build_branch_tree('', fields_dict)
            branch_paths_with_assigns = [([], {})]
            roles = {f['name']: 'unused' for f in fields_dict}

        # Coverage report; the original source text is passed in for line lookup.
        cov_prefix = str(outdir / filepath.stem)
        index_relpath = 'coverage/index.html'
        cov_result = run_coverage(branch_tree, branch_paths_with_assigns, fields_dict,
                                  source, cov_prefix, index_relpath=index_relpath)

        records = generate_records(branch_paths_with_assigns, fields_dict, assignments, file_sec=file_sec)

        # Write the complete JSON file.
        outpath = outdir / (filepath.stem + '.json')
        output_json(records, outpath, roles,
                    fd_fields=fd_fields, field_to_fd=field_to_fd,
                    open_dir=open_dir,
                    path_cons_list=[c for c, a in branch_paths_with_assigns])

        # Write the input JSON files, split per FD.
        output_input_files(records, outdir, filepath.stem, roles,
                           fd_fields, field_to_fd, open_dir)

        logger.info(f"\n输出:{outpath}({len(records)} 条记录)")
        logger.debug(f"\n记录明细:")
        for i, rec in enumerate(records, 1):
            vals = []
            for f in fields_dict:
                r = roles.get(f['name'], '?')
                marker = f"[{r[0].upper()}]" if r != '?' and r != 'unused' else ''
                vals.append(f"{marker}{f['name']}={rec.get(f['name'], '?')}")
            logger.debug(f" 记录 {i}: {' | '.join(vals)}")

        programs.append(cov_result)

    # Build the overall coverage index page.
    if programs:
        generate_coverage_index(programs, outdir)
        logger.info(f"\n覆盖率总览:{outdir / 'coverage' / 'index.html'}")
branch_tree, assignments + + +def _build_prompt(proc_text: str, fields: list = None) -> list[dict]: + system = PROMPT_FILE.read_text(encoding="utf-8") + + fields_json = json.dumps(fields, ensure_ascii=False, indent=2) if fields else "[]" + + user = f"""## PROCEDURE DIVISION 源码 + +``` +{proc_text} +``` + +## DATA DIVISION 字段列表 + +```json +{fields_json} +``` +""" + + return [ + {"role": "system", "content": system}, + {"role": "user", "content": user}, + ] + + +def _call_llm(messages: list[dict], api_key: str) -> str: + try: + from openai import OpenAI + except ImportError: + raise NotImplementedError( + "openai package not installed. Run: pip install openai" + ) + + client = OpenAI(api_key=api_key, base_url=DEEPSEEK_BASE_URL) + response = client.chat.completions.create( + model=DEEPSEEK_MODEL, + messages=messages, + temperature=0.1, + max_tokens=8192, + ) + return response.choices[0].message.content or "" + + +def _extract_json(text: str) -> dict | None: + stripped = text.strip() + # Try extracting from markdown code block first + m = re.search(r"```(?:json)?\s*\n?(.*?)\n?```", stripped, re.DOTALL) + if m: + stripped = m.group(1).strip() + try: + return json.loads(stripped) + except json.JSONDecodeError: + return None + + +def _json_to_tree(data: dict): + node_type = data.get("type", "seq") + + if node_type == "seq": + node = BrSeq() + for child_data in data.get("children", []): + child = _json_to_tree(child_data) + if child is not None: + node.add(child) + return node + + if node_type == "if": + node = BrIf(data.get("condition", "")) + node.true_seq = _json_to_tree(data.get("true_seq", {"type": "seq", "children": []})) + node.false_seq = _json_to_tree(data.get("false_seq", {"type": "seq", "children": []})) + return node + + if node_type == "eval": + node = BrEval(data.get("subject", "")) + for w in data.get("when_list", []): + node.when_list.append((w.get("value", ""), _json_to_tree(w.get("seq", {"type": "seq", "children": []})))) + node.other_seq = 
def llm_generate_all_paths(tree_root, fields) -> list | None:
    """Ask the LLM for MC/DC test paths covering the whole control-flow tree.

    Returns a list of ``(constraints, assignments)`` tuples, or ``None`` when
    the API key environment variable is unset, the tree serializes to nothing,
    the reply carries no ``"paths"`` entry, or the request or parsing fails.
    Failures are logged as warnings instead of being silently dropped; they
    never propagate to the caller.
    """
    # Local import: this module does not define a module-level logger.
    import logging

    api_key = os.environ.get(DEEPSEEK_API_KEY_ENV)
    if not api_key:
        return None

    tree_json = _serialize_tree_for_llm(tree_root)
    if tree_json is None:
        return None

    level88_map = _extract_88_mapping(fields)
    messages = _build_path_prompt(tree_json, fields, level88_map)

    try:
        response = _call_llm(messages, api_key)
        data = _extract_json(response)
        if data and "paths" in data:
            return _parse_llm_paths(data["paths"])
    except Exception:
        # Best-effort boundary: any failure means "no LLM paths", but the
        # cause is recorded so that it can be diagnosed.
        logging.getLogger(__name__).warning(
            "LLM path generation failed; returning None", exc_info=True
        )
    return None
_serialize_tree_for_llm(node.false_seq) or {"type": "seq", "children": []}, + } + + if isinstance(node, BrEval): + when_list = [] + for val, seq in node.when_list: + s = _serialize_tree_for_llm(seq) + when_list.append({"value": val, "seq": s or {"type": "seq", "children": []}}) + return { + "type": "eval", + "subject": node.subject, + "when_list": when_list, + "other_seq": _serialize_tree_for_llm(node.other_seq) or {"type": "seq", "children": []}, + "has_other": node.has_other, + } + + if isinstance(node, BrPerform): + result = {"type": "perform", "perf_type": node.perf_type} + for attr in ("condition", "target", "thru", "times", + "varying_var", "varying_from", "varying_by"): + val = getattr(node, attr, None) + if val is not None: + result[attr] = val + if node.body_seq: + bs = _serialize_tree_for_llm(node.body_seq) + if bs: + result["body_seq"] = bs + return result + + # Assign / CallNode / ExitNode / GoTo — 不影响路径生成,可省略 + return None + + +def _extract_88_mapping(fields): + mapping = {} + for f in fields: + if f.get('is_88'): + mapping[f['name']] = { + "parent": f['parent'], + "value": f['value'], + "pic_info": f.get('pic_info', {}), + } + return mapping + + +def _build_path_prompt(tree_json, fields, level88_map): + system = ("你是 COBOL 测试路径生成专家。" + "请为给定的控制流树生成满足 MC/DC 覆盖的测试路径集。" + "只输出 JSON,不要多余文字。") + + reduced_fields = [] + for f in fields: + entry = {"name": f["name"], "pic": f.get("pic", "")} + pi = f.get("pic_info", {}) + if pi: + entry["pic_info"] = { + "type": pi.get("type"), "digits": pi.get("digits"), + "decimal": pi.get("decimal"), "length": pi.get("length"), + } + if f.get("is_88"): + entry["is_88"] = True + entry["value"] = f.get("value") + entry["parent"] = f.get("parent") + reduced_fields.append(entry) + + user = ( + "## 控制流树(JSON)\n\n" + f"```json\n{json.dumps(tree_json, ensure_ascii=False, indent=2)}\n```\n\n" + "## 字段定义\n\n" + f"```json\n{json.dumps(reduced_fields, ensure_ascii=False, indent=2)}\n```\n\n" + "## 要求\n" + "1. 
每个 IF/EVALUATE/PERFORM UNTIL 的每个分支至少被覆盖一次\n" + "2. 复合条件(AND/OR/NOT)需要满足 MC/DC:每个叶条件的独立影响对\n" + "3. 路径数尽量少(最小集优先)\n" + "4. 88-level 条件名要展开为实际字段比较(如 CUST-VIP → WS-CUST-LEVEL='V')\n" + "5. 同一路径中的约束不能自相矛盾(同一字段不能同时等于 'A' 和等于 'B')\n" + "6. 数值边界值合理(>5000 → 5001, <100 → 99)\n" + "7. AND 优先级高于 OR\n\n" + "## 输出格式\n\n" + "```json\n" + "{\n" + ' "paths": [\n' + " {\n" + ' "constraints": [\n' + ' {"field": "WS-AMOUNT", "op": ">", "value": "5000", "want_true": true}\n' + " ],\n" + ' "assignments": {}\n' + " }\n" + " ]\n" + "}\n" + "```" + ) + + return [ + {"role": "system", "content": system}, + {"role": "user", "content": user}, + ] + + +def _parse_llm_paths(paths_data): + result = [] + for p in paths_data: + constraints = [] + for c in p.get("constraints", []): + constraints.append((c["field"], c["op"], str(c["value"]), c["want_true"])) + assignments = p.get("assignments", {}) + result.append((constraints, assignments)) + return result + + +def resolve_constraints_ai(paths, fields=None, assignments=None): + """AI版约束推理(未来实现)""" + raise NotImplementedError("AI agent not yet implemented") + + +def enhance_metadata_ai(records, fields=None, spec_doc: str = ""): + """AI版测试用例元数据生成(未来实现)""" + raise NotImplementedError("AI agent not yet implemented") + + +def analyze_spec_ai(spec_doc: str = ""): + """AI版式样书解析(未来实现)""" + raise NotImplementedError("AI agent not yet implemented") diff --git a/cobol_testgen/cond.py b/cobol_testgen/cond.py new file mode 100644 index 0000000..a94f06b --- /dev/null +++ b/cobol_testgen/cond.py @@ -0,0 +1,258 @@ +"""条件层:COBOL条件表达式解析 + MC/DC枚举 + 约束合并""" + +import re +from .models import CondLeaf, CondAnd, CondOr, CondNot, PicInfo + + +# ── 条件解析 ── + +def _split_at_operator(text, operator): + """Split text on operator word, respecting parentheses.""" + result = [] + current = [] + depth = 0 + # Normalize so parentheses are space-delimited tokens + normalized = text.replace('(', ' ( ').replace(')', ' ) ') + for token in normalized.split(): + if not token: + 
def parse_single_condition(text, fields=None):
    """Parse one simple relational condition into ``(field, operator, value)``.

    Examples: ``AMOUNT > 1000`` gives ``('AMOUNT', '>', '1000')`` and
    ``WS-ITEM ( SUB ) = 'A'`` gives ``('WS-ITEM(SUB)', '=', 'A')``.

    * When ``fields`` is given and ``text`` is the name of an 88-level entry,
      the result is ``(parent, '=', value)`` taken from that entry.
    * A negated relational operator is folded into its complement, so
      ``A NOT = B`` gives ``('A', '<>', 'B')`` and ``A NOT > B`` gives
      ``('A', '<=', 'B')``. Previously the ``NOT`` was absorbed into the
      field name, which inverted the meaning of the condition.
    * An arithmetic left-hand side such as ``A + B > C`` is returned verbatim
      as the field part.

    Surrounding quotes are stripped from the value. Returns ``None`` for
    compound conditions (containing `` AND `` / `` OR ``) and for text that is
    not a relation.
    """
    if ' AND ' in text or ' OR ' in text:
        return None

    # An 88-level condition name stands for "<parent> = <value>".
    if fields:
        wanted = text.upper()
        for f in fields:
            if f.get('is_88') and f['name'] == wanted:
                return (f.get('parent', ''), '=', f.get('value', ''))

    operators = r"(>=|<=|<>|>|<|=)"

    def tidy_field(raw):
        # Remove the blanks around parentheses and commas of a subscript.
        return re.sub(r'\s*([(),])\s*', r'\1', raw).strip()

    def tidy_value(raw):
        return raw.strip().strip("'").strip('"')

    # "<lhs> NOT <op> <rhs>". The left side may not contain quotes or operator
    # characters, so a literal that merely contains " NOT =" is not affected.
    negated = re.match(
        r"^([^'\"=<>]+?)\s+NOT\s*" + operators + r"\s*(.+)$",
        text, re.IGNORECASE,
    )
    if negated:
        complement = {'=': '<>', '<>': '=', '>': '<=', '<': '>=', '>=': '<', '<=': '>'}
        return (
            tidy_field(negated.group(1)),
            complement[negated.group(2)],
            tidy_value(negated.group(3)),
        )

    patterns = (
        # Plain or subscripted data name.
        r"^(\w[\w-]*(?:\s*\([^)]*\))?)\s*" + operators + r"\s*(.+)$",
        # Arithmetic expression on the left, e.g. A + B > C.
        r"^(\w[\w\s+\-*/().-]+?)\s*" + operators + r"\s*(.+)$",
    )
    for pattern in patterns:
        m = re.match(pattern, text)
        if m:
            return (tidy_field(m.group(1)), m.group(2), tidy_value(m.group(3)))
    return None
def collect_leaves(tree):
    """Return every CondLeaf in ``tree``, ordered left to right.

    CondNot nodes are looked through, CondAnd/CondOr contribute both operands,
    and any other value (including ``None``) contributes nothing.
    """
    leaves = []
    pending = [tree]
    while pending:
        node = pending.pop()
        if isinstance(node, CondLeaf):
            leaves.append(node)
        elif isinstance(node, CondNot):
            pending.append(node.child)
        elif isinstance(node, (CondAnd, CondOr)):
            # Push right first so the left operand is visited first.
            pending.append(node.right)
            pending.append(node.left)
    return leaves
def satisfying_value(field_info: dict, operator: str, value, want_true: bool) -> str:
    """Return a field value that makes ``<field> <operator> <value>`` true or false.

    Args:
        field_info: pic_info dict; ``type``, ``digits``, ``decimal`` and
            ``length`` are read.
        operator: relational operator (``=``, ``<>``, ``>``, ``>=``, ``<``,
            ``<=``; ``==``/``!=`` are also accepted for text fields).
        value: the literal the field is compared with.
        want_true: whether the condition should hold for the returned value.

    Returns:
        For numeric fields, a zero-padded digit string of ``digits + decimal``
        characters with no decimal point. For alphanumeric/alphabetic fields
        and (in)equality operators, a string of the field length. In every
        other case, ``'0'`` zero-filled to ``digits + decimal``.

    For text equality the literal itself is returned, space-padded or
    truncated to the field length. It used to be the literal's first
    character, uppercased and repeated, which did not equal the literal
    (``'ABC'`` produced ``'AAA'``).
    """
    ftype = field_info.get('type', 'unknown')
    digits = field_info.get('digits', 0)
    decimal = field_info.get('decimal', 0)
    total = digits + decimal

    if ftype == 'numeric':
        scale = 10 ** decimal
        modulus = 10 ** total
        try:
            # Work on the scaled integer; +0.5 rounds the float product.
            val_int = int(float(str(value)) * scale + 0.5)
        except (ValueError, TypeError):
            val_int = 0

        if want_true:
            if operator == '>':
                val_int = val_int + 1
            elif operator == '<':
                val_int = max(0, val_int - 1)
            elif operator == '<>':
                val_int = (val_int + 1) % modulus
            # '>=', '=' and '<=' are satisfied by the literal itself.
        else:
            if operator in ('>', '>='):
                val_int = 0
            elif operator == '=':
                val_int = (val_int + 1) % modulus
            elif operator == '<=':
                val_int = val_int + 1
            # '<' and '<>' are falsified by the literal itself.

        # Wrap into the field's capacity, then split integer/fraction digits.
        val_int = val_int % modulus
        int_part = str(val_int // scale).zfill(digits)
        if decimal == 0:
            return int_part
        return int_part + str(val_int % scale).zfill(decimal)

    if ftype in ('alphanumeric', 'alphabetic'):
        length = field_info.get('length', 1)
        has_literal = isinstance(value, str) and bool(value)
        base_chr = value[0].upper() if has_literal else 'A'

        is_equality = operator in ('=', '==')
        is_inequality = operator in ('<>', '!=')
        if (is_equality and want_true) or (is_inequality and not want_true):
            if not has_literal:
                return base_chr.ljust(length, base_chr)
            # Equal to the literal: keep it as written, sized to the field.
            width = max(length, 1)
            return value[:width].ljust(width)
        if (is_equality and not want_true) or (is_inequality and want_true):
            # A different uppercase letter, so the first character always
            # differs from the literal's.
            other = chr(65 + (ord(base_chr) - 64) % 26)
            return other.ljust(length, other)

    return '0'.zfill(total)
def scan_paragraphs(raw_lines):
    """Locate the paragraphs of a PROCEDURE DIVISION.

    A paragraph header is a line consisting only of ``NAME.``. Its body runs
    from the following line up to, but not including, the next paragraph
    header or ``<name> SECTION.`` line.

    Returns a dict mapping each paragraph name to ``(first_body_index,
    last_body_index)``, both inclusive indexes into ``raw_lines``. For an
    empty paragraph the last index is one less than the first.

    One-word statements and scope terminators that can appear alone on a line
    followed by a period (``GOBACK.``, ``EXIT.``, ``CONTINUE.``,
    ``END-COMPUTE.`` and similar) are reserved words and are never treated as
    paragraph headers. They used to be registered as paragraphs, which cut
    the enclosing paragraph short.
    """
    header_re = re.compile(r'^([A-Z0-9][A-Z0-9-]*)\.\s*$')
    section_re = re.compile(r'^[A-Z][A-Z0-9-]*\s+SECTION\.\s*$', re.IGNORECASE)
    not_paragraphs = set(_COBOL_SCOPE_ENDERS) | {
        'GOBACK', 'EXIT', 'CONTINUE',
        'END-ADD', 'END-SUBTRACT', 'END-MULTIPLY', 'END-DIVIDE',
        'END-COMPUTE', 'END-STRING', 'END-UNSTRING', 'END-SEARCH',
        'END-RETURN', 'END-ACCEPT', 'END-DISPLAY',
    }

    def header_name(raw):
        """Return the paragraph name declared on ``raw``, or None."""
        m = header_re.match(raw.strip())
        if m and m.group(1) not in not_paragraphs:
            return m.group(1)
        return None

    paragraphs = {}
    total = len(raw_lines)
    i = 0
    while i < total:
        name = header_name(raw_lines[i])
        if name is None:
            i += 1
            continue
        start = i + 1
        j = start
        while j < total:
            if header_name(raw_lines[j]) is not None:
                break
            if section_re.match(raw_lines[j].strip()):
                break
            j += 1
        paragraphs[name] = (start, j - 1)
        # Resume at the line that ended this paragraph.
        i = j
    return paragraphs
def _skip_unreachable(self, end_tokens, end_check):
    """Advance past statements that follow an unconditional jump.

    Stops (without consuming) at the first line that ends the current
    sequence according to ``_is_end`` or that is listed in
    ``_COBOL_SCOPE_ENDERS``.
    """
    while self.pos < len(self.lines):
        current = self.clean()
        if self._is_end(current, end_tokens, end_check) or current in _COBOL_SCOPE_ENDERS:
            break
        self.advance()


def _skip_read_clauses(self):
    """Skip the AT END / NOT AT END clauses that trail a READ statement.

    Consumes lines up to and including ``END-READ``.  A line whose raw
    text ends with a period closes the whole sentence, so it is consumed
    and the scan stops there as well; without that stop a READ that has no
    ``END-READ`` would swallow every remaining line.
    """
    while self.pos < len(self.lines):
        sentence_ends = self.peek().endswith('.')
        closes_read = self.clean() == 'END-READ'
        self.advance()
        if closes_read or sentence_ends:
            break


def parse_seq(self, end_tokens=None, end_check=None, terminators=None):
    """Parse statements from the current position into a ``BrSeq``.

    Args:
        end_tokens: tokens that end the sequence without being consumed
            (a line equal to a token, or starting with the token followed
            by a space, matches).
        end_check: optional predicate on the cleaned line; a true result
            ends the sequence without consuming the line.
        terminators: optional collection of cleaned lines that end the
            sequence and ARE consumed.

    Returns:
        The ``BrSeq`` built from the statements that were recognised.
        Unrecognised lines are skipped.
    """
    if end_tokens is None:
        end_tokens = []
    seq = BrSeq()
    while self.pos < len(self.lines):
        line = self.clean()
        if self._is_end(line, end_tokens, end_check):
            return seq
        if terminators and line in terminators:
            self.advance()
            return seq

        # GO TO / EXIT transfer control unconditionally: whatever follows
        # in this scope is unreachable and is skipped.
        m_goto = re.match(r'^GO\s+TO\s+(\w[\w-]*)\s*$', line)
        if m_goto:
            goto_node = self._parse_goto(m_goto.group(1))
            if goto_node:
                seq.add(goto_node)
            self._skip_unreachable(end_tokens, end_check)
            return seq
        m_exit = re.match(r'^EXIT\s+(PARAGRAPH|PERFORM|SECTION)\s*$', line)
        if m_exit:
            self.advance()
            seq.add(ExitNode(m_exit.group(1)))
            self._skip_unreachable(end_tokens, end_check)
            return seq

        # Structured statements: the sub-parsers advance the position.
        if re.match(r'^IF\s+(.+?)(?:THEN)?\s*$', line):
            seq.add(self._parse_if())
            continue
        if re.match(r'^EVALUATE\s+(.+?)\s*$', line):
            seq.add(self._parse_evaluate())
            continue
        if re.match(r'^PERFORM\s+', line):
            perf_node = self._parse_perform()
            if perf_node:
                seq.add(perf_node)
            continue
        if re.match(r'^INITIALIZE\s+', line):
            init_seq = self._parse_initialize()
            if init_seq:
                seq.add(init_seq)
            continue
        if re.match(r'^STRING\s+', line):
            str_seq = self._parse_string()
            if str_seq:
                seq.add(str_seq)
            continue
        if re.match(r'^UNSTRING\s+', line):
            unstr_seq = self._parse_unstring()
            if unstr_seq:
                seq.add(unstr_seq)
            continue
        if re.match(r'^CALL\s+', line):
            seq.add(self._parse_call())
            continue

        m = re.match(
            r'^ACCEPT\s+(\w[\w-]*)(?:\s+FROM\s+(DATE|TIME|DAY|DAY-OF-WEEK|YEAR|YYYYMMDD|HHMMSS))?\s*$',
            line, re.IGNORECASE
        )
        if m:
            tgt = m.group(1).strip().upper()
            from_type = (m.group(2) or 'USER').upper()
            info = {'type': 'accept', 'from': from_type}
            self.assignments.setdefault(tgt, []).append(info)
            seq.add(Assign(tgt, info))
            self.advance()
            continue

        m = re.match(r'^READ\s+(\w[\w-]*)\s+INTO\s+(\w[\w-]*)\s*$', line, re.IGNORECASE)
        if m:
            tgt = m.group(2).strip().upper()
            info = {'type': 'read_into', 'file': m.group(1).strip().upper(), 'source_vars': []}
            self.assignments.setdefault(tgt, []).append(info)
            seq.add(Assign(tgt, info))
            # clean() strips the period, so look at the raw line: a READ
            # closed by a period has no trailing clauses to skip.
            read_is_complete = self.peek().endswith('.')
            self.advance()
            if not read_is_complete:
                self._skip_read_clauses()
            continue

        m_set_false = re.match(r'^SET\s+(\w[\w-]*)\s+TO\s+FALSE\s*$', line, re.IGNORECASE)
        if m_set_false:
            seq.add(self._parse_set_false(m_set_false.group(1)))
            continue

        m = re.match(r'^(?:WRITE|REWRITE)\s+(\w[\w-]*)(?:\s+FROM\s+(\w[\w-]*))?\s*$', line, re.IGNORECASE)
        if m:
            rec_name = m.group(1).strip().upper()
            if m.group(2):
                tgt = m.group(2).strip().upper()
                info = {'type': 'write_from', 'file': rec_name, 'source_vars': [tgt]}
                self.assignments.setdefault(tgt, []).append(info)
                seq.add(Assign(tgt, info))
            else:
                # A bare WRITE is added to the tree but not recorded in
                # self.assignments.
                seq.add(Assign(rec_name, {'type': 'write_bare', 'file': rec_name}))
            self.advance()
            continue

        m_set = re.match(r'^SET\s+(\w[\w-]*)\s+TO\s+TRUE\s*$', line, re.IGNORECASE)
        if m_set:
            seq.add(self._parse_set_true(m_set.group(1)))
            continue

        # Anything else: try the simple assignment verbs, then move on.
        assign_node = self._record_assignment(line)
        if assign_node:
            seq.add(assign_node)
        self.advance()
    return seq
re.match(r'^(PERFORM|END-|IF|ELSE|EVALUATE|WHEN|OTHER|MOVE|COMPUTE|ADD|SUBTRACT|MULTIPLY|DIVIDE|STRING|UNSTRING|READ|WRITE|INITIALIZE|ACCEPT|CALL|GO\s*TO|GOBACK|STOP|EXIT)', nxt, re.IGNORECASE): + expr = nxt + if expr: + info = self._parse_compute_expr(tgt_key, expr) + self.assignments.setdefault(tgt_key, []).append(info) + return Assign(tgt_key, info) + + # ADD x TO y → y = y + x (支持变量和常量源) + m = re.match(r'^ADD\s+(\w[\w-]*)\s+TO\s+(\w[\w-]*?)(?:\s+ROUNDED)?\s*$', line) + if m: + src = m.group(1).strip() + tgt = m.group(2).strip() + is_field = self.fields and any(f['name'] == src for f in self.fields) + if is_field: + info = {'type': 'compute', 'source_vars': [tgt, src], + 'op': '+', 'const': None, 'expr': f'{tgt} + {src}'} + else: + try: + const = float(src) + info = {'type': 'compute', 'source_vars': [tgt], + 'op': '+', 'const': const, 'expr': f'{tgt} + {const}'} + except ValueError: + return None + self.assignments.setdefault(tgt, []).append(info) + return Assign(tgt, info) + + # ADD x TO y GIVING z → z = y + x + m = re.match(r'^ADD\s+(.+?)\s+TO\s+(\w[\w-]*)\s+GIVING\s+(\w[\w-]*?)(?:\s+ROUNDED)?\s*$', line, re.IGNORECASE) + if m: + raw_a = m.group(1).strip() + src_b = m.group(2).strip() + tgt = m.group(3).strip() + is_field_a = self.fields and any(f['name'] == raw_a for f in self.fields) + if is_field_a: + info = {'type': 'compute', 'source_vars': [src_b, raw_a], + 'op': '+', 'const': None, 'expr': f'{src_b} + {raw_a}'} + else: + try: + const = float(raw_a) + info = {'type': 'compute', 'source_vars': [src_b], + 'op': '+', 'const': const, 'expr': f'{src_b} + {const}'} + except ValueError: + return None + self.assignments.setdefault(tgt, []).append(info) + return Assign(tgt, info) + + # ADD a[, b[, c...]] GIVING z → z = a + b + c + ... 
+ m = re.match(r'^ADD\s+(.+?)\s+GIVING\s+(\w[\w-]*?)(?:\s+ROUNDED)?\s*$', line, re.IGNORECASE) + if m: + raw_parts = re.findall(r'[A-Z][A-Z0-9-]*|\d+(?:\.\d+)?', m.group(1).upper()) + fields_only = [] + const_sum = 0.0 + for p in raw_parts: + if self.fields and any(f['name'] == p for f in self.fields): + fields_only.append(p) + else: + try: + const_sum += float(p) + except ValueError: + pass + tgt = m.group(2).strip() + if not fields_only: + info = {'type': 'move_literal', + 'literal': str(int(const_sum)) if const_sum == int(const_sum) else str(const_sum)} + else: + info = {'type': 'compute', 'source_vars': fields_only, + 'op': '+', 'const': const_sum if const_sum != 0 else None, + 'expr': '+'.join(fields_only) + (f' + {const_sum}' if const_sum else '')} + self.assignments.setdefault(tgt, []).append(info) + return Assign(tgt, info) + + # SUBTRACT x FROM y → y = y - x + m = re.match(r'^SUBTRACT\s+([\d.]+)\s+FROM\s+(\w[\w-]*?)(?:\s+ROUNDED)?\s*$', line) + if m: + const = float(m.group(1)) + tgt = m.group(2).strip() + info = {'type': 'compute', 'source_vars': [tgt], + 'op': '-', 'const': const, 'expr': f'{tgt} - {const}'} + self.assignments.setdefault(tgt, []).append(info) + return Assign(tgt, info) + + # SUBTRACT a FROM b GIVING z → z = b - a + m = re.match(r'^SUBTRACT\s+([\d.\w-]*)\s+FROM\s+(\w[\w-]*)\s+GIVING\s+(\w[\w-]*?)(?:\s+ROUNDED)?\s*$', line, re.IGNORECASE) + if m: + raw_a = m.group(1).strip() + src_b = m.group(2).strip() + tgt = m.group(3).strip() + is_field_a = self.fields and any(f['name'] == raw_a for f in self.fields) + if is_field_a: + info = {'type': 'compute', 'source_vars': [src_b, raw_a], + 'op': '-', 'const': None, 'expr': f'{src_b} - {raw_a}'} + else: + try: + const = float(raw_a) + info = {'type': 'compute', 'source_vars': [src_b], + 'op': '-', 'const': const, 'expr': f'{src_b} - {const}'} + except ValueError: + return None + self.assignments.setdefault(tgt, []).append(info) + return Assign(tgt, info) + + # MULTIPLY x BY y → y = y * x + m = 
re.match(r'^MULTIPLY\s+([\d.]+)\s+BY\s+(\w[\w-]*?)(?:\s+ROUNDED)?\s*$', line) + if m: + const = float(m.group(1)) + tgt = m.group(2).strip() + info = {'type': 'compute', 'source_vars': [tgt], + 'op': '*', 'const': const, 'expr': f'{tgt} * {const}'} + self.assignments.setdefault(tgt, []).append(info) + return Assign(tgt, info) + + # MULTIPLY a BY b GIVING z → z = a * b + m = re.match(r'^MULTIPLY\s+(\w[\w-]*)\s+BY\s+(\w[\w-]*)\s+GIVING\s+(\w[\w-]*?)(?:\s+ROUNDED)?\s*$', line, re.IGNORECASE) + if m: + src_a = m.group(1).strip() + src_b = m.group(2).strip() + tgt = m.group(3).strip() + is_field_a = self.fields and any(f['name'] == src_a for f in self.fields) + if is_field_a: + info = {'type': 'compute', 'source_vars': [src_a, src_b], + 'op': '*', 'const': None, 'expr': f'{src_a} * {src_b}'} + else: + try: + const = float(src_a) + info = {'type': 'compute', 'source_vars': [src_b], + 'op': '*', 'const': const, 'expr': f'{const} * {src_b}'} + except ValueError: + return None + self.assignments.setdefault(tgt, []).append(info) + return Assign(tgt, info) + + # DIVIDE x INTO y → y = y / x + m = re.match(r'^DIVIDE\s+([\d.]+)\s+INTO\s+(\w[\w-]*?)(?:\s+ROUNDED)?\s*$', line) + if m: + const = float(m.group(1)) + tgt = m.group(2).strip() + info = {'type': 'compute', 'source_vars': [tgt], + 'op': '/', 'const': const, 'expr': f'{tgt} / {const}'} + self.assignments.setdefault(tgt, []).append(info) + return Assign(tgt, info) + + # DIVIDE a INTO b GIVING z → z = b / a + # Optional REMAINDER r → r = b - (b / a) * a + m = re.match(r'^DIVIDE\s+(.+?)\s+INTO\s+(\w[\w-]*)\s+GIVING\s+(\w[\w-]*?)(?:\s+ROUNDED)?(?:\s+REMAINDER\s+(\w[\w-]*))?\s*$', line, re.IGNORECASE) + if m: + raw_a = m.group(1).strip() + src_b = m.group(2).strip() + tgt = m.group(3).strip() + rem_tgt = m.group(4).strip().upper() if m.group(4) else None + is_field_a = self.fields and any(f['name'] == raw_a for f in self.fields) + if is_field_a: + info = {'type': 'compute', 'source_vars': [src_b, raw_a], + 'op': '/', 'const': 
def _parse_compute_expr(self, target, expr):
    """Classify the right-hand side of a COMPUTE statement.

    Recognises three simple shapes, ``const OP var``, ``var OP const`` and
    ``var OP var``, where OP is one of ``+ - * /``.  Anything else is
    reported as a complex expression with ``op`` set to ``None`` and only
    the referenced names listed.

    Args:
        target: name of the receiving field (not used by the classifier).
        expr: expression text to the right of ``=``.

    Returns:
        A dict with keys ``type`` (always ``'compute'``), ``source_vars``,
        ``op``, ``const`` and ``expr``.
    """
    def _as_number(text):
        # '[\d.]+' also matches '.' or '1.2.3'; treat those as "not a number"
        # instead of letting float() raise.
        try:
            return float(text)
        except ValueError:
            return None

    # const OP var
    # NOTE(review): the result does not record that the constant was the
    # left operand, so '10 - X' and 'X - 10' produce the same dict.
    m = re.match(r'^\s*([\d.]+)\s*([+\-*/])\s*(\w[\w-]*)\s*$', expr)
    if m:
        const = _as_number(m.group(1))
        if const is not None:
            return {'type': 'compute', 'source_vars': [m.group(3)],
                    'op': m.group(2), 'const': const, 'expr': expr}

    # var OP const
    m = re.match(r'^\s*(\w[\w-]*)\s*([+\-*/])\s*([\d.]+)\s*$', expr)
    if m:
        const = _as_number(m.group(3))
        if const is not None:
            return {'type': 'compute', 'source_vars': [m.group(1)],
                    'op': m.group(2), 'const': const, 'expr': expr}

    # var OP var ('const' is included so every result has the same keys)
    m = re.match(r'^\s*(\w[\w-]*)\s*([+\-*/])\s*(\w[\w-]*)\s*$', expr)
    if m:
        return {'type': 'compute', 'source_vars': [m.group(1), m.group(3)],
                'op': m.group(2), 'const': None, 'expr': expr}

    # Complex expression: keep each referenced name once, in order of first
    # appearance, so the result does not depend on set iteration order.
    names = re.findall(r'[A-Z][A-Z0-9-]*', expr.upper())
    return {'type': 'compute', 'source_vars': list(dict.fromkeys(names)),
            'op': None, 'const': None, 'expr': expr}
def _parse_perform(self):
    """Parse the PERFORM statement at the current position.

    Handled forms, tried in this order:
      * ``PERFORM UNTIL cond`` ... ``END-PERFORM`` (inline loop)
      * ``PERFORM para UNTIL cond``
      * ``PERFORM n TIMES``
      * ``PERFORM para THRU para``
      * ``PERFORM VARYING v FROM a BY b [UNTIL cond]`` ... ``END-PERFORM``
        (FROM/BY/UNTIL may also sit on the following lines)
      * ``PERFORM para VARYING v FROM a BY b [UNTIL cond]``
      * ``PERFORM para``

    Returns:
        A ``BrPerform`` node, or ``None`` when the line matches none of the
        forms (the line is consumed in that case).
    """
    def _is_end_perform(text):
        return text == 'END-PERFORM'

    line = self.clean()

    m = re.match(r'^PERFORM\s+UNTIL\s+(.+?)\s*$', line)
    if m:
        node = BrPerform('until', condition=m.group(1).strip())
        self.advance()
        node.body_seq = self.parse_seq(end_check=_is_end_perform)
        if self.clean() == 'END-PERFORM':
            self.advance()
        return node

    m = re.match(r'^PERFORM\s+(\w[\w-]*)\s+UNTIL\s+(.+?)\s*$', line)
    if m:
        target = m.group(1).strip()
        node = BrPerform('para_until', target=target, condition=m.group(2).strip())
        self.advance()
        self._inline_perform(node, target)
        return node

    m = re.match(r'^PERFORM\s+(\d+)\s+TIMES\s*$', line)
    if m:
        # NOTE(review): no body is attached here; the statements up to
        # END-PERFORM are left for the caller's sequence to parse.
        node = BrPerform('times', times=int(m.group(1)))
        self.advance()
        return node

    m = re.match(r'^PERFORM\s+(\w[\w-]*)\s+THRU\s+(\w[\w-]*)\s*$', line)
    if m:
        node = BrPerform('thru', target=m.group(1).strip(), thru=m.group(2).strip())
        self.advance()
        self._inline_perform(node, node.target, node.thru)
        return node

    m = re.match(r'^PERFORM\s+VARYING\s+(\w[\w-]*)\s+FROM\s+(\S+)\s+BY\s+(\S+)(?:\s+UNTIL\s+(.+))?\s*$', line)
    if m:
        varying_var = m.group(1).strip()
        from_val = m.group(2).strip()
        by_val = m.group(3).strip()
        until_on_same_line = bool(m.group(4))
        condition = m.group(4).strip() if m.group(4) else None
        save_pos = self.pos
        if not condition:
            # UNTIL (and possibly a replacement FROM/BY) on following lines.
            self.advance()
            while self.pos < len(self.lines):
                nxt = self.clean()
                cm = re.match(r'^UNTIL\s+(.+)$', nxt)
                if cm:
                    condition = cm.group(1).strip()
                    self.advance()
                    break
                fm = re.match(r'^FROM\s+(\S+)\s+BY\s+(\S+)$', nxt)
                if fm:
                    from_val = fm.group(1).strip()
                    by_val = fm.group(2).strip()
                    self.advance()
                    continue
                self.pos = save_pos
                break
        if condition:
            node = BrPerform('varying', condition=condition,
                             varying_var=varying_var,
                             varying_from=from_val,
                             varying_by=by_val)
            # Single-line form: still on the PERFORM line, step past it.
            # Multi-line form: the scan above already consumed UNTIL.
            if until_on_same_line:
                self.advance()
            node.body_seq = self.parse_seq(end_check=_is_end_perform)
            if self.clean() == 'END-PERFORM':
                self.advance()
            return node
        self.pos = save_pos

    # PERFORM VARYING var, with FROM/BY/UNTIL all on subsequent lines
    m = re.match(r'^PERFORM\s+VARYING\s+(\w[\w-]*)\s*$', line)
    if m:
        varying_var = m.group(1).strip()
        save_pos = self.pos
        self.advance()
        from_val = by_val = condition = None
        while self.pos < len(self.lines):
            nxt = self.clean()
            fm = re.match(r'^FROM\s+(\S+)\s+BY\s+(\S+)$', nxt)
            if fm:
                from_val, by_val = fm.group(1).strip(), fm.group(2).strip()
                self.advance()
                continue
            um = re.match(r'^UNTIL\s+(.+)$', nxt)
            if um:
                condition = um.group(1).strip()
                self.advance()
            break
        if from_val and by_val and condition:
            node = BrPerform('varying', condition=condition,
                             varying_var=varying_var,
                             varying_from=from_val,
                             varying_by=by_val)
            node.body_seq = self.parse_seq(end_check=_is_end_perform)
            if self.clean() == 'END-PERFORM':
                self.advance()
            return node
        self.pos = save_pos

    m = re.match(r'^PERFORM\s+(\w[\w-]*)\s+VARYING\s+(\w[\w-]*)\s+FROM\s+(\S+)\s+BY\s+(\S+)(?:\s+UNTIL\s+(.+))?\s*$', line)
    if m:
        target = m.group(1).strip()
        varying_var = m.group(2).strip()
        from_val = m.group(3).strip()
        by_val = m.group(4).strip()
        until_on_same_line = bool(m.group(5))
        condition = m.group(5).strip() if m.group(5) else None
        save_pos = self.pos
        if not condition:
            self.advance()
            if self.pos < len(self.lines):
                cm = re.match(r'^UNTIL\s+(.+)$', self.clean())
                if cm:
                    condition = cm.group(1).strip()
                    self.advance()
                else:
                    self.pos = save_pos
        if condition:
            node = BrPerform('para_varying', target=target,
                             condition=condition,
                             varying_var=varying_var,
                             varying_from=from_val,
                             varying_by=by_val)
            # Only step past the PERFORM line when UNTIL was on it; when
            # UNTIL came from the next line the position is already past
            # it, and advancing again would drop the following statement.
            if until_on_same_line:
                self.advance()
            self._inline_perform(node, node.target)
            return node
        self.pos = save_pos

    m = re.match(r'^PERFORM\s+(\w[\w-]*)\s*$', line)
    if m:
        target = m.group(1).strip()
        node = BrPerform('para', target=target)
        self.advance()
        self._inline_perform(node, target)
        return node

    # Unsupported PERFORM form: consume the line and report nothing.
    self.advance()
    return None
def _parse_string(self):
    """Parse a (possibly multi-line) STRING ... INTO statement.

    Lines are gathered until ``END-STRING`` or until a line whose raw text
    ends with a period, which closes the sentence.  Stopping at the period
    keeps a STRING written without ``END-STRING`` from consuming every
    remaining line.

    Returns:
        A ``BrSeq`` holding one ``Assign`` of type ``'string_concat'`` for
        the INTO target, or ``None`` when the gathered text does not have
        the shape ``STRING <sources> INTO <name>``.  The lines are consumed
        in both cases.
    """
    sentence_closed = self.peek().endswith('.')
    parts = [self.clean()]
    self.advance()
    while not sentence_closed and self.pos < len(self.lines):
        raw = self.peek()
        cleaned = self.clean()
        self.advance()
        if cleaned == 'END-STRING':
            break
        parts.append(cleaned)
        sentence_closed = raw.endswith('.')

    full = ' '.join(parts)
    m = re.match(r'^STRING\s+(.+)\s+INTO\s+(\w[\w-]*)\s*$', full, re.IGNORECASE | re.DOTALL)
    if not m:
        return None
    source_part = m.group(1).strip()
    target = m.group(2).strip()
    # NOTE(review): every upper-case word in the source part is collected,
    # so clause keywords such as DELIMITED / BY / SIZE are listed too.
    source_vars = re.findall(r'[A-Z][A-Z0-9-]*', source_part)
    info = {'type': 'string_concat', 'source_vars': source_vars}
    self.assignments.setdefault(target, []).append(info)
    seq = BrSeq()
    seq.add(Assign(target, info))
    return seq
def _parse_call(self):
    """Parse a single-line ``CALL prog [USING ...]`` statement.

    Each USING operand becomes ``{"name": ..., "mechanism": ...}``.  The
    mechanism starts as ``"reference"`` and changes whenever a
    ``BY REFERENCE`` / ``BY CONTENT`` / ``BY VALUE`` phrase is met; it then
    applies to the operands that follow.

    Returns:
        A ``CallNode`` for the program, or an empty ``BrSeq`` when the line
        does not match the CALL pattern.  The line is consumed either way.
    """
    matched = re.match(r'^CALL\s+(\S+?)(?:\s+USING\s+(.+))?\s*$', self.clean())
    if matched is None:
        self.advance()
        return BrSeq()

    program = matched.group(1).strip("'\"").upper()
    using_text = matched.group(2)
    using_params = []
    mechanism = "reference"  # default passing mechanism
    if using_text:
        # Split in front of every "BY <mechanism> " phrase.
        segments = re.split(r'\s+(?=BY\s+(?:REFERENCE|CONTENT|VALUE)\s+)',
                            using_text, flags=re.IGNORECASE)
    else:
        segments = []
    for segment in segments:
        segment = segment.strip()
        phrase = re.match(r'BY\s+(REFERENCE|CONTENT|VALUE)\s+(.*)', segment, re.IGNORECASE)
        if phrase is None:
            operands = segment
        else:
            mechanism = phrase.group(1).lower()
            operands = phrase.group(2)
        using_params.extend(
            {"name": operand.upper(), "mechanism": mechanism}
            for operand in re.findall(r'\w[\w-]*', operands)
        )

    call_node = CallNode(program, using_params=using_params)
    self.advance()
    return call_node
def _init_child_names(group_name: str, fields: list) -> list:
    """Return the names of the elementary fields below a group.

    ``fields`` is the flat, declaration-ordered list of field dicts.  The
    scan starts after the first entry named ``group_name`` and stops at the
    first entry whose level is not deeper than the group's (or is 77).
    88-level entries and entries carrying ``redefines`` are skipped, as are
    entries without usable ``pic_info`` (nested groups); the members of a
    nested group are picked up by the same flat scan.

    Descendants are deliberately not collected by recursing on the nested
    group's name: the flat scan already visits them, so recursing listed
    every nested member twice and could recurse without end when a group
    shared its name with an enclosing one.

    Args:
        group_name: name of the group item to expand.
        fields: list of field dicts with at least ``name`` and ``level``.

    Returns:
        Names in declaration order, each listed once; an empty list when
        the group is unknown or has no elementary members.
    """
    result = []
    grp_level = None
    for f in fields:
        if grp_level is None:
            # Still looking for the group header itself.
            if f['name'] == group_name:
                grp_level = f.get('level', 0)
            continue
        level = f.get('level', 0)
        if level <= grp_level or level == 77:
            break
        if f.get('is_88') or f.get('redefines'):
            continue
        pic = f.get('pic_info')
        if pic and pic.get('type') != 'unknown':
            result.append(f['name'])
    return result
def invert_through_chain(root_var, chain, operator, value):
    """Rewrite ``field <operator> value`` as a condition on ``root_var``.

    ``chain`` is the list of ``(var, assignment)`` pairs produced by
    ``trace_to_root``: the first entry is the assignment that produced the
    tested field and the last one is the assignment closest to the root.
    To recover the root value, the steps are undone in that same order,
    outermost first.  Undoing them root-first is only correct when all
    operations commute.

    Steps that cannot be inverted leave the value unchanged: plain moves,
    multi-source additions, ``rem``, operators outside ``+ - * /``, and
    steps without a numeric constant.

    Args:
        root_var: name of the variable the chain ends at.
        chain: list of ``(var, assignment_dict)`` pairs.
        operator: comparison operator; returned unchanged.
        value: the compared value; non-numeric values are returned as-is.

    Returns:
        ``(root_var, operator, value_as_text)``; whole numbers are rendered
        without a fractional part.
    """
    op = operator
    try:
        val = float(value)
    except (ValueError, TypeError):
        return root_var, op, value

    for _var, asgn in chain:
        if asgn.get('type') != 'compute':
            continue
        sources = asgn.get('source_vars', [])
        step_op = asgn.get('op')
        const = asgn.get('const')
        # Only "var OP const" steps with an arithmetic operator are
        # invertible; 'rem' and a missing constant are skipped rather than
        # raising.
        if len(sources) != 1 or const is None:
            continue
        if step_op == '+':
            val -= const
        elif step_op == '-':
            val += const
        elif step_op == '*':
            if const != 0:
                val /= const
        elif step_op == '/':
            val *= const

    # is_integer() is False for inf/nan, which int() would reject.
    if val.is_integer():
        return root_var, op, str(int(val))
    return root_var, op, str(val)
dec_part = s[digits:] if decimal > 0 else '0' + result = float(int(int_part or '0') + int(dec_part or '0') / (10 ** decimal)) + return -result if neg else result + try: + return float(val) + except (ValueError, TypeError): + return 0.0 + + def float_to_raw(val, pi): + if pi.get('type') == 'numeric': + digits = pi.get('digits', 0) + decimal = pi.get('decimal', 0) + signed = pi.get('signed', False) + scaled = int(round(val * (10 ** decimal))) + if not signed and scaled < 0: + scaled = 0 + capped = abs(scaled) % (10 ** (digits + decimal)) + int_part = str(capped // (10 ** decimal)).zfill(digits) + dec_part = str(capped % (10 ** decimal)).zfill(decimal) + result = int_part + (dec_part if decimal > 0 else '') + if signed and scaled < 0: + result = '-' + result + return result + return str(val) + + def literal_to_raw(literal, pi): + ftype = pi.get('type', 'unknown') + if ftype == 'numeric': + key = literal.upper() + if key in FIGURATIVE_NUMERIC: + v = FIGURATIVE_NUMERIC[key] + if v is None: + digits = pi.get('digits', 0) + decimal = pi.get('decimal', 0) + v = 10 ** (digits + decimal) - 1 + return float_to_raw(v, pi) + try: + return float_to_raw(float(literal), pi) + except ValueError: + return float_to_raw(0.0, pi) + if ftype in ('alphanumeric', 'alphabetic'): + key = literal.upper() + if key in FIGURATIVE_ALPHA: + ch = FIGURATIVE_ALPHA[key] + return ch[0].ljust(pi.get('length', 1), ch[0]) + return literal.ljust(pi.get('length', len(literal)))[:pi.get('length', len(literal))] + return literal + + pi_map = {f['name']: f.get('pic_info', {}) for f in fields} + if file_sec is None: + file_sec = {} + + # Flatten: {tgt: [info1, info2]} → [(tgt, info1), (tgt, info2)] + flat_list = [] + for tgt, asgn_val in assignments.items(): + if isinstance(asgn_val, list): + for asgn in asgn_val: + flat_list.append((tgt, asgn)) + elif isinstance(asgn_val, dict): + flat_list.append((tgt, asgn_val)) + + _MAX_CONVERGE = 20 + + # 识别有"锚定赋值"(非自引用赋值,如 MOVE literal 或不同字段的 MOVE) 的 target + _anchored 
= set() + for tgt, asgn in flat_list: + if asgn.get('type') != 'compute': + _anchored.add(tgt) + else: + sv = asgn.get('source_vars', []) + if not (len(sv) == 1 and sv[0] == tgt) and not (len(sv) >= 2 and tgt == sv[0]): + _anchored.add(tgt) + + for _converge_iter in range(_MAX_CONVERGE): + _old = dict(rec) + + # Pass 1: variable-to-variable MOVE + for tgt, asgn in flat_list: + if asgn['type'] == 'move' and asgn['source_vars']: + src = asgn['source_vars'][0] + resolved_tgt = _resolve_subscript(tgt, rec) + resolved_src = _resolve_subscript(src, rec) + if resolved_src in rec: + rec[resolved_tgt] = rec[resolved_src] + + # Pass 2: literal MOVE + for tgt, asgn in flat_list: + if asgn['type'] == 'move_literal': + resolved_tgt = _resolve_subscript(tgt, rec) + pi = pi_map.get(resolved_tgt, {}) + rec[resolved_tgt] = literal_to_raw(asgn['literal'], pi) + + # Pass 3: INITIALIZE + for tgt, asgn in flat_list: + if asgn['type'] == 'initialize': + resolved_tgt = _resolve_subscript(tgt, rec) + pi = pi_map.get(resolved_tgt, {}) + ftype = pi.get('type', 'unknown') + replacing = asgn.get('replacing', {}) + if replacing: + mapped = replacing.get(ftype.upper(), None) + if mapped: + rec[resolved_tgt] = literal_to_raw(mapped, pi) + else: + if ftype == 'numeric': + rec[resolved_tgt] = float_to_raw(0.0, pi) + else: + rec[resolved_tgt] = literal_to_raw('SPACE', pi) + else: + if ftype == 'numeric': + rec[resolved_tgt] = float_to_raw(0.0, pi) + else: + rec[resolved_tgt] = literal_to_raw('SPACE', pi) + + # Pass 3.5: READ INTO + for tgt, asgn in flat_list: + if asgn['type'] == 'read_into': + fname = asgn.get('file', '') + if fname in file_sec: + fd_children = _init_child_names(file_sec[fname][0], fields) + ws_children = _init_child_names(tgt, fields) + for ws_c in ws_children: + fd_candidate = ws_c + if ws_c.startswith('WS-'): + fd_candidate = ws_c[3:] + if fd_candidate in rec: + rec[ws_c] = rec[fd_candidate] + else: + idx = ws_children.index(ws_c) + if idx < len(fd_children) and 
fd_children[idx] in rec: + rec[ws_c] = rec[fd_children[idx]] + rec[tgt] = ''.join(str(rec.get(c, '')) for c in ws_children) + + # Pass 4: COMPUTE + for tgt, asgn in flat_list: + if asgn['type'] == 'compute' and asgn['source_vars'] and asgn['op'] is not None: + resolved_tgt = _resolve_subscript(tgt, rec) + pi_tgt = pi_map.get(resolved_tgt, {}) + if len(asgn['source_vars']) == 1: + src = asgn['source_vars'][0] + resolved_src = _resolve_subscript(src, rec) + # 无锚定的自引用 COMPUTE(如 ADD 1 TO X):只在第 0 轮应用一次 + if resolved_tgt == resolved_src and tgt not in _anchored and _converge_iter > 0: + continue + if resolved_src in rec: + sv = raw_to_float(rec[resolved_src], pi_map.get(resolved_src, {})) + c = asgn.get('const', 0) + if asgn['op'] == 'rem': + quotient = int(sv / c) if c != 0 else 0 + result = sv - quotient * c + else: + result = {'+': sv + c, '-': sv - c, '*': sv * c, '/': sv / c if c != 0 else sv}[asgn['op']] + rec[resolved_tgt] = float_to_raw(result, pi_tgt) + elif len(asgn['source_vars']) == 2: + v1, v2 = asgn['source_vars'] + resolved_v1 = _resolve_subscript(v1, rec) + resolved_v2 = _resolve_subscript(v2, rec) + # 无锚定的自引用 COMPUTE(如 ADD X TO Y 且 Y 无前置 MOVE) + if resolved_tgt == resolved_v1 and tgt not in _anchored and _converge_iter > 0: + continue + if resolved_v1 in rec and resolved_v2 in rec: + sv1 = raw_to_float(rec[resolved_v1], pi_map.get(resolved_v1, {})) + sv2 = raw_to_float(rec[resolved_v2], pi_map.get(resolved_v2, {})) + if asgn['op'] == 'rem': + quotient = int(sv1 / sv2) if sv2 != 0 else 0 + result = sv1 - quotient * sv2 + else: + result = {'+': sv1 + sv2, '-': sv1 - sv2, '*': sv1 * sv2, '/': sv1 / sv2 if sv2 != 0 else sv1}[asgn['op']] + rec[resolved_tgt] = float_to_raw(result, pi_tgt) + elif len(asgn['source_vars']) >= 3 and asgn['op'] == '+': + total = 0 + all_found = True + for v in asgn['source_vars']: + resolved_v = _resolve_subscript(v, rec) + if resolved_v in rec: + total += raw_to_float(rec[resolved_v], pi_map.get(resolved_v, {})) + else: + 
all_found = False + break + if all_found: + rec[resolved_tgt] = float_to_raw(total, pi_tgt) + + # Pass 5: STRING / UNSTRING + for tgt, asgn in flat_list: + if asgn['type'] == 'string_concat': + resolved_tgt = _resolve_subscript(tgt, rec) + pi = pi_map.get(resolved_tgt, {}) + parts = [] + for v in asgn.get('source_vars', []): + resolved_v = _resolve_subscript(v, rec) + if resolved_v in rec: + parts.append(str(rec[resolved_v])) + val = ''.join(parts) + if pi.get('type') in ('alphanumeric', 'alphabetic'): + val = val.ljust(pi.get('length', len(val)))[:pi.get('length', len(val))] + rec[resolved_tgt] = val + elif asgn['type'] == 'unstring_split': + resolved_tgt = _resolve_subscript(tgt, rec) + pi = pi_map.get(resolved_tgt, {}) + src_var = asgn.get('source_vars', [None])[0] + resolved_src = _resolve_subscript(src_var, rec) if src_var else None + idx = asgn.get('index', 0) + if resolved_src and resolved_src in rec: + src_val = str(rec[resolved_src]) + ftype = pi.get('type', 'unknown') + if idx == 0: + val = src_val + else: + val = ' ' if ftype in ('alphanumeric', 'alphabetic') else '0' + if ftype in ('alphanumeric', 'alphabetic'): + val = val.ljust(pi.get('length', len(val)))[:pi.get('length', len(val))] + rec[resolved_tgt] = val + + # Pass 6: READ INTO / WRITE FROM + for tgt, asgn in flat_list: + if asgn['type'] == 'read_into': + fname = asgn.get('file', '') + if fname in file_sec: + children = _init_child_names(file_sec[fname][0], fields) + rec[tgt] = ''.join(str(rec.get(c, '')) for c in children) + elif asgn['type'] == 'write_from': + buf = tgt + rec_name = asgn.get('file', '') + children = _init_child_names(rec_name, fields) + if children: + src = str(rec.get(buf, '')) + pos = 0 + for c in children: + pi = pi_map.get(c, {}) + length = pi.get('digits', 0) + pi.get('decimal', 0) or pi.get('length', 0) + if length > 0: + chunk = src[pos:pos + length] + if not chunk: + chunk = '0' if pi.get('type') == 'numeric' else ' ' + rec[c] = chunk.ljust(length) + pos += length + + # 
def classify_field_roles(tree, assignments, fields, source=None, proc_text=None):
    """Classify every field as input, output, inout or unused.

    Roles come from two sources, in priority order: the OPEN direction of the
    file whose FD record contains the field, then static read/write counting
    over the branch tree (plus a textual ACCEPT/DISPLAY scan of ``proc_text``).

    Args:
        tree: Root node of the branch tree. BrSeq, BrIf, BrEval, BrPerform,
            CallNode and Assign nodes are inspected; other node types are
            ignored.
        assignments: Not referenced by this function.
        fields: List of field dicts; ``name`` is read for every entry and
            ``section`` is consulted for the LINKAGE default.
        source: Program source handed to the FILE-CONTROL / FILE SECTION
            parsers. FD roles are only computed when both ``source`` and
            ``proc_text`` are truthy.
        proc_text: PROCEDURE DIVISION text scanned for OPEN, ACCEPT and
            DISPLAY statements.

    Returns:
        dict mapping field name to 'input' | 'output' | 'inout' | 'unused'.
    """
    # Phase 0: resolve roles from FD records and OPEN directions.
    fd_roles = {}
    if source and proc_text:
        from .read import parse_file_control, parse_file_section, scan_open_statements
        # NOTE(review): file_ctl is never read again in this function.
        file_ctl = parse_file_control(source)
        file_sec = parse_file_section(source)
        open_dir = scan_open_statements(proc_text)
        for iname, direction in open_dir.items():
            if iname in file_sec:
                for rec_name in file_sec[iname]:
                    if direction == 'INPUT':
                        fd_roles[rec_name] = 'input'
                    elif direction == 'OUTPUT':
                        fd_roles[rec_name] = 'output'
                    elif direction == 'I-O':
                        fd_roles[rec_name] = 'inout'
        # Propagate each record's role to its child fields.
        for rec_name, role in list(fd_roles.items()):
            for child in _init_child_names(rec_name, fields):
                fd_roles[child] = role

    # Per-field read/write tallies filled in by the tree walk below.
    counts = {f['name']: {'read': 0, 'write': 0} for f in fields}

    def _walk(node):
        # Recursively tally reads and writes for every node type we understand.
        if isinstance(node, BrIf):
            if node.cond_tree:
                # Every field tested in the condition counts as a read.
                for leaf in collect_leaves(node.cond_tree):
                    name = _basename(leaf.field)
                    if name in counts:
                        counts[name]['read'] += 1
            _walk(node.true_seq)
            _walk(node.false_seq)
        elif isinstance(node, BrEval):
            # Only node.subject is counted here.
            name = _basename(node.subject)
            if name in counts:
                counts[name]['read'] += 1
            for _, seq in node.when_list:
                _walk(seq)
            _walk(node.other_seq)
        elif isinstance(node, BrPerform):
            if node.condition:
                parsed = parse_single_condition(node.condition)
                if parsed:
                    name = _basename(parsed[0])
                    if name in counts:
                        counts[name]['read'] += 1
            if node.varying_var:
                # The VARYING control variable is written by the loop itself.
                name = _basename(node.varying_var)
                if name in counts:
                    counts[name]['write'] += 1
            _walk(node.body_seq)
        elif isinstance(node, CallNode):
            for p in node.using_params:
                name = _basename(p.get("name", ""))
                mechanism = p.get("mechanism", "reference")
                if name in counts:
                    counts[name]["read"] += 1
                    # A by-reference argument is also counted as a write.
                    if mechanism.lower() == "reference":
                        counts[name]["write"] += 1
        elif isinstance(node, Assign):
            tgt_base = _basename(node.target)
            atype = node.source_info.get('type')
            if atype == 'read_into':
                if tgt_base in counts:
                    counts[tgt_base]['write'] += 1
            elif atype == 'write_from':
                # The target of a write_from assignment is counted as a read.
                if tgt_base in counts:
                    counts[tgt_base]['read'] += 1
            elif atype == 'set_true':
                if tgt_base in counts:
                    counts[tgt_base]['write'] += 1
            else:
                # Any other assignment: target written, source variables read.
                if tgt_base in counts:
                    counts[tgt_base]['write'] += 1
                for v in node.source_info.get('source_vars', []):
                    v_base = _basename(v)
                    if v_base in counts:
                        counts[v_base]['read'] += 1
                # INITIALIZE on a group also writes every child field.
                if atype == 'initialize' and tgt_base in counts:
                    for child in _init_child_names(tgt_base, fields):
                        if child in counts:
                            counts[child]['write'] += 1
        elif isinstance(node, BrSeq):
            for c in node.children:
                _walk(c)

    _walk(tree)

    # Extra phase: textual scan of proc_text for ACCEPT / DISPLAY.
    # NOTE(review): the patterns are not anchored on a word boundary, so text
    # such as "WS-ACCEPT X" would also match, and only the first operand after
    # the keyword is captured -- confirm this is acceptable.
    if proc_text:
        for m in re.finditer(r'ACCEPT\s+(\w[\w-]*)', proc_text):
            name = _basename(m.group(1).upper())
            if name in counts:
                counts[name]['write'] += 1
        for m in re.finditer(r'DISPLAY\s+(\w[\w-]*)', proc_text):
            name = _basename(m.group(1).upper())
            if name in counts:
                counts[name]['read'] += 1

    # LINKAGE fields with no recorded read or write default to input.
    for f in fields:
        if f.get('section') == 'LINKAGE':
            name = f['name']
            if name in counts and counts[name]['read'] == 0 and counts[name]['write'] == 0:
                counts[name]['read'] = 1

    result = {}
    for name, c in counts.items():
        # An FD/OPEN-derived role always wins over the static tallies.
        if name in fd_roles:
            result[name] = fd_roles[name]
            continue
        if c['read'] > 0 and c['write'] > 0:
            result[name] = 'inout'
        elif c['write'] > 0:
            result[name] = 'output'
        elif c['read'] > 0:
            result[name] = 'input'
        else:
            result[name] = 'unused'
    # Make sure FD record names appear even when they are not listed in fields.
    for name, role in fd_roles.items():
        if name not in result:
            result[name] = role
    return result
@dataclass
class LeafStat:
    """Coverage record for one leaf condition of a compound decision."""

    field: str   # left-hand operand of the leaf condition
    op: str      # comparison operator text
    value: str   # right-hand operand
    covered_true: bool = False    # some path constrained this leaf to true
    covered_false: bool = False   # some path constrained this leaf to false


@dataclass
class DecisionPoint:
    """One decision (IF / EVALUATE / PERFORM) together with its coverage state."""

    id: int
    kind: str  # "IF" | "EVALUATE" | "PERFORM"
    label: str
    branch_names: list[str]
    covered_branches: set = field(default_factory=set)
    active_branches: set = field(default_factory=set)
    implied_branches: set = field(default_factory=set)
    leaves: list[LeafStat] = field(default_factory=list)
    source_line: int = 0
    when_list: list = field(default_factory=list)
    cond_tree: object = None
    cond_leaves: list = field(default_factory=list)
    # Parsed simple condition. The collector used to attach this attribute
    # dynamically and the markers read it through getattr(dp, 'parsed', None);
    # declaring it makes the attribute part of the model. It is excluded from
    # repr and equality so instances compare and print as they did before.
    parsed: object = field(default=None, repr=False, compare=False)
points.append(dp) + for _, seq in node.when_list: + p, l = _walk_collect(seq, fields, counter) + points.extend(p); all_leaves.extend(l) + p, l = _walk_collect(node.other_seq, fields, counter) + points.extend(p); all_leaves.extend(l) + + elif isinstance(node, BrPerform): + if node.perf_type in ('until', 'para_until', 'varying', 'para_varying'): + counter[0] += 1 + dp = DecisionPoint(id=counter[0], kind='PERFORM', + label=node.condition or '', + branch_names=['Enter', 'Skip']) + simple = parse_single_condition(node.condition) if node.condition else None + if simple and is_field(simple[0], fields): + dp.parsed = simple + points.append(dp) + p, l = _walk_collect(node.body_seq, fields, counter) + points.extend(p); all_leaves.extend(l) + + elif isinstance(node, BrSeq): + for child in node.children: + p, l = collect_decision_points(child, fields, counter) + points.extend(p); all_leaves.extend(l) + + return points, all_leaves + + +def _walk_collect(node, fields, counter): + return collect_decision_points(node, fields, counter) + + +# ── 覆盖率标记 ── + +def mark_coverage(decision_points, leaf_stats, branch_paths, fields): + for cons, _assign in branch_paths: + for dp in decision_points: + if dp.kind == 'IF': + _mark_if(dp, cons) + elif dp.kind == 'EVALUATE': + _mark_eval(dp, cons) + elif dp.kind == 'PERFORM': + _mark_perform(dp, cons) + for leaf in leaf_stats: + for c in cons: + if _match_leaf(c, leaf): + if c[3]: + leaf.covered_true = True + else: + leaf.covered_false = True + + for dp in decision_points: + _infer_implied(dp) + + +def _match_constraint(c, parsed): + if len(c) != 4: + return False + return (c[0] == parsed[0] and c[1] == parsed[1] + and str(c[2]) == str(parsed[2])) + + +def _match_leaf(c, leaf): + if len(c) != 4: + return False + return (c[0] == leaf.field and c[1] == leaf.op + and str(c[2]) == str(leaf.value)) + + +def _mark_if(dp, cons): + simple = getattr(dp, 'parsed', None) + if simple: + for c in cons: + if _match_constraint(c, simple): + if c[3]: + 
def _mark_eval(dp, cons):
    """Mark the EVALUATE branches of ``dp`` that the constraint list ``cons`` selects.

    Two shapes are handled:

    * ``EVALUATE TRUE`` (``dp.label == 'TRUE'``): every WHEN value is itself a
      condition text. It is parsed as a single condition first and, failing
      that, as a compound condition whose leaves must all be bound by ``cons``.
    * Any other subject: constraints on ``dp.label`` with operator ``'='``
      select ``WHEN <value>``; operator ``'not_in'`` selects ``OTHER``.

    Matching branch names are added to ``dp.active_branches``; nothing is
    returned.
    """
    if dp.label == 'TRUE':
        for when_val, _ in dp.when_list:
            parsed = parse_single_condition(when_val)
            if parsed:
                # Simple WHEN condition: active when a constraint tests the same
                # (field, op, value). The constraint's truth value c[3] is not
                # consulted here.
                for c in cons:
                    if _match_constraint(c, parsed):
                        name = f"WHEN {when_val}"
                        if name in dp.branch_names:
                            dp.active_branches.add(name)
            else:
                cond_tree = parse_compound_condition(when_val)
                if cond_tree and not isinstance(cond_tree, CondLeaf):
                    leaves = list(collect_leaves(cond_tree))
                    # Bind each leaf to the truth value of the first matching
                    # constraint (leaves are used as dict keys).
                    assignment = {}
                    for leaf in leaves:
                        for c in cons:
                            if _match_leaf(c, leaf):
                                assignment[leaf] = c[3]
                                break
                    # Only evaluate once every leaf has a truth value.
                    if len(assignment) == len(leaves):
                        if evaluate_tree(cond_tree, assignment):
                            name = f"WHEN {when_val}"
                            if name in dp.branch_names:
                                dp.active_branches.add(name)
        # NOTE(review): nothing on the EVALUATE TRUE path adds 'OTHER' to
        # active_branches -- confirm whether WHEN OTHER coverage is expected here.
        return
    for c in cons:
        if c[0] == dp.label and c[1] == '=':
            name = f"WHEN {c[2]}"
            if name in dp.branch_names:
                dp.active_branches.add(name)
        elif c[0] == dp.label and c[1] == 'not_in':
            # A not_in constraint on the subject is how the OTHER branch is encoded.
            dp.active_branches.add('OTHER')
dp.active_branches.add('Enter') + + +def _get_fields_in_cond(cond_text): + return re.findall(r'[A-Z][A-Z0-9-]*', cond_text.upper()) + + +def _infer_implied(dp): + dp.implied_branches.update(dp.active_branches) + + +# ── 行号定位(基于原始源文本)── + +def locate_decision_lines(decision_points, raw_source): + """在原始源文本中搜索每个决策点的近似行号""" + lines = raw_source.upper().splitlines() + for dp in decision_points: + patterns = _build_search_patterns(dp) + for i, line in enumerate(lines): + for pat in patterns: + if re.search(pat, line): + dp.source_line = i + 1 + break + if dp.source_line: + break + + +def _normalize(text): + """标准化条件文本用于比较:去多余空白、标准化引号""" + t = re.sub(r'\s+', ' ', text).strip() + t = t.replace('"', "'") + return t + + +def _build_search_patterns(dp): + texts = [] + if dp.kind == 'IF': + texts.append((r'\bIF\b', dp.label)) + elif dp.kind == 'EVALUATE': + texts.append((r'\bEVALUATE\b', dp.label)) + elif dp.kind == 'PERFORM': + texts.append((r'\bUNTIL\b', dp.condition if hasattr(dp, 'condition') else dp.label + if dp.label else '')) + else: + return [r'$^'] # 永不匹配 + + patterns = [] + for keyword, condition in texts: + if not condition: + continue + norm_cond = _normalize(condition) + # 转义正则特殊字符,但保留空格(替换为\s+) + esc = re.escape(norm_cond) + esc = esc.replace(r'\ ', r'\s+') + esc = esc.replace(r'\'', r"['\"]") + patterns.append(keyword + r'\s+' + esc) + if not patterns: + return [r'$^'] + return patterns + + +# ── HTML 报告(详情页)── + +_DETAIL_HTML = ''' + + + + +{title} + + + + +
+ ← 覆盖率总览 + | +

{title}

+
+ +
+ +
+

📈 覆盖率概要

+
+
+
{dec_frac}
+
决策覆盖率
+
+
+
{cond_frac}
+
条件覆盖率
+
+
+
{dp_count_text}
+
决策点
+
+
+
+
+
+
{dec_pct_text}
+
+ 已覆盖 + 未覆盖 + 推断覆盖 +
+
+ + {decision_table} + + {leaf_table} + + {source_section} + +
+ +''' + + +def generate_html_report(decision_points, leaf_stats, source_lines, outpath, + filename='', index_relpath=None, covered_lines=None): + title = f"覆盖率报告 — {filename}" if filename else "覆盖率报告" + + total_branches = sum(len(dp.branch_names) for dp in decision_points) + covered_branches = sum(len(dp.active_branches) for dp in decision_points) + implied_branches = sum(len(dp.implied_branches) for dp in decision_points) + if covered_lines: + # 无分支程序:隐式 100% + total_branches = max(total_branches, 1) + covered_branches = max(covered_branches, 1) + + total_leaves = len(leaf_stats) * 2 + covered_leaves = (sum(1 for l in leaf_stats if l.covered_true) + + sum(1 for l in leaf_stats if l.covered_false)) + + # 计算数值 + is_implicit = bool(covered_lines) # 无分支程序,隐式 100% + dec_pct_val = (covered_branches / total_branches * 100) if total_branches else 0 + dec_pct_text = "100% ✓" if is_implicit else (f"{dec_pct_val:.1f}%" if total_branches else "无") + dec_frac = "全部覆盖" if is_implicit else (f"{covered_branches}/{total_branches}" if total_branches else "—") + cond_frac = f"{covered_leaves}/{total_leaves}" if total_leaves else "—" + implied_text = f'(+{implied_branches - covered_branches} 推断)' if implied_branches > covered_branches else '' + + # 颜色 + if is_implicit or not total_branches or dec_pct_val >= 100: + dec_val_cls = 'val-green' + bar_cls = '' + elif dec_pct_val >= 80: + dec_val_cls = 'val-amber' + bar_cls = ' amber' + else: + dec_val_cls = 'val-red' + bar_cls = ' red' + + if not total_leaves or covered_leaves == total_leaves: + cond_val_cls = 'val-green' + elif covered_leaves / total_leaves >= 0.8: + cond_val_cls = 'val-amber' + else: + cond_val_cls = 'val-red' + + # 决策点表格 + if decision_points: + dp_rows = [] + for dp in decision_points: + ln = str(dp.source_line) if dp.source_line else '?' 
+ branch_cells = [] + for bn in dp.branch_names: + if bn in dp.active_branches: + branch_cells.append(f'{bn} ✓') + elif bn in dp.implied_branches: + branch_cells.append(f'{bn} ○') + else: + branch_cells.append(f'{bn} ✗') + dp_rows.append(f'#{dp.id}{dp.kind}{ln}' + f'{dp.label}' + f'{" ".join(branch_cells)}') + + decision_table = f'''
+

📜 决策点

+ + + {"".join(dp_rows)} +
#类型行号条件分支
+
''' + else: + decision_table = '' + + # 叶条件表格 + if leaf_stats: + leaf_rows = [] + for leaf in leaf_stats: + t = '' if leaf.covered_true else '' + f = '' if leaf.covered_false else '' + leaf_rows.append(f'{leaf.field}{leaf.op}' + f'{leaf.value}{t}{f}') + + leaf_table = f'''
+

🔢 条件覆盖明细(叶条件)

+ + + {"".join(leaf_rows)} +
字段运算符
+
''' + else: + leaf_table = '' + + # 源码标注 + if source_lines: + line_cov = {} + for dp in decision_points: + if dp.source_line: + if dp.source_line not in line_cov: + line_cov[dp.source_line] = [] + has_missed = any(bn not in dp.active_branches for bn in dp.branch_names) + has_active = any(bn in dp.active_branches for bn in dp.branch_names) + if has_active and not has_missed: + line_cov[dp.source_line].append('hl-green') + elif has_active: + line_cov[dp.source_line].append('hl-red') + else: + line_cov[dp.source_line].append('hl-amber') + + # 无分支程序:所有 PD 行标记为已覆盖 + if covered_lines: + for ln in covered_lines: + line_cov.setdefault(ln, []).append('hl-green') + + src_lines = [] + for i, line in enumerate(source_lines, 1): + cls_list = line_cov.get(i, []) + hl = ' ' + ' '.join(cls_list) if cls_list else '' + src_lines.append(f'
' + f'{i}' + f'{line}
') + + source_section = f'''
+

📖 源码标注

+ {"".join(src_lines)} +
''' + else: + source_section = '' + + html = _DETAIL_HTML.format( + title=title, + index_relpath=index_relpath or '#', + dec_frac=dec_frac, + dec_pct_text=dec_pct_text, + dec_val_cls=dec_val_cls, + cond_frac=cond_frac, + cond_val_cls=cond_val_cls, + bar_cls=bar_cls, + bar_pct=str(int(dec_pct_val)), + decision_table=decision_table, + leaf_table=leaf_table, + source_section=source_section, + dp_count_text=('—' if is_implicit else str(len(decision_points))), + ) + + outpath = Path(outpath) + outpath.parent.mkdir(parents=True, exist_ok=True) + outpath.write_text(html, encoding='utf-8') + + +# ── 总括索引页 ── + +_INDEX_HTML = ''' + + + + +覆盖率总览 + + + + +
+

📊 覆盖率总览报告

+ {timestamp} +
+ +
+ +
+
+
{agg_dec_num}
+
决策覆盖率
+
+
+
{agg_cond_num}
+
条件覆盖率
+
+
+
{prog_count}
+
已分析程序
+
+
+
{uncovered_count}
+
未完全覆盖程序
+
+
+ +
+
+ {dec_ring_svg} +
决策覆盖率
+
+
+ {cond_ring_svg} +
条件覆盖率
+
+
+ +
+ 已覆盖 + 未覆盖 + 推断覆盖 +
+ +
+ +
+ + +
+
+ +
+ + + + + + + + + + + +{rows} + +
程序 决策分支 条件覆盖 覆盖率 状态
+
+ +
+ + + + +''' + + +def _ring_svg(pct, color_stops): + """生成 SVG 圆环 HTML。pct: 0-100 浮点数。""" + r = 54 + circ = 2 * 3.14159265 * r + offset = circ * (1 - pct / 100) if pct > 0 else circ + if pct >= 80: + stroke = '#00c853' + elif pct >= 50: + stroke = '#ff8f00' + else: + stroke = '#ff1744' + return ( + f'' + f'' + f'' + f'' + f'{pct:.0f}%' + f'覆盖率' + f'' + ) + + +def generate_coverage_index(programs, outdir): + """生成覆盖率总括索引页。""" + from datetime import datetime + timestamp = datetime.now().strftime('%Y-%m-%d %H:%M') + + agg_total = sum(p['total_branches'] for p in programs) + agg_covered = sum(p['covered_branches'] for p in programs) + agg_implied = sum(p['implied_branches'] for p in programs) + agg_ctotal = sum(p['total_conditions'] for p in programs) + agg_ccovered = sum(p['covered_conditions'] for p in programs) + + agg_dec_pct = (agg_covered / agg_total * 100) if agg_total else 0 + agg_cond_pct = (agg_ccovered / agg_ctotal * 100) if agg_ctotal else 0 + uncovered_count = sum(1 for p in programs if p['total_branches'] and + p['covered_branches'] < p['total_branches']) + + dec_num_cls = 'num-green' if agg_dec_pct == 100 else ('num-amber' if agg_dec_pct >= 80 else 'num-red') + cond_num_cls = 'num-green' if agg_cond_pct == 100 else ('num-amber' if agg_cond_pct >= 80 else 'num-red') + uncovered_num_cls = 'num-green' if uncovered_count == 0 else 'num-red' + + def sort_key(p): + if p['total_branches']: + return -p['covered_branches'] / p['total_branches'] + return -1.0 + sorted_programs = sorted(programs, key=sort_key) + + rows = [] + for p in sorted_programs: + name = p['name'] + href = p['detail_relpath'] + tb = p['total_branches'] + cb = p['covered_branches'] + ib = p['implied_branches'] + tc = p['total_conditions'] + cc = p['covered_conditions'] + imp = p.get('implicit_100', False) + + pct_dec = (cb / tb * 100) if tb else 0 + pct_text = "全部覆盖" if imp else (f"{pct_dec:.1f}%" if tb else "—") + implied_text = f'(+{ib - cb} 推断)' if ib > cb else '' + branch_text = "—" if imp 
else f"{cb}/{tb}" + cond_text = f"{cc}/{tc}" if tc else "—" + bar_pct = int(pct_dec) + + # 进度条颜色 + if imp or pct_dec >= 100: + bar_cls = '' + elif pct_dec >= 80: + bar_cls = ' amber' + else: + bar_cls = ' red' + + # 状态徽标 + if tb == 0 or (cb == tb and not (ib > cb)): + badge = '✓ 完全' + elif cb == tb and ib > cb: + badge = '○ 推断' + elif pct_dec >= 80: + badge = '⚠ 不足' + else: + badge = '✗ 欠缺' + + # 条件覆盖数字颜色 + if tc: + cond_pct = cc / tc * 100 + cond_color = 'num-green' if cond_pct == 100 else ('num-amber' if cond_pct >= 80 else 'num-red') + cond_display = f'{cond_text}' + else: + cond_display = '' + + row_class = 'row-imperfect' if cb < tb else '' + rows.append(f''' + {name} + {branch_text} {implied_text} + {cond_display} + +
+
+
+ {pct_text} +
+
+ {pct_text} +
+ + {badge} +''') + + dec_ring_svg = _ring_svg(agg_dec_pct, '') + cond_ring_svg = _ring_svg(agg_cond_pct, '') + + html = _INDEX_HTML.format( + timestamp=timestamp, + agg_dec_num=f"{agg_covered}/{agg_total}", + dec_num_cls=dec_num_cls, + agg_cond_num=f"{agg_ccovered}/{agg_ctotal}" if agg_ctotal else "无数据", + cond_num_cls=cond_num_cls, + prog_count=str(len(programs)), + uncovered_num_cls=uncovered_num_cls, + uncovered_count=str(uncovered_count), + dec_ring_svg=dec_ring_svg, + cond_ring_svg=cond_ring_svg, + rows='\n'.join(rows), + ) + + outpath = Path(outdir) / 'coverage' / 'index.html' + outpath.parent.mkdir(parents=True, exist_ok=True) + outpath.write_text(html, encoding='utf-8') + + +# ── PROCEDURE DIVISION 行范围定位(用于无分支程序标记)── + +def _find_proc_range(raw_source: str): + """返回 PROCEDURE DIVISION 的行范围 (start_line, end_line) 1-indexed,或 None。""" + lines = raw_source.splitlines() + proc_start = None + for i, line in enumerate(lines): + if re.search(r'PROCEDURE\s+DIVISION', line.upper()): + proc_start = i + 1 + break + if proc_start is None: + return None + # 找下一个 DIVISION 作为结束边界(或文件尾) + for i in range(proc_start, len(lines)): + if re.search(r'(IDENTIFICATION|DATA|ENVIRONMENT)\s+DIVISION', lines[i].upper()): + return (proc_start, i) # 不包含下一个 DIVISION + return (proc_start, len(lines) + 1) + + +# ── 接入入口 ── + +def run_coverage(branch_tree, branch_paths_with_assigns, fields, + raw_source, output_prefix, index_relpath=None): + """完整覆盖率流程:收集 → 标记 → 定位 → 输出。 + + Returns: + dict: 汇总数据,用于总括页聚合 + """ + decision_points, leaf_stats = collect_decision_points(branch_tree, fields) + + mark_coverage(decision_points, leaf_stats, branch_paths_with_assigns, fields) + + if raw_source: + locate_decision_lines(decision_points, raw_source) + + total = sum(len(dp.branch_names) for dp in decision_points) + covered = sum(len(dp.active_branches) for dp in decision_points) + implied = sum(len(dp.implied_branches) for dp in decision_points) + leaf_covered = (sum(1 for l in leaf_stats if 
l.covered_true) + + sum(1 for l in leaf_stats if l.covered_false)) + leaf_total = len(leaf_stats) * 2 + + # 无决策点但有路径 → PROCEDURE DIVISION 全部覆盖 + covered_lines = set() + if total == 0 and branch_paths_with_assigns and raw_source: + proc_range = _find_proc_range(raw_source) + if proc_range: + covered_lines.update(range(proc_range[0], proc_range[1])) + total = 1 + covered = 1 + + if output_prefix: + generate_html_report(decision_points, leaf_stats, + raw_source.splitlines() if raw_source else [], + f"{output_prefix}_coverage.html", + Path(output_prefix).stem, + index_relpath=index_relpath, + covered_lines=covered_lines) + + # 控制台摘要 + if total or leaf_total: + logger.info(f"\n=== 分支覆盖率 ===") + if covered_lines and not decision_points: + logger.info(" 程序无分支结构,全部代码已覆盖") + for dp in decision_points: + branches = [] + for bn in dp.branch_names: + if bn in dp.active_branches: + branches.append(f'{bn} [x]') + elif bn in dp.implied_branches: + branches.append(f'{bn} [o]') + else: + branches.append(f'{bn} [ ]') + ln = f":{dp.source_line}" if dp.source_line else "" + logger.info(f" #{dp.id} [{dp.kind}] {dp.label}{ln}") + logger.info(f" {' | '.join(branches)}") + + if total: + pct = covered / total * 100 + logger.info(f"\n 决策覆盖率:{covered}/{total}({pct:.1f}%)") + if leaf_total: + pct = leaf_covered / leaf_total * 100 + logger.info(f" 条件覆盖率:{leaf_covered}/{leaf_total}({pct:.1f}%)") + + if output_prefix: + logger.info(f"\n 覆盖率报告:{output_prefix}_coverage.html") + + implicit_100 = bool(covered_lines) + return { + 'name': Path(output_prefix).stem if output_prefix else '', + 'detail_relpath': ('../' + Path(output_prefix).stem + '_coverage.html' + if output_prefix else ''), + 'total_branches': total, + 'covered_branches': covered, + 'implied_branches': implied, + 'implicit_100': implicit_100, + 'total_conditions': leaf_total, + 'covered_conditions': leaf_covered, + '_decision_points': decision_points, + '_leaf_stats': leaf_stats, + } diff --git a/cobol_testgen/design.py 
b/cobol_testgen/design.py new file mode 100644 index 0000000..c0d4542 --- /dev/null +++ b/cobol_testgen/design.py @@ -0,0 +1,775 @@ +"""设计层:路径枚举 + 值生成 + 约束应用""" + +import os +import re +import logging +from . import agents, CONFIG +from .models import BrSeq, BrIf, BrEval, BrPerform, Assign, CallNode, CondNot, CondLeaf, ExitNode, GoTo +from .cond import parse_single_condition, parse_compound_condition, is_field, collect_leaves, mcdc_sets, satisfying_value +from .core import trace_to_root, invert_through_chain, propagate_assignments, _basename + +logger = logging.getLogger(__name__) + +_STOP = ('__STOP__', '', None, True) +_MAX_PATHS = 5000 +_FALLBACK_MAX_PATHS = 100 +_ACTIVE_MAX_PATHS = _MAX_PATHS +_LLM_FAILED = False + + +def _filter_stop(cons): + return [c for c in cons if c is not _STOP] + + +def _cap_paths(paths): + if len(paths) > _ACTIVE_MAX_PATHS: + return paths[:_ACTIVE_MAX_PATHS] + return paths + + +# ── 路径枚举 ── + +def _try_llm_enum_paths(node, fields): + global _LLM_FAILED + if _LLM_FAILED: + logger.debug("断路器已跳,跳过 LLM") + return None + if not CONFIG.get("llm_generator", True): + logger.debug("llm_generator 已关闭,降级规则引擎") + return None + if not os.environ.get(agents.DEEPSEEK_API_KEY_ENV): + logger.warning("DEEPSEEK_API_KEY 未设置,降级规则引擎") + return None + try: + result = agents.llm_generate_all_paths(node, fields) + if result is not None: + logger.info(f"LLM 路径生成成功,{len(result)} 条") + return result + logger.warning("LLM 返回空,降级规则引擎") + except Exception as e: + logger.error(f"LLM API 调用异常: {e}") + _LLM_FAILED = True + return None + + +def enum_paths(node, fields): + global _ACTIVE_MAX_PATHS + # === LLM 优先(整体替换整个树的路径生成) === + llm_result = _try_llm_enum_paths(node, fields) + if llm_result is not None: + _ACTIVE_MAX_PATHS = _MAX_PATHS + return llm_result + if _ACTIVE_MAX_PATHS == _MAX_PATHS: + logger.warning("降级到规则引擎(路径上限 5000 → 100)") + _ACTIVE_MAX_PATHS = _FALLBACK_MAX_PATHS + """枚举路径,每条路径返回 (constraints, assignments). + 返回 list[tuple[list[tuple], dict]]. 
+ """ + if isinstance(node, Assign): + return [([], {node.target: [node.source_info]})] + + if isinstance(node, BrSeq): + if not node.children: + return [([], {})] + paths = [([], {})] + for child in node.children: + child_paths = _cap_paths(enum_paths(child, fields)) + new_active = [] + for p_cons, p_assign in paths: + if any(c is _STOP for c in p_cons): + new_active.append((p_cons, p_assign)) + continue + for cp_cons, cp_assign in child_paths: + merged = {} + for d in (p_assign, cp_assign): + for k, v in d.items(): + merged.setdefault(k, []).extend(v if isinstance(v, list) else [v]) + merged_cons = p_cons + list(cp_cons) + new_active.append((merged_cons, merged)) + paths = _cap_paths(new_active) + return paths + + elif isinstance(node, BrIf): + parsed = parse_single_condition(node.condition, fields) + if parsed and is_field(parsed[0], fields): + field, op, val = parsed + paths = [] + true_sub = _cap_paths(enum_paths(node.true_seq, fields)) + for sp_cons, sp_assign in (true_sub or [([], {})]): + paths.append(([(field, op, val, True)] + sp_cons, sp_assign)) + false_sub = _cap_paths(enum_paths(node.false_seq, fields)) + for fp_cons, fp_assign in (false_sub or [([], {})]): + paths.append(([(field, op, val, False)] + fp_cons, fp_assign)) + return paths + # CondNot wrapping a single leaf (e.g., IF NOT WS-AMOUNT > 1000) + if node.cond_tree and isinstance(node.cond_tree, CondNot): + child = node.cond_tree.child + if isinstance(child, CondLeaf) and is_field(child.field, fields): + paths = [] + true_sub = _cap_paths(enum_paths(node.true_seq, fields)) + for sp_cons, sp_assign in (true_sub or [([], {})]): + paths.append(([(child.field, child.op, child.value, False)] + sp_cons, sp_assign)) + false_sub = _cap_paths(enum_paths(node.false_seq, fields)) + for fp_cons, fp_assign in (false_sub or [([], {})]): + paths.append(([(child.field, child.op, child.value, True)] + fp_cons, fp_assign)) + return paths + if node.cond_tree: + leaves = collect_leaves(node.cond_tree) + if leaves 
and all(is_field(l.field, fields) for l in leaves): + sets = mcdc_sets(node.cond_tree, fields) + if sets: + paths = [] + for constraints, decision in sets: + body = _cap_paths(enum_paths( + node.true_seq if decision else node.false_seq, fields + )) + for sp_cons, sp_assign in (body or [([], {})]): + paths.append((constraints + sp_cons, sp_assign)) + return paths + # CondLeaf fallback: 单 leaf(含 88-level 解析后的条件树)MC/DC 不适用 + if len(leaves) == 1: + leaf = leaves[0] + paths = [] + true_sub = _cap_paths(enum_paths(node.true_seq, fields)) + for sp_cons, sp_assign in (true_sub or [([], {})]): + paths.append(([(leaf.field, leaf.op, leaf.value, True)] + sp_cons, sp_assign)) + false_sub = _cap_paths(enum_paths(node.false_seq, fields)) + for fp_cons, fp_assign in (false_sub or [([], {})]): + paths.append(([(leaf.field, leaf.op, leaf.value, False)] + fp_cons, fp_assign)) + return paths + # Fallback: parsed condition but non-field (e.g. arithmetic expr) + if parsed: + field, op, val = parsed + paths = [] + true_sub = enum_paths(node.true_seq, fields) + for sp_cons, sp_assign in (true_sub or [([], {})]): + paths.append(([(field, op, val, True)] + sp_cons, sp_assign)) + false_sub = enum_paths(node.false_seq, fields) + for fp_cons, fp_assign in (false_sub or [([], {})]): + paths.append(([(field, op, val, False)] + fp_cons, fp_assign)) + return paths + return [([], {})] + + elif isinstance(node, BrEval): + if node.subjects: + paths = [] + prior_false_cons = [] + for values, seq in node.when_list: + sub = _cap_paths(enum_paths(seq, fields)) + for sp_cons, sp_assign in (sub or [([], {})]): + when_cons = [(node.subjects[i], '=', values[i], True) + for i in range(len(node.subjects))] + constraints = list(prior_false_cons) + when_cons + sp_cons + paths.append((constraints, sp_assign)) + for i in range(len(node.subjects)): + prior_false_cons.append((node.subjects[i], '=', values[i], False)) + if node.has_other: + sub = _cap_paths(enum_paths(node.other_seq, fields)) + for sp_cons, 
sp_assign in (sub or [([], {})]): + paths.append((list(prior_false_cons) + sp_cons, sp_assign)) + return paths + if node.subject == 'TRUE': + paths = [] + prior_false = [] + for value, seq in node.when_list: + cond = parse_compound_condition(value, fields) + if cond and isinstance(cond, CondLeaf) and is_field(cond.field, fields): + # Simple condition + sub = _cap_paths(enum_paths(seq, fields)) + for sp_cons, sp_assign in (sub or [([], {})]): + constraints = list(prior_false) + constraints.append((cond.field, cond.op, cond.value, True)) + paths.append((constraints + sp_cons, sp_assign)) + prior_false.append((cond.field, cond.op, cond.value, False)) + elif cond: + # Compound condition — use MC/DC for path generation + leaves = collect_leaves(cond) + if leaves and all(is_field(l.field, fields) for l in leaves): + sets = mcdc_sets(cond, fields) + if sets: + sub = _cap_paths(enum_paths(seq, fields)) + false_set = None + for cs, decision in sets: + if decision: + for sp_cons, sp_assign in (sub or [([], {})]): + paths.append((list(prior_false) + list(cs) + sp_cons, sp_assign)) + elif false_set is None: + false_set = cs + if false_set is not None: + prior_false.extend(false_set) + else: + prior_false = [] + break + else: + prior_false = [] + break + else: + prior_false = [] + break + else: + prior_false = [] + break + if node.has_other: + sub = _cap_paths(enum_paths(node.other_seq, fields)) + for sp_cons, sp_assign in (sub or [([], {})]): + paths.append((list(prior_false) + sp_cons, sp_assign)) + return paths + if not is_field(node.subject, fields): + return [([], {})] + paths = [] + for value, seq in node.when_list: + sub = _cap_paths(enum_paths(seq, fields)) + for sp_cons, sp_assign in (sub or [([], {})]): + paths.append(([(node.subject, '=', value, True)] + sp_cons, sp_assign)) + if node.has_other: + case_vals = [v for v, _ in node.when_list] + sub = _cap_paths(enum_paths(node.other_seq, fields)) + for sp_cons, sp_assign in (sub or [([], {})]): + 
paths.append(([(node.subject, 'not_in', case_vals, True)] + sp_cons, sp_assign)) + return paths + + elif isinstance(node, BrPerform): + if node.perf_type in ('para', 'thru'): + if node.body_seq: + return enum_paths(node.body_seq, fields) + return [([], {})] + elif node.perf_type in ('until', 'para_until', 'varying', 'para_varying'): + # 尝试单条件(现有逻辑) + parsed = parse_single_condition(node.condition, fields) + if parsed and is_field(parsed[0], fields): + field, op, val = parsed + paths = [] + false_sub = _cap_paths(enum_paths(node.body_seq, fields)) + for sp_cons, sp_assign in (false_sub or [([], {})]): + # PERFORM VARYING: 将 FROM 值作为 MOVE 赋值加入 Enter 路径 + if node.varying_from and node.varying_var: + is_fld = any(f['name'] == node.varying_from for f in fields) if fields else False + from_asgn = {'type': 'move', 'source_vars': [node.varying_from]} if is_fld else {'type': 'move_literal', 'literal': node.varying_from} + from_assign = {node.varying_var: [from_asgn]} + merged = {} + for d in (from_assign, sp_assign): + for k, v in d.items(): + merged.setdefault(k, []).extend(v if isinstance(v, list) else [v]) + sp_assign = merged + paths.append(([(field, op, val, False)] + sp_cons, sp_assign)) + paths.append(([(field, op, val, True)], {})) + return paths + # 尝试复合条件(AND/OR) + cond_tree = parse_compound_condition(node.condition, fields) + if cond_tree: + leaves = collect_leaves(cond_tree) + if leaves and all(is_field(l.field, fields) for l in leaves): + sets = mcdc_sets(cond_tree, fields) + if sets: + paths = [] + false_sub = _cap_paths(enum_paths(node.body_seq, fields)) + for sp_cons, sp_assign in (false_sub or [([], {})]): + # PERFORM VARYING: 将 FROM 值作为 MOVE 赋值加入 Enter 路径 + if node.varying_from and node.varying_var: + is_fld = any(f['name'] == node.varying_from for f in fields) if fields else False + from_asgn = {'type': 'move', 'source_vars': [node.varying_from]} if is_fld else {'type': 'move_literal', 'literal': node.varying_from} + from_assign = {node.varying_var: 
[from_asgn]} + merged = {} + for d in (from_assign, sp_assign): + for k, v in d.items(): + merged.setdefault(k, []).extend(v if isinstance(v, list) else [v]) + sp_assign = merged + for constraints, decision in sets: + if not decision: + paths.append((list(constraints) + sp_cons, sp_assign)) + for constraints, decision in sets: + if decision: + paths.append((list(constraints), {})) + if paths: + return paths + return [([], {})] + + elif isinstance(node, CallNode): + return [([], {})] + + elif isinstance(node, ExitNode): + return [([_STOP], {})] + + elif isinstance(node, GoTo): + paths = enum_paths(node.body_seq, fields) + return [([_STOP] + c, a) for c, a in paths] + + return [([], {})] + + +# ── 值生成 ── + +def seq_numeric(seq_num: int, total_digits: int) -> str: + val = seq_num % (10 ** total_digits) + if val == 0: + val = 10 ** total_digits - 1 + return str(val).zfill(total_digits) + + +def seq_alpha(seq_num: int, length: int) -> str: + letter = chr(65 + (seq_num - 1) % 26) + return letter * length + + +def seq_date(seq_num: int) -> str: + from datetime import datetime, timedelta + base = datetime(2000, 1, 1) + d = base + timedelta(days=seq_num - 1) + return d.strftime('%Y%m%d') + + +def _is_date_field(name: str) -> bool: + patterns = [r'DATE', r'YYMMDD', r'YYYYMM', r'YEAR', r'MONTH', r'DAY'] + for p in patterns: + if re.search(p, name.upper()): + return True + return False + + +_SPECIAL_VALUES = { + 'ZERO': '0', 'ZEROS': '0', 'ZEROES': '0', + 'SPACE': ' ', 'SPACES': ' ', + 'HIGH-VALUE': '\xff', 'HIGH-VALUES': '\xff', + 'LOW-VALUE': '\x00', 'LOW-VALUES': '\x00', + 'QUOTE': "'", 'QUOTES': "'", + 'ALL': '', +} + + +def _apply_value(field: dict, rec: dict) -> bool: + """尝试应用 VALUE 子句的初始值。返回 True 表示已处理。""" + raw = field.get('value') + if raw is None: + return False + val = str(raw).strip("'\"").strip() + name = field['name'] + pi = field.get('pic_info', {}) + + # 处理 COBOL 特殊值 + if val.upper() in _SPECIAL_VALUES: + val = _SPECIAL_VALUES[val.upper()] + + ftype = 
pi.get('type', 'unknown') + if ftype == 'numeric': + digits = pi.get('digits', 0) + pi.get('decimal', 0) + if digits: + rec[name] = val.zfill(digits) + else: + rec[name] = val + else: + length = pi.get('length', 0) or 1 + rec[name] = val.ljust(length)[:length] + return True + + +def _children_of(group_name: str, fields: list) -> list: + """返回组项目 group_name 在 fields 中的直属子字段列表(按声明顺序)。 + 终止条件:遇到同/更高级别(sibling/组边界)或 77 级(独立字段)。 + """ + result = [] + group_level = None + found = False + for f in fields: + if not found and f['name'] == group_name: + group_level = f['level'] + found = True + continue + if found: + if f['level'] <= group_level or f['level'] == 77: + break + # 88-level 是条件名,不计为子字段 + if f.get('is_88'): + continue + result.append(f) + return result + + +def _make_numeric_value(idx: int, record_num: int, total_digits: int) -> str: + for step in (100, 10, 1): + val = idx * step + record_num + if val < 10 ** total_digits: + return str(val).zfill(total_digits) + return str(record_num).zfill(total_digits) + + +def _make_alpha_value(idx: int, record_num: int, length: int) -> str: + if length == 1: + ch = chr(65 + (idx + record_num - 2) % 26) + return ch + letter = chr(65 + (idx - 1) % 26) + return letter + str(record_num).zfill(length - 1) + + +def make_base_record(seq_num: int, fields: list) -> dict: + rec = {} + redefines_map = {} # 标量 REDEFINES: parent_name → [child_names] + group_redefines = [] # 组 REDEFINES: [(redef_name, target_name)] + filler_key_counter = 0 + numeric_idx = 0 + alpha_idx = 0 + record_num = seq_num + + for f in fields: + name = f['name'] + + if f.get('is_88'): + continue + + if f.get('redefines'): + parent = f['redefines'] + if f.get('pic'): + # 标量 REDEFINES(有 PIC,如 WS-AMOUNT-DISP REDEFINES WS-AMOUNT PIC X(9)) + redefines_map.setdefault(parent, []).append(name) + continue + else: + # 组 REDEFINES(无 PIC,如 CUST-ADDR2 REDEFINES CUST-ADDR) + group_redefines.append((name, parent)) + # 不 continue — 组本身无 PIC 会在下方"组项目跳过"处理 + # 其子字段作为独立字段正常走循环 + + if 
f.get('is_filler'): + if name in rec: + filler_key_counter += 1 + name = f'FILLER_{filler_key_counter + 1}' + rec[name] = 'x' * (f.get('pic_info', {}).get('length', 0) or 1) + continue + + # Pass 0: VALUE 子句初始值优先 + if _apply_value(f, rec): + continue + + # 组项目(无 PIC)跳过 + if not f.get('pic'): + continue + + pi = f.get('pic_info', {}) + ftype = pi.get('type', 'unknown') + digits = pi.get('digits', 0) + decimal = pi.get('decimal', 0) + length = pi.get('length', 0) + + if ftype == 'numeric': + if _is_date_field(name): + rec[name] = seq_date(record_num) + else: + numeric_idx += 1 + rec[name] = _make_numeric_value(numeric_idx, record_num, digits + decimal) + elif ftype in ('alphanumeric', 'alphabetic'): + alpha_idx += 1 + rec[name] = _make_alpha_value(alpha_idx, record_num, length or 1) + elif ftype == 'numeric-edited': + numeric_idx += 1 + raw = _make_numeric_value(numeric_idx, record_num, digits + decimal) + rec[name] = raw.rjust(length) + else: + alpha_idx += 1 + rec[name] = _make_alpha_value(alpha_idx, record_num, 8) + + # Pass 2a: 标量 REDEFINES 复制 + for parent_name, child_names in redefines_map.items(): + if parent_name in rec: + for child_name in child_names: + rec[child_name] = rec[parent_name] + + # Pass 2b: 组 REDEFINES 按位置递归复制子字段 + for redef_name, target_name in group_redefines: + redef_kids = _children_of(redef_name, fields) + tgt_kids = _children_of(target_name, fields) + tgt_idx = 0 + for i, rk in enumerate(redef_kids): + if tgt_idx >= len(tgt_kids): + break + if i == len(redef_kids) - 1 and len(redef_kids) < len(tgt_kids): + # 最后一个 REDEFINES 子字段,且目标更多 → 拼接剩余所有目标值 + parts = [rec.get(tk['name'], '') for tk in tgt_kids[tgt_idx:]] + rec[rk['name']] = ''.join(parts) + elif i == len(redef_kids) - 1 and len(redef_kids) > len(tgt_kids): + # REDEFINES 子字段更多 → 最后一个 REDEFINES 子字段取最后目标值 + rec[rk['name']] = rec.get(tgt_kids[-1]['name'], '') + else: + rec[rk['name']] = rec.get(tgt_kids[tgt_idx]['name'], '') + tgt_idx += 1 + + return rec + + +# ── 约束应用 ── + +def 
_check_constraint_satisfied(rec, field_name, operator, value, want_true, fields): + """检查 field_name 当前值是否满足该约束。满足返回 True。""" + for f in fields: + if f['name'] == field_name: + pi = f.get('pic_info', {}) + ftype = pi.get('type', 'unknown') + val = rec.get(field_name) + if val is None: + return False + if operator == 'not_in': + cases = value if isinstance(value, list) else [] + return str(val) not in cases + if ftype == 'numeric': + try: + num_val = int(float(str(val))) + num_target = int(float(str(value))) + except (ValueError, TypeError): + return False + if operator in ('>=', '>', '<', '<=', '=', '<>'): + if operator == '>=': ok = num_val >= num_target + elif operator == '>': ok = num_val > num_target + elif operator == '<': ok = num_val < num_target + elif operator == '<=': ok = num_val <= num_target + elif operator == '=': ok = num_val == num_target + elif operator == '<>': ok = num_val != num_target + return ok == want_true + return True + else: + s_val = str(val).strip().upper() + s_target = str(value).strip().upper() + eq = s_val == s_target + if operator == '=': + return eq == want_true + elif operator == '<>': + return (not eq) == want_true + return True + return False + + +_ARITH_BOUNDS = { + 'left_big_ops': {'>', '>=', '<>'}, + 'left_small_ops': {'<', '<='}, +} + +def _arith_pic_info(field_name, fields): + for f in fields: + if f['name'] == field_name.upper(): + return f.get('pic_info', {}) + return {} + +def _arith_numeric_pick(field_name, want_big, fields): + """为字段选一个大值或小值,返回字符串。""" + pi = _arith_pic_info(field_name, fields) + if pi.get('type') != 'numeric': + return None + digits = pi.get('digits', 0) + decimal = pi.get('decimal', 0) + total = digits + decimal + max_val = 10 ** total - 1 + if want_big: + pick = int(max_val * 0.7) + else: + pick = 1 + int_part = str(pick // (10 ** decimal)).zfill(digits) + dec_part = str(pick % (10 ** decimal)).zfill(decimal) + if decimal == 0: + return int_part + return int_part + dec_part + +def 
_apply_arith_constraint(rec, field_name, operator, value, want_true, fields): + """对算术表达式条件进行字段值 steering。 + + 例如 A + B > C (want_true=True): + - 左值字段(A, B)设大 → 右值字段(C)设小 + 例如 A + B <= C (want_true=True): + - 左值字段设小 → 右值字段设大 + + 这是启发式 steering,不是精确求解。 + 主要目标是保证分支可达,不保证边界值精确。 + """ + # 1. 提取左值表达式中的所有字段名(大写) + tokens = re.findall(r'\b[A-Z][A-Z0-9-]*(?:\([^)]*\))?\b', field_name.upper()) + left_fields = [t for t in tokens if any(f['name'] == t for f in fields)] + + # 2. 右值是否也为字段 + right_field = value if any(f['name'] == value for f in fields) else None + + if not left_fields: + logger.debug(f"算术表达式无法提取字段: {field_name}") + return + + # 3. 确定方向:want_true 时左值应大还是小 + if operator in _ARITH_BOUNDS['left_big_ops']: + left_big = want_true + elif operator in _ARITH_BOUNDS['left_small_ops']: + left_big = not want_true + else: + left_big = want_true + + # 4. 设置左值字段 + for lf in left_fields: + pick = _arith_numeric_pick(lf, left_big, fields) + if pick is not None: + rec[lf] = pick + + # 5. 设置右值字段(如果有) + if right_field: + pick = _arith_numeric_pick(right_field, not left_big, fields) + if pick is not None: + rec[right_field] = pick + + +def apply_constraint(rec, field_name, operator, value, want_true, fields, assignments=None, path_assign=None): + # 标准化字段名:去除括号内空格(WS-CELL ( 1, 1 ) → WS-CELL(1,1)) + field_name = re.sub(r'\s*([(),])\s*', r'\1', field_name) + # 变量下标解析:WS-FIXED-VALUE(WS-IDX) → WS-FIXED-VALUE(1) + vm = re.match(r'^(\w[\w-]*)\((\w[\w-]*)\)$', field_name) + if vm: + base_var, subscript_var = vm.groups() + if subscript_var in rec: + try: + resolved_name = f'{base_var}({int(rec[subscript_var])})' + if any(f['name'] == resolved_name for f in fields): + apply_constraint(rec, resolved_name, operator, value, want_true, fields, assignments, path_assign) + return + except (ValueError, TypeError): + pass + # 下标传播:无下标约束 → 应用到所有下标变体 + base = _basename(field_name) + subscripted = [f for f in fields if f['name'] != base and _basename(f['name']) == base] + if subscripted and field_name 
== base: + for sf in subscripted: + apply_constraint(rec, sf['name'], operator, value, want_true, fields, assignments, path_assign) + return + + # REDEFINES 字段的约束重定向到父字段(共享存储) + for f in fields: + if f['name'] == field_name: + if f.get('is_filler'): + return + if f.get('redefines'): + parent_name = f['redefines'] + logger.debug(f"REDEFINES 约束重定向: {field_name} → {parent_name}") + apply_constraint(rec, parent_name, operator, value, want_true, fields, assignments, path_assign) + return + break + if assignments: + root_var, chain = trace_to_root(field_name, assignments, fields, path_assign) + if root_var != field_name: + new_field_name, new_op, new_val = invert_through_chain(root_var, chain, operator, value) + if any(f['name'] == new_field_name for f in fields): + field_name, operator, value = new_field_name, new_op, new_val + + # 如果当前值已满足该约束,跳过覆盖(保持先前约束的一致性) + if _check_constraint_satisfied(rec, field_name, operator, value, want_true, fields): + return + + if operator == 'not_in': + for f in fields: + if f['name'] == field_name: + pi = f.get('pic_info', {}) + cases = value if isinstance(value, list) else [] + ftype = pi.get('type', 'unknown') + if ftype in ('alphanumeric', 'alphabetic'): + for c in 'ABCDEFGHIJKLMNOPQRSTUVWXYZ': + if c not in cases: + rec[field_name] = c.ljust(pi.get('length', 1), c) + return + else: + for n in range(1, 100): + if str(n) not in cases: + rec[field_name] = str(n).zfill(pi.get('digits', 0) + pi.get('decimal', 0)) + return + return + # 字段间比较(值侧也是字段名) + if any(f['name'] == value for f in fields): + if re.search(r'[+\-*/]', field_name): + _apply_arith_constraint(rec, field_name, operator, value, want_true, fields) + else: + logger.debug(f"字段间比较约束跳过:{field_name} {operator} {value}") + return + for f in fields: + if f['name'] == field_name: + pi = f.get('pic_info', {}) + val = satisfying_value(pi, operator, value, want_true) + rec[field_name] = val + return + + +# ── 记录生成入口 ── + +def sync_redefined_fields(rec, fields): + """赋值/约束后同步 REDEFINES 
字段:父字段的值拷贝到所有 REDEFINES 子字段。""" + redefines_map = {} + group_redefines = [] + for f in fields: + if f.get('is_88') or f.get('is_filler'): + continue + if f.get('redefines') and f.get('pic'): + redefines_map.setdefault(f['redefines'], []).append(f['name']) + elif f.get('redefines') and not f.get('pic'): + group_redefines.append((f['name'], f['redefines'])) + for parent_name, child_names in redefines_map.items(): + if parent_name in rec: + for child_name in child_names: + rec[child_name] = rec[parent_name] + for redef_name, target_name in group_redefines: + redef_kids = _children_of(redef_name, fields) + tgt_kids = _children_of(target_name, fields) + tgt_idx = 0 + for i, rk in enumerate(redef_kids): + if tgt_idx >= len(tgt_kids): + break + if i == len(redef_kids) - 1 and len(redef_kids) < len(tgt_kids): + parts = [rec.get(tk['name'], '') for tk in tgt_kids[tgt_idx:]] + rec[rk['name']] = ''.join(parts) + elif i == len(redef_kids) - 1 and len(redef_kids) > len(tgt_kids): + rec[rk['name']] = rec.get(tgt_kids[-1]['name'], '') + else: + rec[rk['name']] = rec.get(tgt_kids[tgt_idx]['name'], '') + tgt_idx += 1 + + +def apply_occurs_depending(rec, fields): + """根据 OCCURS DEPENDING ON 变量的当前值,清零超范围的下标字段。""" + for f in fields: + dep_var = f.get('occurs_depending') + if not dep_var: + continue + name = f['name'] + m = re.search(r'\((\d+)\)$', name) + if not m: + continue + sub = int(m.group(1)) + max_val = int(rec.get(dep_var, 0)) + if sub <= max_val: + continue + pi = f.get('pic_info', {}) + ftype = pi.get('type', 'unknown') + length = pi.get('length', 0) or 1 + if ftype == 'numeric': + rec[name] = '0' * (pi.get('digits', 0) + pi.get('decimal', 0)) + elif ftype in ('alphanumeric', 'alphabetic'): + rec[name] = ' ' * length + else: + rec[name] = '0' * length + + +def generate_records(branch_paths_with_assigns, data_fields, base_assignments=None, file_sec=None): + """生成测试数据记录。 + branch_paths_with_assigns: list of (constraints, path_assignments). 
+ base_assignments: 全局 assignments dict (用于 trace_to_root). + """ + records = [] + if branch_paths_with_assigns: + for seq, (path_cons, path_assign) in enumerate(branch_paths_with_assigns, start=1): + path_cons = _filter_stop(path_cons) + rec = make_base_record(seq, data_fields) + # Pass A: 先传播赋值(MOVE/COMPUTE/READ INTO 等),模拟到决策点前的程序状态 + if isinstance(path_assign, dict): + propagate_assignments(rec, path_assign, data_fields, file_sec=file_sec) + # Pass B: 约束覆盖(确保决策条件满足,覆盖 MOVE 带来的值) + for c in path_cons: + if len(c) == 4: + field, op, val, want = c + apply_constraint(rec, field, op, val, want, data_fields, base_assignments, path_assign) + # Pass C: 同步 REDEFINES(确保共享存储一致) + sync_redefined_fields(rec, data_fields) + # Pass D: OCCURS DEPENDING ON — 清零超范围的下标字段 + apply_occurs_depending(rec, data_fields) + + records.append(rec) + if not records: + rec = make_base_record(1, data_fields) + if base_assignments: + propagate_assignments(rec, base_assignments, data_fields, file_sec=file_sec) + records.append(rec) + return records diff --git a/cobol_testgen/grammar.lark b/cobol_testgen/grammar.lark new file mode 100644 index 0000000..e58af5a --- /dev/null +++ b/cobol_testgen/grammar.lark @@ -0,0 +1,35 @@ +start: data_div_content +data_div_content: (file_section | working_storage | linkage)* +file_section: "FILE" "SECTION" DOT fd+ +fd: "FD" NAME FD_SUFFIX data_item+ +FD_SUFFIX: /(?:"[^"]*"|'[^']*'|[^.])*\./ +working_storage: "WORKING-STORAGE" "SECTION" DOT data_item* +linkage: "LINKAGE" "SECTION" DOT data_item* +data_item: level_num (NAME | "FILLER") clause* DOT +level_num: LEVEL +clause: pic_clause | value_clause | occurs_clause | redefines_clause | usage_clause + | "SYNC" | "SYNCHRONIZED" + | "JUSTIFIED" "RIGHT"? + | "BLANK" "WHEN" "ZERO" + | "GLOBAL" | "EXTERNAL" +pic_clause: "PIC" "IS"? PICTURE_STRING +value_clause: "VALUE" "IS"? 
value_literal+ +value_literal: INT | SIGNED_NUMBER | STRING | SQSTRING + | "ZERO" | "ZEROS" | "ZEROES" + | "SPACE" | "SPACES" + | "HIGH-VALUE" | "HIGH-VALUES" + | "LOW-VALUE" | "LOW-VALUES" +SQSTRING: /'[^']*'/ +redefines_clause: "REDEFINES" NAME +occurs_clause: "OCCURS" INT "TIMES"? ("DEPENDING" "ON" NAME)? +usage_clause: USAGE_VAL +USAGE_VAL: "COMP" | "COMP-3" | "COMP-5" | "BINARY" | "PACKED-DECIMAL" | "DISPLAY" +LEVEL: /0[1-9]|[1-4][0-9]|49|77|88/ +NAME: /[A-Z][A-Z0-9-]*/ +PICTURE_STRING: /[0-9A-Z()+,\-*\/V]+/i +INT: /[0-9]+/ +DOT: /\./ +%import common.SIGNED_NUMBER +%import common.ESCAPED_STRING -> STRING +%import common.WS +%ignore WS diff --git a/cobol_testgen/models.py b/cobol_testgen/models.py new file mode 100644 index 0000000..4a06fc8 --- /dev/null +++ b/cobol_testgen/models.py @@ -0,0 +1,151 @@ +"""COBOL数据模型 — 所有层共享,无外部依赖""" + +from dataclasses import dataclass, field + + +# ── 字段定义 ── + +@dataclass +class PicInfo: + type: str = 'unknown' # "numeric" | "alphanumeric" | "alphabetic" + digits: int = 0 + decimal: int = 0 + length: int = 0 + signed: bool = False + + +@dataclass +class FieldDef: + name: str + level: int + pic: str | None = None + pic_info: PicInfo | None = None + is_filler: bool = False + occurs_count: int = 0 + occurs_depending: str | None = None + redefines: str | None = None + usage: str | None = None # "COMP" | "COMP-3" | "BINARY" | "PACKED-DECIMAL" | ... 
+ value: str | None = None + values: list[str] | None = None + is_88: bool = False + parent: str | None = None + section: str | None = None + + +# ── 分支树 ── + +class BrSeq: + def __init__(self): + self.children = [] + + def add(self, child): + self.children.append(child) + + +class BrIf: + def __init__(self, condition): + self.condition = condition + self.cond_tree = None # 由 core.py 在解析时赋值 + self.true_seq = BrSeq() + self.false_seq = BrSeq() + + +class BrEval: + def __init__(self, subject): + self.subject = subject + self.subjects = [] # ALSO 多主体: ['WS-A', 'WS-B'],空=普通模式 + self.when_list = [] + self.other_seq = BrSeq() + self.has_other = False + + +class BrPerform: + def __init__(self, perf_type, condition=None, target=None, thru=None, times=None, + varying_var=None, varying_from=None, varying_by=None): + self.perf_type = perf_type + self.condition = condition + self.target = target + self.thru = thru + self.times = times + self.varying_var = varying_var + self.varying_from = varying_from + self.varying_by = varying_by + self.body_seq = BrSeq() + + +class Assign: + """赋值节点:MOVE/COMPUTE/ADD/SUBTRACT/MULTIPLY/DIVIDE""" + def __init__(self, target: str, source_info: dict): + self.target = target + self.source_info = source_info + + +class CallNode: + """CALL 子程序调用节点(黑盒模式)""" + def __init__(self, program_name: str, using_params: list = None): + self.program_name = program_name + self.using_params = using_params or [] + # using_params: [{"name": "WS-A", "mechanism": "reference"}, ...] 
+ # mechanism: "reference" | "content" | "value" + + +# ── 条件树 ── + +class CondLeaf: + def __init__(self, field, op, value): + self.field = field + self.op = op + self.value = value + + +class CondNot: + def __init__(self, child): + self.child = child + + +class CondAnd: + def __init__(self, left, right): + self.left = left + self.right = right + + +class CondOr: + def __init__(self, left, right): + self.left = left + self.right = right + + +class GoTo: + """GO TO 节点:无条件跳转到指定段落""" + def __init__(self, target: str, body_seq: 'BrSeq' = None): + self.target = target + self.body_seq = body_seq or BrSeq() + + +class ExitNode: + """控制流退出节点:EXIT PARAGRAPH / EXIT PERFORM / EXIT SECTION / EXIT PROGRAM""" + def __init__(self, exit_type: str): + self.exit_type = exit_type + + +# ── 约束路径 ── + +Constraint = tuple # (field, op, value, want_true) +Path = list[Constraint] + + +# ── 解析错误 ── + +@dataclass +class ParseError: + line: int + message: str + severity: str = 'warning' + + +@dataclass +class ProcParseResult: + tree: BrSeq | None = None + assignments: dict = field(default_factory=dict) + errors: list[ParseError] = field(default_factory=list) + fallback_to_ai: bool = False diff --git a/cobol_testgen/output.py b/cobol_testgen/output.py new file mode 100644 index 0000000..ef8a5aa --- /dev/null +++ b/cobol_testgen/output.py @@ -0,0 +1,118 @@ +"""输出层:JSON输出(按文件分组入出力 + 工作存储区分)""" + +import json +from pathlib import Path + + +_INVERSE_OP = {'>': '<=', '<': '>=', '=': '<>', '>=': '<', '<=': '>'} + + +def _scenario_text(path_cons): + parts = [] + for c in path_cons: + if len(c) != 4: + continue + field, op, val, want = c + if op == 'not_in': + desc = f"{field} not in {val}" if want else f"{field} in {val}" + elif not want: + desc = f"{field} {_INVERSE_OP.get(op, '?' 
+ op)} {val}" + else: + desc = f"{field} {op} {val}" + parts.append(desc) + return ', '.join(parts) + + +def output_json(records, outpath, roles=None, fd_fields=None, field_to_fd=None, + open_dir=None, path_cons_list=None): + outpath.parent.mkdir(parents=True, exist_ok=True) + if not roles: + with open(outpath, 'w', encoding='utf-8') as f: + json.dump(records, f, ensure_ascii=False, indent=2) + return + + # FD direction lookup + out = [] + for i, rec in enumerate(records): + inp = {} + out_exp = {} + ws = {} + + # Group by FD + if fd_fields and field_to_fd: + for fd_name, fds_set in fd_fields.items(): + direction = (open_dir or {}).get(fd_name, '') + inp_block = {} + out_block = {} + for fname in fds_set: + if fname not in rec: + continue + r = roles.get(fname, 'unused') + val = rec[fname] + if direction in ('INPUT', 'I-O') and r in ('input', 'inout'): + inp_block[fname] = val + if direction in ('OUTPUT', 'I-O') and r in ('output', 'inout'): + out_block[fname] = val + if inp_block: + inp[fd_name] = inp_block + if out_block: + out_exp[fd_name] = out_block + + # Working-storage: not belonging to any FD + for name, val in rec.items(): + if not field_to_fd or name not in field_to_fd: + ws[name] = val + + entry = { + 'input': inp, + 'expected_output': out_exp, + 'working_storage': ws, + } + + if path_cons_list and i < len(path_cons_list): + text = _scenario_text(path_cons_list[i]) + if text: + entry['scenario'] = text + + out.append(entry) + + with open(outpath, 'w', encoding='utf-8') as f: + json.dump(out, f, ensure_ascii=False, indent=2) + + +def output_input_files(records, outdir, stem, roles, fd_fields, field_to_fd, open_dir): + """按 FD 名拆分出力入力 JSON 文件。 + 每个 INPUT / I-O 方向 FD 生成一个文件:{stem}_{fd_name}.json + 内容为路径数 × 记录,每条只含该 FD 的入力字段值。 + """ + input_fds = {} + for fd_name, fds_set in fd_fields.items(): + direction = (open_dir or {}).get(fd_name, '') + if direction not in ('INPUT', 'I-O'): + continue + has_input = any(roles.get(fname, 'unused') in ('input', 'inout') 
for fname in fds_set) + if not has_input: + continue + input_fds[fd_name] = fds_set + + if not input_fds: + return + + outdir.mkdir(parents=True, exist_ok=True) + + for fd_name, fds_set in input_fds.items(): + fd_records = [] + direction = (open_dir or {}).get(fd_name, '') + for rec in records: + fd_rec = {} + for fname in fds_set: + r = roles.get(fname, 'unused') + if direction in ('INPUT', 'I-O') and r in ('input', 'inout'): + if fname in rec: + fd_rec[fname] = rec[fname] + if fd_rec: + fd_records.append(fd_rec) + + outpath = outdir / f'{stem}_{fd_name}.json' + with open(outpath, 'w', encoding='utf-8') as f: + json.dump(fd_records, f, ensure_ascii=False, indent=2) diff --git a/cobol_testgen/prompts/parse_proc_division.txt b/cobol_testgen/prompts/parse_proc_division.txt new file mode 100644 index 0000000..4062656 --- /dev/null +++ b/cobol_testgen/prompts/parse_proc_division.txt @@ -0,0 +1,596 @@ +你是一个 COBOL 自动化测试数据生成器的核心解析模块。你的任务是将预处理的 COBOL PROCEDURE DIVISION 源码转换为结构化的 JSON 树,用于后续的路径枚举和测试数据生成。 + +## 输入格式 + +你会收到两样东西: +1. **PROCEDURE DIVISION 源码文本** — 已预处理(大写、无注释、缩进规整) +2. **DATA DIVISION 字段列表** — JSON 数组,每个字段包括 name/level/pic/pic_info 等 + +## 输出格式 + +输出一个 JSON 对象,包含两个顶级键: + +### 1. 
`assignments` (对象) +记录了 PROCEDURE DIVISION 中每个赋值语句的来源信息。键是目标字段名,值是一个对象,类型如下: + +- **move**: 变数对变数 MOVE (e.g., `MOVE WS-A TO WS-B`) + ```json + {"type": "move", "source_vars": ["WS-A"]} + ``` +- **move_literal**: 字面量/定数 MOVE (e.g., `MOVE 'HELLO' TO WS-B`, `MOVE ZERO TO WS-B`) + ```json + {"type": "move_literal", "literal": "HELLO"} + ``` +- **compute**: COMPUTE/ADD/SUBTRACT/MULTIPLY/DIVIDE + - 二元运算 (var OP const / const OP var): + ```json + {"type": "compute", "source_vars": ["WS-A"], "op": "+", "const": 5, "expr": "WS-A + 5"} + ``` + - 变数间运算 (var OP var): + ```json + {"type": "compute", "source_vars": ["WS-A", "WS-B"], "op": "+", "expr": "WS-A + WS-B"} + ``` + - 复杂表达式 (无法解析): + ```json + {"type": "compute", "source_vars": ["WS-A", "WS-B"], "op": null, "const": null, "expr": "WS-A * (WS-B + 1)"} + ``` + +### 2. `tree` (对象) +一个递归的 JSON 树,表示 PROCEDURE DIVISION 的代码结构。不要包含注释、段落标签(仅作为 PERFORM 目标引用)。 + +#### 节点类型 + +**seq**: 顺序序列(子节点列表) +```json +{"type": "seq", "children": [子节点...]} +``` + +**assign**: 赋值语句(MOVE / COMPUTE / ADD / SUBTRACT / MULTIPLY / DIVIDE) +```json +{"type": "assign", "target": "WS-STATUS", "source_info": {"type": "move_literal", "literal": "H"}} +``` +source_info 必须与 assignments 中对应条目一致。 + +**if**: 条件分支 +```json +{ + "type": "if", + "condition": "WS-AMOUNT > 1000", + "true_seq": {"type": "seq", "children": [...]}, + "false_seq": {"type": "seq", "children": [...]} +} +``` +- 如果无 ELSE,false_seq 应为 `{"type": "seq", "children": []}` +- condition 保持原始文本(不加解析) + +**eval**: EVALUATE 多路分支 +```json +{ + "type": "eval", + "subject": "WS-TYPE", + "when_list": [ + {"value": "A", "seq": {"type": "seq", "children": [...]}}, + {"value": "B", "seq": {"type": "seq", "children": [...]}} + ], + "other_seq": {"type": "seq", "children": [...]}, + "has_other": true +} +``` +- WHEN OTHER 时 has_other=true +- 无 WHEN OTHER 时 has_other=false, other_seq 为空 seq + +**call**: CALL 子程序调用 +```json +{"type": "call", "program_name": "SUBPGM", "using_params": [ + {"name": "WS-AMOUNT", 
"mechanism": "reference"}, + {"name": "WS-RESULT", "mechanism": "reference"} +]} +``` +- CALL 是顺序执行语句(不产生分支),作为 seq 的子节点放在相应位置 +- USING 参数按 COBOL 源码顺序列出 +- mechanism 取值: + - `"reference"`: BY REFERENCE(默认)— 子程序可能修改该变量 + - `"content"`: BY CONTENT — 传副本,调用方变量不会被修改 + - `"value"`: BY VALUE — 传值(仅数值/指针) + - 无 BY 子句时默认为 `"reference"` +- 字面量参数(如 `BY VALUE 100`)不包含字段名,只在 mechanism 为 `"value"` 时保留 + +**perform**: PERFORM 语句 +```json +// 段落调用: +{"type": "perform", "perf_type": "para", "target": "1000-INIT"} + +// PERFORM THRU: +{"type": "perform", "perf_type": "thru", "target": "1000-INIT", "thru": "2000-END"} + +// 内联 PERFORM UNTIL: +{"type": "perform", "perf_type": "until", "condition": "WS-COUNT > 3", + "body_seq": {"type": "seq", "children": [...]}} + +// PERFORM VARYING: +{"type": "perform", "perf_type": "varying", "condition": "WS-I > 10", + "varying_var": "WS-I", "varying_from": "1", "varying_by": "1", + "body_seq": {"type": "seq", "children": [...]}} + +// PERFORM 段落 + UNTIL: +{"type": "perform", "perf_type": "para_until", "target": "2000-HIGH", "condition": "WS-COUNT > 100"} +``` + +### 定数 (Figurative Constants) 处理规则 + +以下定数在 MOVE 时直接用作字面量(保留原值): + +| 定数 | 规则 | +|------|------| +| ZERO / ZEROS / ZEROES | `literal: "0"` | +| SPACE / SPACES | `literal: " "` | +| HIGH-VALUE / HIGH-VALUES | `literal: "HIGH-VALUE"` | +| LOW-VALUE / LOW-VALUES | `literal: "LOW-VALUE"` | +| QUOTE / QUOTES | `literal: "'"` | +| ALL literal | `literal: literal值` | + +## COBOL 语法处理规则 + +### 1. IF 语句 +``` +IF condition + statements... +[ELSE + statements...] +END-IF. +``` +- condition 可以是简单条件、复合条件(AND/OR)、带 NOT 前置 +- true_seq 为 condition 为真时执行的分支,false_seq 为条件为假时的分支 +- IF 可以和 ELSE IF 嵌套,此时结构化为嵌套 if 的 false_seq + +### 2. EVALUATE 语句 +``` +EVALUATE subject + WHEN value1 + statements... + WHEN value2 + statements... + WHEN OTHER + statements... +END-EVALUATE. +``` +- subject 是单个字段 +- value 是具体值或 OTHER +- 每个 WHEN 的 seq 是该分支下的语句序列 +- WHEN 内的 GO TO / STOP RUN 不影响结构 + +### 3. 
PERFORM 语句 + +多种形态: + +**段落调用**: +``` +PERFORM 1000-INIT +``` + +**段落范围**: +``` +PERFORM 1000-INIT THRU 2000-END +``` + +**内联 UNTIL**: +``` +PERFORM UNTIL condition + statements... +END-PERFORM +``` + +**VARYING**: +``` +PERFORM VARYING WS-I FROM 1 BY 1 UNTIL WS-I > 10 + statements... +END-PERFORM +``` + +**段落 + UNTIL**: +``` +PERFORM 2000-HIGH UNTIL WS-COUNT > 100 +``` + +### 4. 段落 (Paragraphs) + +PROCEDURE DIVISION 中的段落以标签名(后跟句点)开始、以下一个段落标签或文件末尾结束。 + +``` +PARA-NAME. + statement + statement + . +NEXT-PARA. + statement +``` + +段落标签会被 PERFORM 引用。如果代码不在任何 PERFORM 中执行(顶级流程),段落按顺序依次执行,遇到 STOP RUN / GOBACK 结束。 + +在树结构中: +- 顶级流程入口(PROCEDURE DIVISION 后的第一个段落)作为树的根 seq +- 后续每个段落对应一个独立的 seq,只有在被 PERFORM 调用时才执行 +- 段落标签本身不是节点,只作为 PERFORM 的目标引用 + +### 5. CALL 语句 + +CALL 调用子程序,参数通过 USING 传递。 + +``` +CALL 'SUBPGM' USING WS-A WS-B WS-C +CALL 'SUBPGM' USING BY REFERENCE WS-A BY CONTENT WS-B BY VALUE 100 +``` + +- CALL 是顺序执行,不产生分支 +- USING 参数按 COBOL 源码顺序列出 +- 缺省传递机制时默认为 BY REFERENCE +- 字段名参数保持原样,字面量/数值参数如 `BY VALUE 100` 不放入 using_params(因为无字段名) +- CALL 后继续执行下一条语句 + +### 6. 
赋值语句 + +| COBOL | JSON 类型 | 示例 source_info | +|-------|-----------|-----------------| +| MOVE 'HELLO' TO WS-A | move_literal | `{"type":"move_literal","literal":"HELLO"}` | +| MOVE WS-B TO WS-A | move | `{"type":"move","source_vars":["WS-B"]}` | +| MOVE ZERO TO WS-A | move_literal | `{"type":"move_literal","literal":"0"}` | +| MOVE SPACE TO WS-A | move_literal | `{"type":"move_literal","literal":" "}` | +| MOVE HIGH-VALUE TO WS-A | move_literal | `{"type":"move_literal","literal":"HIGH-VALUE"}` | +| COMPUTE WS-A = WS-B + 1 | compute (const OP var) | `{"type":"compute","source_vars":["WS-B"],"op":"+","const":1,"expr":"WS-B + 1"}` | +| COMPUTE WS-A = 2 * WS-B | compute (const OP var) | 同上,op="*" | +| COMPUTE WS-A = WS-B + WS-C | compute (var OP var) | `{"type":"compute","source_vars":["WS-B","WS-C"],"op":"+","expr":"WS-B + WS-C"}` | +| COMPUTE WS-A = (WS-B + 1) * WS-C | compute (复杂) | `{"type":"compute","source_vars":["WS-B","WS-C"],"op":null,"const":null,"expr":"(WS-B + 1) * WS-C"}` | +| ADD 5 TO WS-A | compute (const) | `{"type":"compute","source_vars":["WS-A"],"op":"+","const":5,"expr":"WS-A + 5"}` | +| SUBTRACT 3 FROM WS-A | compute (const) | `{"type":"compute","source_vars":["WS-A"],"op":"-","const":3,"expr":"WS-A - 3"}` | +| MULTIPLY 2 BY WS-A | compute (const) | `{"type":"compute","source_vars":["WS-A"],"op":"*","const":2,"expr":"WS-A * 2"}` | +| DIVIDE 4 INTO WS-A | compute (const) | `{"type":"compute","source_vars":["WS-A"],"op":"/","const":4,"expr":"WS-A / 4"}` | + +### 7. 控制流结束 + +| 语句 | 含义 | +|------|------| +| STOP RUN | 程序结束,不执行后续代码 | +| GOBACK | 返回调用者(类似 STOP RUN) | +| EXIT PROGRAM | 返回调用者 | + +这些语句不是树节点,但标记了当前段落/分支的结束。 + +### 8. 88-level 条件名 + +``` +05 CALL-TYPE PIC X(1). + 88 CALL-LOCAL VALUE 'L'. + 88 CALL-DOMESTIC VALUE 'D'. +``` + +在条件中如 `IF CALL-LOCAL`,等价于 `IF CALL-TYPE = 'L'`。条件名可替换为父字段 + 值。 + +## 输出规则总结 + +1. **assignments**: 包含所有出现的赋值语句,**不区分分支**(全局收集) +2. **tree**: 只包含结构化的 if/eval/perform/assign 节点,**不包含段落标签** +3. 注释行(* 在第7列)已被预处理移除 +4. 
每个 assign 节点必须与 assignments 中的条目一一对应 +5. condition 保持原始文本,不要解析或转换 +6. 88-level 条件在 tree.condition 中直接替换为父字段条件(如 `IF CALL-TYPE = 'L'`) +7. 赋值中的字段名、字面量保持原始值,多单词字段用连字符(如 WS-AMOUNT) + +## Few-Shot 示例 + +### 示例 1:简单 IF/ELSE +**输入:** +``` +PROCEDURE DIVISION. + IF WS-AMOUNT > 1000 + MOVE 'H' TO WS-STATUS + ELSE + MOVE 'L' TO WS-STATUS + END-IF. + STOP RUN. +``` + +**输出:** +```json +{ + "assignments": { + "WS-STATUS": {"type": "move_literal", "literal": "H"}, + "WS-STATUS": {"type": "move_literal", "literal": "L"} + }, + "tree": { + "type": "seq", + "children": [ + { + "type": "if", + "condition": "WS-AMOUNT > 1000", + "true_seq": { + "type": "seq", + "children": [ + {"type": "assign", "target": "WS-STATUS", "source_info": {"type": "move_literal", "literal": "H"}} + ] + }, + "false_seq": { + "type": "seq", + "children": [ + {"type": "assign", "target": "WS-STATUS", "source_info": {"type": "move_literal", "literal": "L"}} + ] + } + } + ] + } +} +``` + +### 示例 2:EVALUATE +**输入:** +``` +PROCEDURE DIVISION. + EVALUATE WS-TYPE + WHEN 'A' + MOVE 'TYPE-A' TO WS-MEMO + WHEN 'B' + MOVE 'TYPE-B' TO WS-MEMO + WHEN OTHER + MOVE 'OTHER' TO WS-MEMO + END-EVALUATE. + STOP RUN. 
+``` + +**输出:** +```json +{ + "assignments": { + "WS-MEMO": {"type": "move_literal", "literal": "TYPE-A"}, + "WS-MEMO": {"type": "move_literal", "literal": "TYPE-B"}, + "WS-MEMO": {"type": "move_literal", "literal": "OTHER"} + }, + "tree": { + "type": "seq", + "children": [ + { + "type": "eval", + "subject": "WS-TYPE", + "when_list": [ + {"value": "A", "seq": {"type": "seq", "children": [ + {"type": "assign", "target": "WS-MEMO", "source_info": {"type": "move_literal", "literal": "TYPE-A"}} + ]}}, + {"value": "B", "seq": {"type": "seq", "children": [ + {"type": "assign", "target": "WS-MEMO", "source_info": {"type": "move_literal", "literal": "TYPE-B"}} + ]}} + ], + "other_seq": {"type": "seq", "children": [ + {"type": "assign", "target": "WS-MEMO", "source_info": {"type": "move_literal", "literal": "OTHER"}} + ]}, + "has_other": true + } + ] + } +} +``` + +### 示例 3:嵌套 IF + PERFORM 段落 +**输入:** +``` +PROCEDURE DIVISION. + IF WS-AMOUNT > 5000 + PERFORM 2000-HIGH + ELSE + PERFORM 3000-LOW + END-IF. + STOP RUN. +2000-HIGH. + MOVE 'H' TO WS-STATUS. +3000-LOW. + MOVE 'L' TO WS-STATUS. +``` + +**输出:** +```json +{ + "assignments": { + "WS-STATUS": {"type": "move_literal", "literal": "H"}, + "WS-STATUS": {"type": "move_literal", "literal": "L"} + }, + "tree": { + "type": "seq", + "children": [ + { + "type": "if", + "condition": "WS-AMOUNT > 5000", + "true_seq": {"type": "seq", "children": [ + {"type": "perform", "perf_type": "para", "target": "2000-HIGH"} + ]}, + "false_seq": {"type": "seq", "children": [ + {"type": "perform", "perf_type": "para", "target": "3000-LOW"} + ]} + } + ] + } +} +``` + +### 示例 4:内联 PERFORM UNTIL +**输入:** +``` +PROCEDURE DIVISION. + MOVE 1 TO WS-COUNT. + PERFORM UNTIL WS-COUNT > 10 + ADD 1 TO WS-COUNT + END-PERFORM. + STOP RUN. 
+``` + +**输出:** +```json +{ + "assignments": { + "WS-COUNT": {"type": "move_literal", "literal": "1"}, + "WS-COUNT": {"type": "compute", "source_vars": ["WS-COUNT"], "op": "+", "const": 1, "expr": "WS-COUNT + 1"} + }, + "tree": { + "type": "seq", + "children": [ + {"type": "assign", "target": "WS-COUNT", "source_info": {"type": "move_literal", "literal": "1"}}, + { + "type": "perform", + "perf_type": "until", + "condition": "WS-COUNT > 10", + "body_seq": {"type": "seq", "children": [ + {"type": "assign", "target": "WS-COUNT", "source_info": {"type": "compute", "source_vars": ["WS-COUNT"], "op": "+", "const": 1, "expr": "WS-COUNT + 1"}} + ]} + } + ] + } +} +``` + +### 示例 5:PERFORM VARYING + 复合条件 +**输入:** +``` +PROCEDURE DIVISION. + MOVE 0 TO WS-TOTAL-CHARGE. + PERFORM VARYING WS-COUNT FROM 1 BY 1 UNTIL WS-COUNT > 3 + IF CALL-HOUR >= 08 AND CALL-HOUR < 22 + MOVE 'Y' TO WS-PEAK-FLAG + ELSE + MOVE 'N' TO WS-PEAK-FLAG + END-IF + END-PERFORM. + STOP RUN. +``` + +**输出:** +```json +{ + "assignments": { + "WS-TOTAL-CHARGE": {"type": "move_literal", "literal": "0"}, + "WS-PEAK-FLAG": {"type": "move_literal", "literal": "Y"}, + "WS-PEAK-FLAG": {"type": "move_literal", "literal": "N"} + }, + "tree": { + "type": "seq", + "children": [ + {"type": "assign", "target": "WS-TOTAL-CHARGE", "source_info": {"type": "move_literal", "literal": "0"}}, + { + "type": "perform", + "perf_type": "varying", + "condition": "WS-COUNT > 3", + "varying_var": "WS-COUNT", + "varying_from": "1", + "varying_by": "1", + "body_seq": {"type": "seq", "children": [ + { + "type": "if", + "condition": "CALL-HOUR >= 08 AND CALL-HOUR < 22", + "true_seq": {"type": "seq", "children": [ + {"type": "assign", "target": "WS-PEAK-FLAG", "source_info": {"type": "move_literal", "literal": "Y"}} + ]}, + "false_seq": {"type": "seq", "children": [ + {"type": "assign", "target": "WS-PEAK-FLAG", "source_info": {"type": "move_literal", "literal": "N"}} + ]} + } + ]} + } + ] + } +} +``` + +### 示例 6:88-level 条件名 +**输入:** +``` 
+PROCEDURE DIVISION. + IF CALL-LOCAL + MOVE 'L' TO WS-TYPE + END-IF. + STOP RUN. +``` +(DATA: 88 CALL-LOCAL VALUE 'L', parent field CALL-TYPE PIC X(1)) + +**输出:** +```json +{ + "assignments": { + "WS-TYPE": {"type": "move_literal", "literal": "L"} + }, + "tree": { + "type": "seq", + "children": [ + { + "type": "if", + "condition": "CALL-TYPE = 'L'", + "true_seq": {"type": "seq", "children": [ + {"type": "assign", "target": "WS-TYPE", "source_info": {"type": "move_literal", "literal": "L"}} + ]}, + "false_seq": {"type": "seq", "children": []} + } + ] + } +} +``` + +### 示例 7:CALL 子程序调用 +**输入:** +``` +PROCEDURE DIVISION. + MOVE 0 TO WS-RESULT. + IF WS-AMOUNT > 1000 + MOVE 'H' TO WS-STATUS + CALL 'CALCSUB' USING WS-AMOUNT WS-TYPE WS-RESULT + ELSE + MOVE 'L' TO WS-STATUS + CALL 'CALCSUB' USING WS-AMOUNT WS-TYPE WS-RESULT + END-IF. + STOP RUN. +``` + +**输出:** +```json +{ + "assignments": { + "WS-RESULT": {"type": "move_literal", "literal": "0"}, + "WS-STATUS": {"type": "move_literal", "literal": "H"}, + "WS-STATUS": {"type": "move_literal", "literal": "L"} + }, + "tree": { + "type": "seq", + "children": [ + {"type": "assign", "target": "WS-RESULT", "source_info": {"type": "move_literal", "literal": "0"}}, + { + "type": "if", + "condition": "WS-AMOUNT > 1000", + "true_seq": {"type": "seq", "children": [ + {"type": "assign", "target": "WS-STATUS", "source_info": {"type": "move_literal", "literal": "H"}}, + {"type": "call", "program_name": "CALCSUB", "using_params": [ + {"name": "WS-AMOUNT", "mechanism": "reference"}, + {"name": "WS-TYPE", "mechanism": "reference"}, + {"name": "WS-RESULT", "mechanism": "reference"} + ]} + ]}, + "false_seq": {"type": "seq", "children": [ + {"type": "assign", "target": "WS-STATUS", "source_info": {"type": "move_literal", "literal": "L"}}, + {"type": "call", "program_name": "CALCSUB", "using_params": [ + {"name": "WS-AMOUNT", "mechanism": "reference"}, + {"name": "WS-TYPE", "mechanism": "reference"}, + {"name": "WS-RESULT", "mechanism": 
"reference"} + ]} + ]} + } + ] + } +} +``` + +## 错误处理 + +- 无法识别的语句:跳过该行(不影响整体结构) +- 不完整的语句(如 IF 无 END-IF):尝试合理推断嵌套关系 +- 嵌套段落引用(PERFORM A THRU B):使用 perf_type "thru" +- 字段名与 88-level 名冲突:以字段定义为准 + +## 输出要求 + +- 只输出一个 JSON 对象(无多余文本、无 markdown 标记) +- JSON 必须合法(双引号、正确逗号、无尾逗号) +- assignments 中**每个赋值只记录一次**(不区分分支) +- tree 必须完整包含所有可达代码路径 +- 字段名、字面量保持原始值(不转换大小写,不移动) diff --git a/cobol_testgen/read.py b/cobol_testgen/read.py new file mode 100644 index 0000000..d0f4ef2 --- /dev/null +++ b/cobol_testgen/read.py @@ -0,0 +1,439 @@ +"""??????? + COPYBOOK + DATA DIVISION?? + PIC""" + +import re +from pathlib import Path +from lark import Lark, Transformer, v_args + +from .models import FieldDef, PicInfo + + +# 鈹€鈹€ Preprocessor 鈹€鈹€ + + +def _is_fixed_format(source: str) -> bool: + if re.search(r'>>SOURCE\s+FORMAT\s+IS\s+FREE', source, re.IGNORECASE): + return False + if re.search(r'>>SOURCE\s+FORMAT\s+IS\s+FIXED', source, re.IGNORECASE): + return True + lines = [l for l in source.splitlines() if l.strip()] + fixed_hits = 0 + free_hits = 0 + for line in lines[:10]: + if len(line) >= 72: + free_hits += 1 + elif len(line) >= 7 and line[6] in ('*', '/', '-', 'D'): + fixed_hits += 1 + return fixed_hits >= free_hits if (fixed_hits + free_hits) > 0 else True + + +def preprocess(source: str) -> str: + fixed = _is_fixed_format(source) + lines = [] + for raw_line in source.splitlines(): + line = raw_line.rstrip() + if not line: + lines.append('') + continue + if fixed: + if len(line) >= 7 and line[6] in ('*', '/'): + continue + if len(line) >= 7 and line[6] == '-': + if lines: + lines[-1] = lines[-1] + ' ' + line[7:].lstrip() + continue + if len(line) >= 7 and line[6].upper() == 'D': + continue + content = line[6:] if len(line) >= 7 else line + else: + comment_pos = line.find('*>') + if comment_pos >= 0: + line = line[:comment_pos] + line = line.strip() + if not line: + continue + content = line + lines.append(content.upper()) + return '\n'.join(lines) + + +def 
_COPYBOOK_EXTENSIONS = ['.cpy', '.cbl', '.cpb', '']


def resolve_copybooks(source: str, source_dir: str) -> str:
    """Inline the copybooks referenced by ``COPY`` statements.

    Every line that consists of a single ``COPY name [REPLACING ...]``
    statement is replaced by a marker comment line followed by the text of
    the copybook.  ``REPLACING ==old== BY ==new==`` pairs are applied to the
    copybook text case-insensitively, treating both sides as literal text.
    Lines whose copybook cannot be found are left untouched.

    Args:
        source: COBOL source text.
        source_dir: Directory that is searched for copybook files.

    Returns:
        The source with the resolvable COPY statements expanded.

    Raises:
        OSError, UnicodeDecodeError: if a located copybook cannot be read
            as UTF-8.
    """
    copy_re = re.compile(
        r"^\s*COPY\s+(\w[\w-]*)(?:\s+REPLACING\s+(.+?))?\s*\.?\s*$",
        re.IGNORECASE
    )
    pair_re = re.compile(r"==(.+?)==\s+BY\s+==(.+?)==", re.IGNORECASE)

    def _find_copybook(written):
        """Return the Path of the copybook called *written*, or None."""
        upper = written.upper()
        # Upper-cased name with the listed extensions first (the historical
        # lookup order), then the spelling used in the source, the
        # lower-cased name and upper-cased extensions, so that lookups also
        # succeed on case-sensitive file systems.
        candidates = [upper + ext for ext in _COPYBOOK_EXTENSIONS]
        for stem in (written, written.lower(), upper):
            for ext in _COPYBOOK_EXTENSIONS:
                candidates.append(stem + ext)
                candidates.append(stem + ext.upper())
        for file_name in dict.fromkeys(candidates):
            path = Path(source_dir, file_name)
            # is_file(): a directory that happens to share the name must
            # not be picked up and then fail in read_text().
            if path.is_file():
                return path
        return None

    result = []
    for line in source.split('\n'):
        m = copy_re.match(line)
        path = _find_copybook(m.group(1)) if m else None
        if path is None:
            result.append(line)
            continue

        text = path.read_text(encoding='utf-8')
        if m.group(2):
            for old, new in pair_re.findall(m.group(2)):
                replacement = new.strip()
                # A callable keeps the replacement literal; a plain string
                # would be parsed as a template (``\1``, ``\g<...>``).
                text = re.sub(
                    re.escape(old.strip()),
                    lambda _match, literal=replacement: literal,
                    text, flags=re.IGNORECASE
                )
        # Asterisk in column 7: a comment line in fixed-format source.
        result.append(f'      * COPY {m.group(1).upper()}')
        result.append(text)
    return '\n'.join(result)
def data_item(self, level_num, name, *clauses):
    """Record one data description entry in ``self._pending``.

    The clause results (dicts produced by the clause callbacks; anything
    that is not a dict is ignored) are merged, a later clause overriding an
    earlier one key by key, and turned into a raw field dict.

    A level-88 entry without PIC but with a VALUE becomes a condition name:
    quotes are stripped from its value(s), ``is_88`` is set and ``parent``
    is the most recently seen entry that is not level 88 ('' if there is
    none).  Every other entry is stored as an ordinary field or group item.

    Args:
        level_num: Level number token.
        name: Data name token.
        *clauses: Results of the clause rules for this entry.

    Returns:
        None; the entry is appended to ``self._pending``.
    """
    level = int(str(level_num))
    name = str(name)

    merged = {}
    for clause in clauses:
        if isinstance(clause, dict):
            merged.update(clause)

    pic = merged.get('pic')
    value = merged.get('value')
    values = merged.get('values')

    entry = {
        'level': level,
        'name': name,
        'pic': pic if pic else None,
        'value': value,
        'values': values,
        'is_filler': name.upper() == 'FILLER',
        'redefines': merged.get('redefines'),
        'usage': merged.get('usage'),
        'occurs': merged.get('occurs', 0),
        'occurs_depending': merged.get('depends'),
    }

    if pic is None and level == 88 and value is not None:
        entry.update({
            'pic': None,
            'value': value.strip("'").strip('"'),
            'values': [v.strip("'").strip('"') for v in values] if values else None,
            'is_88': True,
            'parent': self._last_parent or '',
        })

    self._pending.append(entry)
    # A level-88 entry can never own further condition names, so it must
    # not replace the remembered parent, even when it carried no usable
    # VALUE and was therefore stored as an ordinary entry.
    if level != 88:
        self._last_parent = name
    return None
def _expand_pic(s: str) -> str:
    """Expand repetition counts in a PICTURE string: ``X(3)`` -> ``XXX``.

    A parenthesised count repeats the character that precedes it.  A count
    with nothing before it is dropped; ``()`` and an unclosed ``(`` are
    copied through unchanged.

    Raises:
        ValueError: if the text between the parentheses is not an integer.
    """
    result = ''
    i = 0
    while i < len(s):
        if s[i] == '(':
            j = s.find(')', i)
            if j > i + 1:
                count = int(s[i + 1:j])
                if result:
                    # The character itself is already in the result.
                    result += result[-1] * (count - 1)
                i = j + 1
                continue
        result += s[i]
        i += 1
    return result


def parse_pic(pic_str: str) -> PicInfo:
    """Classify a PICTURE string and return its description as a PicInfo.

    A leading ``S`` sets ``signed`` and is removed before classification,
    which then looks at the first character of the expanded picture:

      * ``9`` or ``V``: ``numeric``; ``digits`` / ``decimal`` are the
        number of ``9`` before / after the implied decimal point ``V``;
      * ``X``: ``alphanumeric`` with ``length``;
      * ``A``: ``alphabetic`` with ``length``;
      * ``Z * $ + -``, or a picture ending in ``CR`` / ``DB``:
        ``numeric-edited``; ``digits`` counts every ``9``, ``decimal``
        counts the ``9`` after ``V`` or ``.``, ``length`` is the expanded
        length;
      * anything else: ``alphanumeric`` with ``length``.

    Attributes that a category does not set keep the values a fresh
    ``PicInfo()`` has.  An empty picture, or one with nothing left after
    the sign, returns the PicInfo without a category being assigned.
    """
    info = PicInfo()
    s = pic_str.upper().strip()
    if not s:
        return info
    if s.startswith('S'):
        info.signed = True
        s = s[1:]

    expanded = _expand_pic(s)
    if not expanded:
        # e.g. "S" on its own: nothing to classify, and indexing
        # expanded[0] below would raise IndexError.
        return info

    first = expanded[0]
    if first in ('9', 'V'):
        # 'V' first means a purely fractional item such as V99 or SV9(4).
        info.type = 'numeric'
        if 'V' in expanded:
            parts = expanded.split('V')
            info.digits = parts[0].count('9')
            info.decimal = parts[1].count('9')
        else:
            info.digits = expanded.count('9')
            info.decimal = 0
    elif first == 'X':
        info.type = 'alphanumeric'
        info.length = len(expanded)
    elif first == 'A':
        info.type = 'alphabetic'
        info.length = len(expanded)
    elif first in ('Z', '*', '$', '+', '-') or expanded.endswith(('CR', 'DB')):
        info.type = 'numeric-edited'
        # CR / DB contain neither '9', 'V' nor '.', so the counts can be
        # taken from the whole expanded picture.
        info.digits = expanded.count('9')
        if 'V' in expanded:
            info.decimal = expanded.split('V')[1].count('9')
        elif '.' in expanded:
            info.decimal = expanded.split('.')[1].count('9')
        info.length = len(expanded)
    else:
        info.type = 'alphanumeric'
        info.length = len(expanded)
    return info
def parse_file_section(source: str) -> dict:
    """Map every FD of the FILE SECTION to the names of its 01 records.

    The FILE SECTION is taken to end at the next data section header
    (WORKING-STORAGE, LOCAL-STORAGE, LINKAGE, COMMUNICATION, REPORT or
    SCREEN SECTION), at PROCEDURE DIVISION, or at the end of the text.
    Its body is cut into FD / SD entries, so the records of a sort file
    (SD) are not attributed to the FD in front of it; only FD entries are
    reported.  Matching is case-insensitive and all names are upper-cased.

    Returns:
        ``{fd_name: [record_name, ...]}``; empty if there is no FILE SECTION.
    """
    section = re.search(
        r'FILE\s+SECTION\s*\.(.*?)'
        r'(?=(?:WORKING-STORAGE|LOCAL-STORAGE|LINKAGE|COMMUNICATION|REPORT|SCREEN)'
        r'\s+SECTION|PROCEDURE\s+DIVISION|\Z)',
        source, re.DOTALL | re.IGNORECASE)
    if not section:
        return {}

    result = {}
    entries = re.split(r'\n\s*(?=(?:FD|SD)\s+)', section.group(1).strip(),
                       flags=re.IGNORECASE)
    for entry in entries:
        fd = re.match(r'FD\s+(\w[\w-]*)', entry, re.IGNORECASE)
        if not fd:
            continue
        # Level 01 may be written as "01" or "1".
        records = re.findall(r'^\s*0?1\s+(\w[\w-]*)', entry, re.MULTILINE)
        result[fd.group(1).upper()] = [rec.upper() for rec in records]
    return result


def scan_open_statements(source: str) -> dict:
    """Collect the open mode of every file named in an OPEN statement.

    Each OPEN statement is read word by word: a mode keyword (INPUT,
    OUTPUT, I-O, EXTEND) applies to the file names that follow it until the
    next mode keyword.  The statement ends at a period, at the next
    statement verb or scope terminator, or at any non-word character.  The
    phrase words WITH, NO, REWIND, REVERSED and LOCK are skipped.  Quoted
    literals are blanked first so that text such as
    ``DISPLAY 'OPEN INPUT FAILED'`` is not taken for a statement.

    Returns:
        ``{file_name: mode}`` with upper-cased names; when a file is opened
        more than once the last statement wins.
    """
    modes = ('INPUT', 'OUTPUT', 'I-O', 'EXTEND')
    phrase_words = {'WITH', 'NO', 'REWIND', 'REVERSED', 'LOCK'}
    verbs = {
        'ACCEPT', 'ADD', 'ALTER', 'CALL', 'CANCEL', 'CLOSE', 'COMPUTE',
        'CONTINUE', 'DELETE', 'DISPLAY', 'DIVIDE', 'ELSE', 'EVALUATE',
        'EXIT', 'GO', 'GOBACK', 'IF', 'INITIALIZE', 'INSPECT', 'MERGE',
        'MOVE', 'MULTIPLY', 'OPEN', 'PERFORM', 'READ', 'RELEASE', 'RETURN',
        'REWRITE', 'SEARCH', 'SET', 'SORT', 'START', 'STOP', 'STRING',
        'SUBTRACT', 'UNSTRING', 'WHEN', 'WRITE',
    }

    text = re.sub(r"'[^'\n]*'|\"[^\"\n]*\"", "''", source)
    dirs = {}
    for stmt in re.finditer(
        r'(?<![\w-])OPEN\s+(?=(?:INPUT|OUTPUT|I-O|EXTEND)(?![\w-]))([^.]*)',
        text, re.IGNORECASE
    ):
        direction = None
        for token in re.findall(r'[\w-]+|\S', stmt.group(1)):
            word = token.upper()
            if word in modes:
                direction = word
            elif word in phrase_words:
                continue
            elif (word in verbs or word.startswith('END-')
                  or not re.match(r'\w', token)):
                # The OPEN statement is over, with or without a period.
                break
            else:
                dirs[word] = direction
    return dirs