feat: complete INSPECT/SEARCH support, fix PERFORM/EVAL coverage marking

- Add INSPECT (TALLYING/REPLACING/CONVERTING) with BEFORE/AFTER INITIAL
- Add SEARCH/SEARCH ALL with element-assignment path enumeration
- Fix _mark_perform compound condition marking via evaluate_tree
- Fix EVALUATE TRUE prior_false to collect all MC/DC false sets
- Add impossible path filtering (Pass A.5) with trace-to-root conflict detection
- Fix multi-line PERFORM VARYING parsing (VARYING/FROM/BY/UNTIL on separate lines)
- Remove dead code: agents.py LLM parser (replaced by rule-based _BrParser)
- 59 unit tests passing, 5 integration programs verified
This commit is contained in:
hangshuo652
2026-06-10 22:56:22 +08:00
parent 0730045e27
commit 7ac887c776
9 changed files with 509 additions and 1005 deletions
+4 -15
View File
@@ -7,10 +7,7 @@ from pathlib import Path
# ── 配置(必须放在本地模块导入之前,避免循环导入) ──
CONFIG = {
"proc_parser": "rule", # "rule" | "ai"
"llm_generator": False, # True=启用LLM路径生成; False=纯规则引擎
}
CONFIG = {}
from .read import preprocess, extract_data_division, extract_procedure_division
from .read import resolve_copybooks, parse_data_division, parse_file_section, scan_open_statements
@@ -18,7 +15,6 @@ from .core import build_branch_tree, classify_field_roles, _init_child_names
from .cond import parse_single_condition, is_field
from .design import enum_paths, generate_records, _filter_stop
from .output import output_json, output_input_files
from . import agents
from .coverage import run_coverage, generate_coverage_index
logger = logging.getLogger(__name__)
@@ -233,14 +229,7 @@ def main():
assignments = {}
if proc_div:
if CONFIG["proc_parser"] == "ai":
try:
result = agents.parse_proc_division_ai(proc_div, fields_dict)
branch_tree, assignments = result
except NotImplementedError:
branch_tree, assignments = build_branch_tree(proc_div, fields_dict)
else:
branch_tree, assignments = build_branch_tree(proc_div, fields_dict)
branch_tree, assignments = build_branch_tree(proc_div, fields_dict)
roles = classify_field_roles(branch_tree, assignments, fields_dict,
source=preprocessed, proc_text=proc_div)
@@ -281,14 +270,14 @@ def main():
cov_result = run_coverage(branch_tree, branch_paths_with_assigns, fields_dict,
source, cov_prefix, index_relpath=index_relpath)
records = generate_records(branch_paths_with_assigns, fields_dict, assignments, file_sec=file_sec)
records, kept_path_cons = generate_records(branch_paths_with_assigns, fields_dict, assignments, file_sec=file_sec)
# 输出 JSON(完整文件)
outpath = outdir / (filepath.stem + '.json')
output_json(records, outpath, roles,
fd_fields=fd_fields, field_to_fd=field_to_fd,
open_dir=open_dir,
path_cons_list=[c for c, a in branch_paths_with_assigns])
path_cons_list=kept_path_cons)
# 输出入力 JSON(按 FD 拆分)
output_input_files(records, outdir, filepath.stem, roles,
-308
View File
@@ -1,308 +0,0 @@
"""AI智能体接口 — 基于DeepSeek的PROCEDURE DIVISION解析"""
import json
import os
import re
from pathlib import Path
from .models import BrSeq, BrIf, BrEval, BrPerform, Assign, CallNode
DEEPSEEK_API_KEY_ENV = "DEEPSEEK_API_KEY"
DEEPSEEK_BASE_URL = "https://api.deepseek.com/v1"
DEEPSEEK_MODEL = "deepseek-chat"
PROMPT_FILE = Path(__file__).parent / "prompts" / "parse_proc_division.txt"
def parse_proc_division_ai(proc_text: str, fields: list = None, spec_doc: str = ""):
    """Parse a PROCEDURE DIVISION through the DeepSeek API.

    Args:
        proc_text: PROCEDURE DIVISION source text.
        fields: Field definitions forwarded to the prompt builder.
        spec_doc: Accepted for interface compatibility; not used here.

    Returns:
        A ``(branch_tree, assignments)`` tuple built from the model's JSON.

    Raises:
        NotImplementedError: When the API key environment variable is unset
            or the response yields no usable JSON.
    """
    key = os.environ.get(DEEPSEEK_API_KEY_ENV)
    if not key:
        message = f"AI agent requires {DEEPSEEK_API_KEY_ENV} environment variable"
        raise NotImplementedError(message)

    reply = _call_llm(_build_prompt(proc_text, fields), key)
    payload = _extract_json(reply)
    if not payload:
        raise NotImplementedError("AI returned no parsable JSON")

    tree = _json_to_tree(payload.get("tree", {}))
    return tree, payload.get("assignments", {})
def _build_prompt(proc_text: str, fields: list = None) -> list[dict]:
    """Build the chat messages for the PROCEDURE DIVISION parsing request.

    The system message is the text read from PROMPT_FILE. The user message
    embeds the PROCEDURE DIVISION source and the field list serialized as
    JSON ("[]" when ``fields`` is empty or None).

    Returns:
        A two-element list of ``{"role": ..., "content": ...}`` dicts:
        the system message followed by the user message.
    """
    # The prompt template is re-read from disk on every call.
    system = PROMPT_FILE.read_text(encoding="utf-8")
    # ensure_ascii=False keeps non-ASCII field text readable in the prompt.
    fields_json = json.dumps(fields, ensure_ascii=False, indent=2) if fields else "[]"
    user = f"""## PROCEDURE DIVISION 源码
```
{proc_text}
```
## DATA DIVISION 字段列表
```json
{fields_json}
```
"""
    return [
        {"role": "system", "content": system},
        {"role": "user", "content": user},
    ]
def _call_llm(messages: list[dict], api_key: str) -> str:
    """Send ``messages`` to the DeepSeek chat-completions endpoint.

    Args:
        messages: Chat messages (``role`` / ``content`` dicts).
        api_key: API key passed to the OpenAI-compatible client.

    Returns:
        The content of the first choice, or "" when the content is empty.

    Raises:
        NotImplementedError: When the ``openai`` package cannot be imported.
            The original ImportError is attached as ``__cause__``.
    """
    try:
        # Imported lazily so the package stays optional.
        from openai import OpenAI
    except ImportError as exc:
        # Chain explicitly so the traceback shows why the import failed.
        raise NotImplementedError(
            "openai package not installed. Run: pip install openai"
        ) from exc
    client = OpenAI(api_key=api_key, base_url=DEEPSEEK_BASE_URL)
    response = client.chat.completions.create(
        model=DEEPSEEK_MODEL,
        messages=messages,
        temperature=0.1,
        max_tokens=8192,
    )
    return response.choices[0].message.content or ""
def _extract_json(text: str) -> dict | None:
stripped = text.strip()
# Try extracting from markdown code block first
m = re.search(r"```(?:json)?\s*\n?(.*?)\n?```", stripped, re.DOTALL)
if m:
stripped = m.group(1).strip()
try:
return json.loads(stripped)
except json.JSONDecodeError:
return None
def _json_to_tree(data: dict):
    """Convert a JSON node description into the matching tree node.

    The ``type`` key selects the node class ("seq" when absent). Child
    sequences that are missing from ``data`` default to an empty sequence.
    Unknown types yield None.
    """
    def _subtree(key):
        # Missing branches are materialized as empty sequences.
        return _json_to_tree(data.get(key, {"type": "seq", "children": []}))

    kind = data.get("type", "seq")

    if kind == "seq":
        seq_node = BrSeq()
        for raw_child in data.get("children", []):
            built = _json_to_tree(raw_child)
            if built is not None:
                seq_node.add(built)
        return seq_node

    if kind == "if":
        if_node = BrIf(data.get("condition", ""))
        if_node.true_seq = _subtree("true_seq")
        if_node.false_seq = _subtree("false_seq")
        return if_node

    if kind == "eval":
        eval_node = BrEval(data.get("subject", ""))
        for when in data.get("when_list", []):
            when_value = when.get("value", "")
            when_body = _json_to_tree(when.get("seq", {"type": "seq", "children": []}))
            eval_node.when_list.append((when_value, when_body))
        eval_node.other_seq = _subtree("other_seq")
        eval_node.has_other = data.get("has_other", False)
        return eval_node

    if kind == "perform":
        optional_keys = ("condition", "target", "thru", "times",
                         "varying_var", "varying_from", "varying_by")
        kwargs = {"perf_type": data.get("perf_type", "para")}
        # Only keys actually present in the JSON are forwarded.
        kwargs.update({name: data[name] for name in optional_keys if name in data})
        perf_node = BrPerform(**kwargs)
        if "body_seq" in data:
            perf_node.body_seq = _json_to_tree(data["body_seq"])
        return perf_node

    if kind == "assign":
        return Assign(
            target=data.get("target", ""),
            source_info=data.get("source_info", {}),
        )

    if kind == "call":
        return CallNode(
            program_name=data.get("program_name", ""),
            using_params=data.get("using_params", []),
        )

    return None
# ── LLM 路径生成 ──
def llm_generate_all_paths(tree_root, fields) -> list | None:
    """Ask the LLM for MC/DC paths covering the whole control-flow tree.

    Returns:
        ``[(constraints, assignments), ...]`` on success, or None when the
        API key is unset, the tree serializes to nothing, the reply has no
        ``paths`` entry, or the request/parsing raises.
    """
    # Function-scope import: this module defines no module-level logger.
    import logging

    api_key = os.environ.get(DEEPSEEK_API_KEY_ENV)
    if not api_key:
        return None
    tree_json = _serialize_tree_for_llm(tree_root)
    if tree_json is None:
        return None
    level88_map = _extract_88_mapping(fields)
    messages = _build_path_prompt(tree_json, fields, level88_map)
    try:
        response = _call_llm(messages, api_key)
        data = _extract_json(response)
        if data and "paths" in data:
            return _parse_llm_paths(data["paths"])
    except Exception:
        # Still best-effort (the caller gets None), but the failure is
        # recorded instead of being silently discarded.
        logging.getLogger(__name__).warning(
            "LLM path generation failed", exc_info=True
        )
    return None
def _serialize_tree_for_llm(node):
    """Serialize a control-flow tree node into a JSON-ready dict.

    Only BrSeq, BrIf, BrEval and BrPerform nodes are emitted. Every other
    node type, None, and a sequence with no serializable children all
    produce None.
    """
    if node is None:
        return None
    from .models import BrSeq, BrIf, BrEval, BrPerform, Assign, CallNode, ExitNode, GoTo
    if isinstance(node, BrSeq):
        children = []
        for child in node.children:
            s = _serialize_tree_for_llm(child)
            if s is not None:
                children.append(s)
        # A sequence without any serializable child is dropped entirely.
        return {"type": "seq", "children": children} if children else None
    if isinstance(node, BrIf):
        # Branches that serialize to None are replaced by an empty sequence.
        return {
            "type": "if",
            "condition": node.condition,
            "true_seq": _serialize_tree_for_llm(node.true_seq) or {"type": "seq", "children": []},
            "false_seq": _serialize_tree_for_llm(node.false_seq) or {"type": "seq", "children": []},
        }
    if isinstance(node, BrEval):
        when_list = []
        for val, seq in node.when_list:
            s = _serialize_tree_for_llm(seq)
            when_list.append({"value": val, "seq": s or {"type": "seq", "children": []}})
        return {
            "type": "eval",
            "subject": node.subject,
            "when_list": when_list,
            "other_seq": _serialize_tree_for_llm(node.other_seq) or {"type": "seq", "children": []},
            "has_other": node.has_other,
        }
    if isinstance(node, BrPerform):
        result = {"type": "perform", "perf_type": node.perf_type}
        # Optional attributes are copied only when set (not None).
        for attr in ("condition", "target", "thru", "times",
                     "varying_var", "varying_from", "varying_by"):
            val = getattr(node, attr, None)
            if val is not None:
                result[attr] = val
        if node.body_seq:
            bs = _serialize_tree_for_llm(node.body_seq)
            if bs:
                result["body_seq"] = bs
        return result
    # Assign / CallNode / ExitNode / GoTo do not affect path generation, so they are omitted.
    return None
def _extract_88_mapping(fields):
mapping = {}
for f in fields:
if f.get('is_88'):
mapping[f['name']] = {
"parent": f['parent'],
"value": f['value'],
"pic_info": f.get('pic_info', {}),
}
return mapping
def _build_path_prompt(tree_json, fields, level88_map):
    """Build the chat messages asking the LLM for an MC/DC path set.

    The user message contains the serialized control-flow tree, a reduced
    view of the field definitions, the generation rules and the expected
    output format.

    ``level88_map`` is accepted but not referenced in this body; level-88
    details are taken from ``fields`` instead.

    Returns:
        A two-element list of ``{"role", "content"}`` dicts (system, user).
    """
    # NOTE(review): several prompt lines below open a full-width parenthesis
    # that is never closed; confirm against the intended prompt text.
    system = ("你是 COBOL 测试路径生成专家。"
              "请为给定的控制流树生成满足 MC/DC 覆盖的测试路径集。"
              "只输出 JSON,不要多余文字。")
    # Keep only the keys read below to shrink the prompt.
    reduced_fields = []
    for f in fields:
        entry = {"name": f["name"], "pic": f.get("pic", "")}
        pi = f.get("pic_info", {})
        if pi:
            entry["pic_info"] = {
                "type": pi.get("type"), "digits": pi.get("digits"),
                "decimal": pi.get("decimal"), "length": pi.get("length"),
            }
        if f.get("is_88"):
            entry["is_88"] = True
            entry["value"] = f.get("value")
            entry["parent"] = f.get("parent")
        reduced_fields.append(entry)
    user = (
        "## 控制流树(JSON\n\n"
        f"```json\n{json.dumps(tree_json, ensure_ascii=False, indent=2)}\n```\n\n"
        "## 字段定义\n\n"
        f"```json\n{json.dumps(reduced_fields, ensure_ascii=False, indent=2)}\n```\n\n"
        "## 要求\n"
        "1. 每个 IF/EVALUATE/PERFORM UNTIL 的每个分支至少被覆盖一次\n"
        "2. 复合条件(AND/OR/NOT)需要满足 MC/DC:每个叶条件的独立影响对\n"
        "3. 路径数尽量少(最小集优先)\n"
        "4. 88-level 条件名要展开为实际字段比较(如 CUST-VIP → WS-CUST-LEVEL='V'\n"
        "5. 同一路径中的约束不能自相矛盾(同一字段不能同时等于 'A' 和等于 'B'\n"
        "6. 数值边界值合理(>5000 → 5001, <100 → 99\n"
        "7. AND 优先级高于 OR\n\n"
        "## 输出格式\n\n"
        "```json\n"
        "{\n"
        '  "paths": [\n'
        "    {\n"
        '      "constraints": [\n'
        '        {"field": "WS-AMOUNT", "op": ">", "value": "5000", "want_true": true}\n'
        "      ],\n"
        '      "assignments": {}\n'
        "    }\n"
        "  ]\n"
        "}\n"
        "```"
    )
    return [
        {"role": "system", "content": system},
        {"role": "user", "content": user},
    ]
def _parse_llm_paths(paths_data):
result = []
for p in paths_data:
constraints = []
for c in p.get("constraints", []):
constraints.append((c["field"], c["op"], str(c["value"]), c["want_true"]))
assignments = p.get("assignments", {})
result.append((constraints, assignments))
return result
def resolve_constraints_ai(paths, fields=None, assignments=None):
    """AI-backed constraint resolution (planned; always raises for now)."""
    message = "AI agent not yet implemented"
    raise NotImplementedError(message)
def enhance_metadata_ai(records, fields=None, spec_doc: str = ""):
    """AI-backed test-case metadata generation (planned; always raises for now)."""
    message = "AI agent not yet implemented"
    raise NotImplementedError(message)
def analyze_spec_ai(spec_doc: str = ""):
    """AI-backed specification-document analysis (planned; always raises for now)."""
    message = "AI agent not yet implemented"
    raise NotImplementedError(message)
+197 -13
View File
@@ -3,7 +3,7 @@
import re
import logging
from datetime import datetime
from .models import BrSeq, BrIf, BrEval, BrPerform, BrSeq, CondLeaf, CondNot, ParseError, Assign, CallNode, ExitNode, GoTo
from .models import BrSeq, BrIf, BrEval, BrPerform, BrSearch, BrSeq, CondLeaf, CondNot, ParseError, Assign, CallNode, ExitNode, GoTo
from .cond import parse_compound_condition, parse_single_condition, collect_leaves
logger = logging.getLogger(__name__)
@@ -12,6 +12,7 @@ logger = logging.getLogger(__name__)
_COBOL_SCOPE_ENDERS = {
'END-IF', 'END-EVALUATE', 'END-PERFORM', 'END-EXEC', 'END-CALL',
'END-READ', 'END-WRITE', 'END-DELETE', 'END-REWRITE', 'END-START',
'END-SEARCH',
'ELSE', 'WHEN', 'OTHER',
}
@@ -22,22 +23,26 @@ def scan_paragraphs(raw_lines):
while i < len(raw_lines):
line = raw_lines[i].strip()
m = re.match(r'^([A-Z0-9][A-Z0-9-]*)\.\s*$', line)
sec_m = re.match(r'^([A-Z][A-Z0-9-]*)\s+SECTION\.?\s*$', line, re.IGNORECASE)
if m and m.group(1) not in _COBOL_SCOPE_ENDERS:
name = m.group(1)
start = i + 1
j = i + 1
while j < len(raw_lines):
nline = raw_lines[j].strip()
nm = re.match(r'^([A-Z0-9][A-Z0-9-]*)\.\s*$', nline)
if nm and nm.group(1) not in _COBOL_SCOPE_ENDERS:
break
if re.match(r'^[A-Z][A-Z0-9-]*\s+SECTION\.\s*$', nline, re.IGNORECASE):
break
j += 1
paragraphs[name] = (start, j - 1)
i = j
elif sec_m:
name = sec_m.group(1).upper()
else:
i += 1
continue
start = i + 1
j = i + 1
while j < len(raw_lines):
nline = raw_lines[j].strip()
nm = re.match(r'^([A-Z0-9][A-Z0-9-]*)\.\s*$', nline)
if nm and nm.group(1) not in _COBOL_SCOPE_ENDERS:
break
if re.match(r'^[A-Z][A-Z0-9-]*\s+SECTION\.\s*$', nline, re.IGNORECASE):
break
j += 1
paragraphs[name] = (start, j - 1)
i = j
return paragraphs
@@ -160,6 +165,10 @@ class _BrParser:
if perf_node:
seq.add(perf_node)
continue
m_search = re.match(r'^SEARCH\b(?:\s+(ALL))?\s+(\w[\w-]*)(?:\s+VARYING\s+(\w[\w-]*))?', line, re.IGNORECASE)
if m_search:
seq.add(self._parse_search(m_search))
continue
m = re.match(r'^INITIALIZE\s+', line)
if m:
init_seq = self._parse_initialize()
@@ -229,6 +238,15 @@ class _BrParser:
if m_set:
seq.add(self._parse_set_true(m_set.group(1)))
continue
m_insp = re.match(r'^INSPECT\s+', line, re.IGNORECASE)
if m_insp:
info = self._parse_inspect(line)
if info:
tgt = info.get('tgt', '')
self.assignments.setdefault(tgt, []).append(info)
seq.add(Assign(tgt, info))
self.advance()
continue
assign_node = self._record_assignment(line)
if assign_node:
seq.add(assign_node)
@@ -243,6 +261,81 @@ class _BrParser:
return True
return False
# ── INSPECT ──
_PIC_FIG_CONV = {'ZERO': '0', 'ZEROS': '0', 'ZEROES': '0',
'SPACE': ' ', 'SPACES': ' '}
@staticmethod
def _expand_figurative(val):
    """Return the character for a figurative constant, or ``val`` unchanged.

    The lookup in ``_PIC_FIG_CONV`` is case-insensitive; anything not listed
    there is returned exactly as given.
    """
    return _BrParser._PIC_FIG_CONV.get(val.upper(), val)
def _parse_inspect_phrase(self, phrase):
m = re.match(
r'TALLYING\s+(\w[\w-]*)\s+FOR\s+'
r'(LEADING|TRAILING|CHARACTERS)'
r'(?:\s+([\'"])(.*?)\3)?'
r'(?:\s+(BEFORE|AFTER)\s+INITIAL\s+([\'"])(.*?)\6)?\s*$',
phrase, re.IGNORECASE
)
if m:
return ('tally', {
'count_var': m.group(1).upper(),
'kind': m.group(2).upper(),
'char': self._expand_figurative(m.group(4) or ''),
'before_after': (m.group(5) or '').upper(),
'delimiter': self._expand_figurative(m.group(7) or ''),
})
m = re.match(
r'REPLACING\s+'
r'(ALL|LEADING|FIRST|CHARACTERS)\s+'
r'([\'"])(.*?)\2\s+BY\s+'
r'([\'"])(.*?)\4'
r'(?:\s+(BEFORE|AFTER)\s+INITIAL\s+([\'"])(.*?)\7)?\s*$',
phrase, re.IGNORECASE
)
if m:
return ('replace', {
'kind': m.group(1).upper(),
'src': self._expand_figurative(m.group(3)),
'dst': self._expand_figurative(m.group(5)),
'before_after': (m.group(6) or '').upper(),
'delimiter': self._expand_figurative(m.group(8) or ''),
})
m = re.match(
r'CONVERTING\s+([\'"])(.*?)\1\s+TO\s+([\'"])(.*?)\3\s*$',
phrase, re.IGNORECASE
)
if m:
return ('convert', {
'from_chars': self._expand_figurative(m.group(2)),
'to_chars': self._expand_figurative(m.group(4)),
})
return None
def _parse_inspect(self, line):
m = re.match(r'^INSPECT\s+(\w[\w-]*)\s+(.+)$', line, re.IGNORECASE)
if not m:
return None
tgt = m.group(1).upper()
rest = m.group(2).strip()
phrases = re.split(r'\s+(?=(?:TALLYING|REPLACING|CONVERTING)\b)', rest, flags=re.IGNORECASE)
sub_ops = []
for phrase in phrases:
sub = self._parse_inspect_phrase(phrase.strip())
if sub:
sub_ops.append(sub)
if not sub_ops:
return None
return {
'type': 'inspect',
'tgt': tgt,
'source_vars': [tgt],
'sub_ops': sub_ops,
}
def _record_assignment(self, line):
if self.assignments is None:
return None
@@ -503,6 +596,44 @@ class _BrParser:
vars_in = re.findall(r'[A-Z][A-Z0-9-]*', expr.upper())
return {'type': 'compute', 'source_vars': list(set(vars_in)), 'op': None, 'const': None, 'expr': expr}
# ── SEARCH / SEARCH ALL ──
def _parse_search(self, m):
    """Parse a SEARCH / SEARCH ALL statement starting at the current line.

    Args:
        m: Match of the SEARCH header; group 1 is ``ALL`` (optional),
            group 2 the table name, group 3 the VARYING index (optional).

    Returns:
        A BrSearch node with its WHEN branches (``when_list`` plus the
        parallel ``cond_trees``) and, when present, the AT END sequence.
        Parsing stops at END-SEARCH or at the end of the input lines.
    """
    def _is_end_search(text):
        # The terminator may carry the sentence-ending period; the nested
        # end checks must accept both spellings, like the main loop.
        return text.upper() in ('END-SEARCH', 'END-SEARCH.')

    def _ends_at_end_body(text):
        # AT END statements run until the next WHEN or END-SEARCH.
        return bool(re.match(r'^WHEN\b', text, re.IGNORECASE)) or _is_end_search(text)

    def _ends_when_body(text):
        # A WHEN body runs until the next WHEN, AT END or END-SEARCH.
        return (bool(re.match(r'^(WHEN|AT\s+END)\b', text, re.IGNORECASE))
                or _is_end_search(text))

    is_all = bool(m.group(1))
    table = m.group(2).upper()
    varying = m.group(3).upper() if m.group(3) else None
    node = BrSearch(table, is_all=is_all, varying=varying)
    self.advance()
    while self.pos < len(self.lines):
        line = self.clean()
        if _is_end_search(line):
            self.advance()
            return node
        m_at = re.match(r'^AT\s+END(.+)?$', line, re.IGNORECASE)
        if m_at:
            self.advance()
            rest = m_at.group(1)
            if rest and rest.strip():
                # A statement on the same line as AT END is re-queued as
                # its own line so parse_seq can consume it.
                self.lines.insert(self.pos, rest.strip())
            node.at_end_seq = self.parse_seq(end_check=_ends_at_end_body)
            node.has_at_end = True
            continue
        m_when = re.match(r'^WHEN\s+(.+?)\s*$', line, re.IGNORECASE)
        if m_when:
            cond_upper = m_when.group(1).strip()
            self.advance()
            cond_tree = parse_compound_condition(cond_upper, self.fields)
            body_seq = self.parse_seq(end_check=_ends_when_body)
            node.when_list.append((cond_upper, body_seq))
            # cond_trees stays index-aligned with when_list.
            node.cond_trees.append(cond_tree)
            continue
        # Anything else inside the SEARCH header area is skipped.
        self.advance()
    return node
def _parse_if(self):
line = self.clean()
m = re.match(r'^IF\s+(.+?)(?:THEN)?\s*$', line)
@@ -1039,6 +1170,18 @@ def _resolve_subscript(key, rec):
return key
def _apply_before_after(val, before_after, delimiter):
if not delimiter:
return val
if before_after == 'BEFORE':
idx = val.find(delimiter)
return val[:idx] if idx >= 0 else val
if before_after == 'AFTER':
idx = val.find(delimiter)
return val[idx + len(delimiter):] if idx >= 0 else ''
return val
def propagate_assignments(rec, assignments, fields, file_sec=None):
def raw_to_float(val, pi):
if pi.get('type') == 'numeric':
@@ -1233,6 +1376,47 @@ def propagate_assignments(rec, assignments, fields, file_sec=None):
if all_found:
rec[resolved_tgt] = float_to_raw(total, pi_tgt)
# Pass 4.5: INSPECT
for tgt, asgn in flat_list:
if asgn['type'] != 'inspect':
continue
resolved_tgt = _resolve_subscript(tgt, rec)
if resolved_tgt not in rec:
continue
src_val = str(rec[resolved_tgt])
for op_type, params in asgn.get('sub_ops', []):
if op_type == 'tally':
cv = params['count_var'].upper()
cv_pi = pi_map.get(cv, {})
effective = _apply_before_after(src_val, params.get('before_after'), params.get('delimiter'))
cnt = 0
if params['kind'] == 'LEADING':
cnt = len(effective) - len(effective.lstrip(params['char']))
elif params['kind'] == 'TRAILING':
cnt = len(effective) - len(effective.rstrip(params['char']))
else:
cnt = len(effective)
if cv_pi.get('type') == 'numeric':
rec[cv] = float_to_raw(float(cnt), cv_pi)
elif op_type == 'replace':
effective = _apply_before_after(src_val, params.get('before_after'), params.get('delimiter'))
if params['kind'] == 'ALL':
new_val = effective.replace(params['src'], params['dst'])
elif params['kind'] == 'LEADING':
new_val = effective
while new_val.startswith(params['src']):
new_val = new_val[len(params['src']):]
new_val = effective.replace(params['src'], params['dst'], 1)
elif params['kind'] == 'FIRST':
new_val = effective.replace(params['src'], params['dst'], 1)
else:
new_val = params['dst'] * len(effective)
rec[resolved_tgt] = new_val
elif op_type == 'convert':
effective = _apply_before_after(src_val, params.get('before_after'), params.get('delimiter'))
table = str.maketrans(params['from_chars'], params['to_chars'])
rec[resolved_tgt] = effective.translate(table)
# Pass 5: STRING / UNSTRING
for tgt, asgn in flat_list:
if asgn['type'] == 'string_concat':
+97 -11
View File
@@ -6,7 +6,7 @@ from dataclasses import dataclass, field
from pathlib import Path
logger = logging.getLogger(__name__)
from .models import BrSeq, BrIf, BrEval, BrPerform, CondLeaf
from .models import BrSeq, BrIf, BrEval, BrPerform, BrSearch, CondLeaf
from .cond import parse_single_condition, parse_compound_condition, is_field, collect_leaves, evaluate_tree
@@ -83,6 +83,26 @@ def collect_decision_points(node, fields, counter=None):
p, l = _walk_collect(node.other_seq, fields, counter)
points.extend(p); all_leaves.extend(l)
elif isinstance(node, BrSearch):
counter[0] += 1
branch_names = []
for cond_text, seq in node.when_list:
branch_names.append(f'WHEN {cond_text[:40]}')
if node.has_at_end:
branch_names.append('AT END')
dp = DecisionPoint(id=counter[0], kind='SEARCH',
label=node.table_name, branch_names=branch_names)
dp.when_list = node.when_list
dp.cond_trees = node.cond_trees
dp.has_other = node.has_at_end
points.append(dp)
for cond_text, seq in node.when_list:
p, l = _walk_collect(seq, fields, counter)
points.extend(p); all_leaves.extend(l)
if node.has_at_end:
p, l = _walk_collect(node.at_end_seq, fields, counter)
points.extend(p); all_leaves.extend(l)
elif isinstance(node, BrPerform):
if node.perf_type in ('until', 'para_until', 'varying', 'para_varying'):
counter[0] += 1
@@ -92,6 +112,13 @@ def collect_decision_points(node, fields, counter=None):
simple = parse_single_condition(node.condition) if node.condition else None
if simple and is_field(simple[0], fields):
dp.parsed = simple
elif node.condition:
cond_tree = parse_compound_condition(node.condition, fields)
if cond_tree:
leaves = collect_leaves(cond_tree)
if leaves:
dp.cond_tree = cond_tree
dp.cond_leaves = list(leaves)
points.append(dp)
p, l = _walk_collect(node.body_seq, fields, counter)
points.extend(p); all_leaves.extend(l)
@@ -116,9 +143,11 @@ def mark_coverage(decision_points, leaf_stats, branch_paths, fields):
if dp.kind == 'IF':
_mark_if(dp, cons)
elif dp.kind == 'EVALUATE':
_mark_eval(dp, cons)
_mark_eval(dp, cons, fields)
elif dp.kind == 'PERFORM':
_mark_perform(dp, cons)
elif dp.kind == 'SEARCH':
_mark_search(dp, cons, fields)
for leaf in leaf_stats:
for c in cons:
if _match_leaf(c, leaf):
@@ -128,7 +157,7 @@ def mark_coverage(decision_points, leaf_stats, branch_paths, fields):
leaf.covered_false = True
for dp in decision_points:
_infer_implied(dp)
dp.implied_branches = set(dp.active_branches)
def _match_constraint(c, parsed):
@@ -180,18 +209,20 @@ def _mark_if(dp, cons):
dp.active_branches.add('T' if c[3] else 'F')
def _mark_eval(dp, cons):
def _mark_eval(dp, cons, fields=None):
if dp.label == 'TRUE':
matched = False
for when_val, _ in dp.when_list:
parsed = parse_single_condition(when_val)
parsed = parse_single_condition(when_val, fields)
if parsed:
for c in cons:
if _match_constraint(c, parsed):
if _match_constraint(c, parsed) and c[3]:
name = f"WHEN {when_val}"
if name in dp.branch_names:
dp.active_branches.add(name)
matched = True
else:
cond_tree = parse_compound_condition(when_val)
cond_tree = parse_compound_condition(when_val, fields)
if cond_tree and not isinstance(cond_tree, CondLeaf):
leaves = list(collect_leaves(cond_tree))
assignment = {}
@@ -205,6 +236,15 @@ def _mark_eval(dp, cons):
name = f"WHEN {when_val}"
if name in dp.branch_names:
dp.active_branches.add(name)
matched = True
if not matched and 'OTHER' in dp.branch_names:
when_fields = set()
for when_val, _ in dp.when_list:
for c in cons:
if c[0] in when_val:
when_fields.add(c[0])
if when_fields:
dp.active_branches.add('OTHER')
return
for c in cons:
if c[0] == dp.label and c[1] == '=':
@@ -215,6 +255,44 @@ def _mark_eval(dp, cons):
dp.active_branches.add('OTHER')
def _mark_search(dp, cons, fields=None):
    """Mark which SEARCH branches the constraint set ``cons`` activates.

    A WHEN branch is marked when its condition evaluates true under the
    4-tuple constraints in ``cons``; field names are compared with any
    trailing ``(...)`` subscript removed. When the decision point has an
    AT END branch (``dp.has_other``) and no WHEN branch matched, AT END is
    marked instead. ``fields`` is accepted but not used in this body.
    """
    # One flag per entry of dp.branch_names.
    branch_masks = [False] * len(dp.branch_names)
    for i, (cond_text, body_seq) in enumerate(dp.when_list):
        cond_tree = dp.cond_trees[i] if i < len(dp.cond_trees) else None
        if not cond_tree:
            continue
        if isinstance(cond_tree, CondLeaf):
            # Simple condition: needs a constraint with the same
            # field/op/value that is wanted true.
            for c in cons:
                if len(c) == 4:
                    base_c = re.sub(r'\s*\(.*?\)\s*$', '', c[0])
                    base_cond = re.sub(r'\s*\(.*?\)\s*$', '', cond_tree.field)
                    if base_c == base_cond and c[1] == cond_tree.op \
                            and str(c[2]) == str(cond_tree.value) and c[3]:
                        branch_masks[i] = True
                        break
        else:
            # Compound condition: collect a truth value per leaf, then
            # evaluate the tree only if every leaf was resolved.
            leaves = list(collect_leaves(cond_tree))
            assignment = {}
            for leaf in leaves:
                for c in cons:
                    if len(c) == 4:
                        base_c = re.sub(r'\s*\(.*?\)\s*$', '', c[0])
                        base_l = re.sub(r'\s*\(.*?\)\s*$', '', leaf.field)
                        if base_c == base_l and c[1] == leaf.op and str(c[2]) == str(leaf.value):
                            assignment[leaf] = c[3]
                            break
            if len(assignment) == len(leaves):
                if evaluate_tree(cond_tree, assignment):
                    branch_masks[i] = True
    if dp.has_other:
        # The last branch name is treated as AT END here.
        at_end_idx = len(dp.branch_names) - 1
        if not any(branch_masks[:at_end_idx]):
            branch_masks[at_end_idx] = True
    for i, m in enumerate(branch_masks):
        if m:
            dp.active_branches.add(dp.branch_names[i])
def _mark_perform(dp, cons):
simple = getattr(dp, 'parsed', None)
if simple:
@@ -224,6 +302,18 @@ def _mark_perform(dp, cons):
dp.active_branches.add('Skip')
else:
dp.active_branches.add('Enter')
elif dp.cond_tree and dp.cond_leaves:
assignment = {}
for leaf in dp.cond_leaves:
for c in cons:
if _match_leaf(c, leaf):
assignment[leaf] = c[3]
break
if len(assignment) == len(dp.cond_leaves):
if evaluate_tree(dp.cond_tree, assignment):
dp.active_branches.add('Skip')
else:
dp.active_branches.add('Enter')
else:
for c in cons:
if c[0] == dp.label or any(c[0] == f for f in _get_fields_in_cond(dp.label)):
@@ -237,10 +327,6 @@ def _get_fields_in_cond(cond_text):
return re.findall(r'[A-Z][A-Z0-9-]*', cond_text.upper())
def _infer_implied(dp):
dp.implied_branches.update(dp.active_branches)
# ── 行号定位(基于原始源文本)──
def locate_decision_lines(decision_points, raw_source):
+180 -61
View File
@@ -1,20 +1,15 @@
"""设计层:路径枚举 + 值生成 + 约束应用"""
import os
import re
import logging
from . import agents, CONFIG
from .models import BrSeq, BrIf, BrEval, BrPerform, Assign, CallNode, CondNot, CondLeaf, ExitNode, GoTo
from .models import BrSeq, BrIf, BrEval, BrPerform, BrSearch, Assign, CallNode, CondNot, CondLeaf, ExitNode, GoTo
from .cond import parse_single_condition, parse_compound_condition, is_field, collect_leaves, mcdc_sets, satisfying_value
from .core import trace_to_root, invert_through_chain, propagate_assignments, _basename
logger = logging.getLogger(__name__)
_STOP = ('__STOP__', '', None, True)
_MAX_PATHS = 5000
_FALLBACK_MAX_PATHS = 100
_ACTIVE_MAX_PATHS = _MAX_PATHS
_LLM_FAILED = False
_MAX_PATHS = 10000
def _filter_stop(cons):
@@ -22,46 +17,51 @@ def _filter_stop(cons):
def _cap_paths(paths):
if len(paths) > _ACTIVE_MAX_PATHS:
return paths[:_ACTIVE_MAX_PATHS]
if len(paths) > _MAX_PATHS:
return paths[:_MAX_PATHS]
return paths
def _cap_paths_fair(new_active, child_paths):
    """Truncate ``new_active`` to ``_MAX_PATHS`` entries in two fair phases.

    Paths containing the ``_STOP`` marker are kept first. The remaining
    paths are treated as consecutive groups of ``len(child_paths)`` entries:
    phase 1 takes one entry per group, rotating the position taken within
    the group; phase 2 fills what is left of the quota with the entries not
    yet taken, in their original order.
    """
    limit = _MAX_PATHS
    if len(new_active) <= limit:
        return new_active
    fanout = len(child_paths)
    if fanout <= 1:
        return new_active[:limit]

    # STOP paths are not part of the combinations and are kept as-is.
    kept, pool = [], []
    for cons, assigns in new_active:
        bucket = kept if any(c is _STOP for c in cons) else pool
        bucket.append((cons, assigns))

    group_count = len(pool) // fanout
    if group_count <= 1:
        kept.extend(pool[:limit - len(kept)])
        return kept[:limit]

    # Phase 1: one entry per group, rotating the position inside the group.
    first_pass = min(group_count, limit - len(kept))
    picked = [g * fanout + g % fanout for g in range(first_pass)]
    kept.extend(pool[pos] for pos in picked)
    if len(kept) >= limit:
        return kept[:limit]

    # Phase 2: spend the remaining budget on entries not taken in phase 1.
    budget = limit - len(kept)
    taken = set(picked)
    leftovers = [item for pos, item in enumerate(pool) if pos not in taken]
    kept.extend(leftovers[:budget])
    return kept[:limit]
# ── 路径枚举 ──
def _try_llm_enum_paths(node, fields):
    """Try LLM-based path generation; return None to fall back to the rule engine.

    None is returned when the circuit breaker has already tripped, when
    ``llm_generator`` is disabled in CONFIG, when the API key environment
    variable is unset, when the LLM returns None, or when the call raises.
    An exception also sets the module-level ``_LLM_FAILED`` flag so later
    calls skip the LLM.
    """
    global _LLM_FAILED
    # Circuit breaker: once an LLM call has raised, do not try again.
    if _LLM_FAILED:
        logger.debug("断路器已跳,跳过 LLM")
        return None
    # A missing key counts as enabled here (default True).
    if not CONFIG.get("llm_generator", True):
        logger.debug("llm_generator 已关闭,降级规则引擎")
        return None
    if not os.environ.get(agents.DEEPSEEK_API_KEY_ENV):
        logger.warning("DEEPSEEK_API_KEY 未设置,降级规则引擎")
        return None
    try:
        result = agents.llm_generate_all_paths(node, fields)
        if result is not None:
            logger.info(f"LLM 路径生成成功,{len(result)}")
            return result
        logger.warning("LLM 返回空,降级规则引擎")
    except Exception as e:
        logger.error(f"LLM API 调用异常: {e}")
        # Trip the breaker so subsequent calls return immediately.
        _LLM_FAILED = True
    return None
def enum_paths(node, fields):
global _ACTIVE_MAX_PATHS
# === LLM 优先(整体替换整个树的路径生成) ===
llm_result = _try_llm_enum_paths(node, fields)
if llm_result is not None:
_ACTIVE_MAX_PATHS = _MAX_PATHS
return llm_result
if _ACTIVE_MAX_PATHS == _MAX_PATHS:
logger.warning("降级到规则引擎(路径上限 5000 → 100)")
_ACTIVE_MAX_PATHS = _FALLBACK_MAX_PATHS
"""枚举路径,每条路径返回 (constraints, assignments).
返回 list[tuple[list[tuple], dict]].
"""
@@ -86,7 +86,7 @@ def enum_paths(node, fields):
merged.setdefault(k, []).extend(v if isinstance(v, list) else [v])
merged_cons = p_cons + list(cp_cons)
new_active.append((merged_cons, merged))
paths = _cap_paths(new_active)
paths = _cap_paths_fair(new_active, child_paths)
return paths
elif isinstance(node, BrIf):
@@ -170,49 +170,56 @@ def enum_paths(node, fields):
return paths
if node.subject == 'TRUE':
paths = []
prior_false = []
prior_false_sets = [] # list[list[Constraint]]
for value, seq in node.when_list:
cond = parse_compound_condition(value, fields)
if cond and isinstance(cond, CondLeaf) and is_field(cond.field, fields):
# Simple condition
sub = _cap_paths(enum_paths(seq, fields))
for sp_cons, sp_assign in (sub or [([], {})]):
constraints = list(prior_false)
constraints = [c for pf in prior_false_sets for c in pf]
constraints.append((cond.field, cond.op, cond.value, True))
paths.append((constraints + sp_cons, sp_assign))
prior_false.append((cond.field, cond.op, cond.value, False))
prior_false_sets.append([(cond.field, cond.op, cond.value, False)])
elif cond:
# Compound condition — use MC/DC for path generation
leaves = collect_leaves(cond)
if leaves and all(is_field(l.field, fields) for l in leaves):
sets = mcdc_sets(cond, fields)
if sets:
sub = _cap_paths(enum_paths(seq, fields))
false_set = None
new_false_sets = []
for cs, decision in sets:
if decision:
for sp_cons, sp_assign in (sub or [([], {})]):
paths.append((list(prior_false) + list(cs) + sp_cons, sp_assign))
elif false_set is None:
false_set = cs
if false_set is not None:
prior_false.extend(false_set)
else:
prior_false = []
if not prior_false_sets:
for sp_cons, sp_assign in (sub or [([], {})]):
paths.append((list(cs) + sp_cons, sp_assign))
else:
for pf_set in prior_false_sets:
for sp_cons, sp_assign in (sub or [([], {})]):
paths.append((list(pf_set) + list(cs) + sp_cons, sp_assign))
else:
new_false_sets.append(cs)
if not new_false_sets:
prior_false_sets = []
break
combined = []
for pf_set in prior_false_sets:
for nf_set in new_false_sets:
combined.append(list(pf_set) + list(nf_set))
prior_false_sets = combined
else:
prior_false = []
prior_false_sets = []
break
else:
prior_false = []
prior_false_sets = []
break
else:
prior_false = []
prior_false_sets = []
break
if node.has_other:
sub = _cap_paths(enum_paths(node.other_seq, fields))
for sp_cons, sp_assign in (sub or [([], {})]):
paths.append((list(prior_false) + sp_cons, sp_assign))
constraints = [c for pf in prior_false_sets for c in pf]
paths.append((constraints + sp_cons, sp_assign))
return paths
if not is_field(node.subject, fields):
return [([], {})]
@@ -228,6 +235,9 @@ def enum_paths(node, fields):
paths.append(([(node.subject, 'not_in', case_vals, True)] + sp_cons, sp_assign))
return paths
elif isinstance(node, BrSearch):
return _enum_search_paths(node, fields)
elif isinstance(node, BrPerform):
if node.perf_type in ('para', 'thru'):
if node.body_seq:
@@ -743,12 +753,90 @@ def apply_occurs_depending(rec, fields):
rec[name] = '0' * length
def _non_match_for(cond_leaf, fields):
if not fields or not cond_leaf:
return None
base = re.sub(r'\s*\(.*?\)\s*$', '', cond_leaf.field)
for f in fields:
if re.sub(r'\s*\(.*?\)\s*$', '', f['name']) == base:
pic = f.get('pic_info', {})
if pic.get('type') == 'numeric':
return '0'
return ' '
return None
def _enum_search_paths(node, fields):
    """Enumerate paths through a SEARCH node via table-element assignments.

    For the i-th WHEN with a simple (leaf) condition, element ``i + 1`` of
    the tested field is assigned the matching literal and the elements
    before it a non-matching filler. For AT END, every element (up to the
    inferred OCCURS count) gets the filler. No constraints are added; the
    body's own constraints and assignments are carried through.

    Returns:
        ``[(constraints, assignments), ...]`` where each path owns its
        assignment lists (nothing is shared between paths).
    """
    subscript = r'\s*\(.*?\)\s*$'

    def _merged(base_assign, path_assign):
        # Copy the per-key lists: a plain dict() copy would share them
        # between sibling paths, and the extend() below would then leak one
        # path's assignments into the others.
        merged = {key: list(ops) for key, ops in base_assign.items()}
        for key, ops in path_assign.items():
            merged.setdefault(key, []).extend(ops if isinstance(ops, list) else [ops])
        return merged

    # Infer the OCCURS count from the condition field name, e.g. for
    # WS-CODE-VAL(WS-IDX) look for the largest j among WS-CODE-VAL(j).
    occurs_count = 1
    if node.when_list and node.cond_trees and node.cond_trees[0]:
        ct = node.cond_trees[0]
        if isinstance(ct, CondLeaf):
            base = re.sub(subscript, '', ct.field)
            for f in fields:
                m = re.match(rf'^{re.escape(base)}\((\d+)\)$', f['name'])
                if m:
                    occurs_count = max(occurs_count, int(m.group(1)))
    if occurs_count <= 1:
        # Fall back to subscripted entries of the table name itself.
        parent = node.table_name
        for f in fields:
            m = re.match(rf'^{re.escape(parent)}\((\d+)\)$', f['name'])
            if m:
                occurs_count = max(occurs_count, int(m.group(1)))

    paths = []
    for i, (cond_text, body_seq) in enumerate(node.when_list):
        cond_tree = node.cond_trees[i] if i < len(node.cond_trees) else None
        sub = _cap_paths(enum_paths(body_seq, fields)) or [([], {})]
        extra_assign = {}
        if cond_tree and isinstance(cond_tree, CondLeaf):
            base = re.sub(subscript, '', cond_tree.field)
            extra_assign[f'{base}({i + 1})'] = [
                {'type': 'move_literal', 'literal': cond_tree.value}
            ]
            non_match = _non_match_for(cond_tree, fields) or ' '
            for j in range(i):
                extra_assign[f'{base}({j + 1})'] = [
                    {'type': 'move_literal', 'literal': non_match}
                ]
        for sp_cons, sp_assign in sub:
            paths.append((sp_cons, _merged(extra_assign, sp_assign)))

    if node.has_at_end:
        sub = _cap_paths(enum_paths(node.at_end_seq, fields)) or [([], {})]
        # The filler assignments do not depend on the sub-path, so build
        # them once; _merged copies them for every path.
        extra_assign = {}
        first_tree = node.cond_trees[0] if node.when_list and node.cond_trees else None
        if first_tree and isinstance(first_tree, CondLeaf):
            non_match = _non_match_for(first_tree, fields) or ' '
            base = re.sub(subscript, '', first_tree.field)
            for j in range(max(occurs_count, 1)):
                extra_assign[f'{base}({j + 1})'] = [
                    {'type': 'move_literal', 'literal': non_match}
                ]
        for sp_cons, sp_assign in sub:
            paths.append((sp_cons, _merged(extra_assign, sp_assign)))
    return paths
def generate_records(branch_paths_with_assigns, data_fields, base_assignments=None, file_sec=None):
    """Generate one test-data record per enumerated branch path.

    Args:
        branch_paths_with_assigns: list of ``(constraints, path_assignments)``
            tuples, one per path.
        data_fields: list of field dicts (each has at least a ``'name'`` key).
        base_assignments: global assignments dict, used when tracing a
            constrained field back to its root variable.
        file_sec: forwarded unchanged to ``propagate_assignments``.

    Returns:
        ``(records, kept_path_cons)`` where ``kept_path_cons[i]`` is the
        (STOP-filtered) constraint list that produced ``records[i]``. Paths
        judged impossible are dropped from both lists. If no record survives,
        a single base record with an empty constraint list is returned.
    """
    records = []
    kept_path_cons = []
    for seq, (path_cons, path_assign) in enumerate(branch_paths_with_assigns or [], start=1):
        path_cons = _filter_stop(path_cons)
        # NOTE(review): the record initialisation was not visible in this span;
        # make_base_record(seq, ...) mirrors the fallback at the end of this
        # function - confirm against the full file.
        rec = make_base_record(seq, data_fields)
        has_assign_map = isinstance(path_assign, dict)

        # Pass A: propagate assignments (MOVE/COMPUTE/READ INTO ...) first, to
        # simulate the program state right before the decision points.
        if has_assign_map:
            propagate_assignments(rec, path_assign, data_fields, file_sec=file_sec)

        # Pass A.5: drop paths whose constraint traces back to a literal MOVE
        # that can never satisfy it (impossible path).
        if base_assignments and has_assign_map:
            if _is_impossible_path(rec, path_cons, path_assign, base_assignments, data_fields):
                continue

        # Pass B: apply the constraints so that the decision conditions hold,
        # overriding whatever values the MOVEs produced.
        for c in path_cons:
            if len(c) == 4:
                field, op, val, want = c
                apply_constraint(rec, field, op, val, want, data_fields, base_assignments, path_assign)

        # Pass B.5: re-propagate variable-to-variable MOVEs so assignment
        # chains stay consistent after constraints changed the values.
        if has_assign_map:
            forward = _forward_moves(path_assign)
            if forward:
                propagate_assignments(rec, forward, data_fields, file_sec=file_sec)

        # Pass C: synchronise REDEFINES so shared storage stays consistent.
        sync_redefined_fields(rec, data_fields)
        # Pass D: OCCURS DEPENDING ON - clear subscripted fields out of range.
        apply_occurs_depending(rec, data_fields)

        records.append(rec)
        kept_path_cons.append(path_cons)

    if not records:
        # Always hand back at least one record, paired with an empty
        # constraint list so both returned lists stay aligned.
        rec = make_base_record(1, data_fields)
        if base_assignments:
            propagate_assignments(rec, base_assignments, data_fields, file_sec=file_sec)
        records.append(rec)
        kept_path_cons.append([])
    return records, kept_path_cons


def _is_impossible_path(rec, path_cons, path_assign, base_assignments, data_fields):
    """Return True if a constraint conflicts with a literal MOVE on its root variable."""
    for c in path_cons:
        if len(c) != 4:
            continue
        field, op, val, want = c
        root_var, chain = trace_to_root(field, base_assignments, data_fields, path_assign)
        if root_var == field:
            continue
        new_fn, new_op, new_val = invert_through_chain(root_var, chain, op, val)
        if not any(f['name'] == new_fn for f in data_fields):
            continue
        asgn_val = path_assign.get(root_var)
        if asgn_val is None:
            continue
        asgn_list = asgn_val if isinstance(asgn_val, list) else [asgn_val]
        # Only a trailing literal MOVE pins the root value on this path.
        if not asgn_list or asgn_list[-1]['type'] != 'move_literal' or root_var not in rec:
            continue
        if not _check_constraint_satisfied(rec, root_var, new_op, new_val, want, data_fields):
            return True
    return False


def _forward_moves(path_assign):
    """Keep only the variable-to-variable MOVE entries of a path assignment map."""
    forward = {}
    for tgt, asgn_val in path_assign.items():
        asgn_list = asgn_val if isinstance(asgn_val, list) else [asgn_val]
        moves = [a for a in asgn_list if a['type'] == 'move' and a.get('source_vars')]
        if moves:
            forward[tgt] = moves
    return forward
+12
View File
@@ -115,6 +115,18 @@ class CondOr:
self.right = right
class BrSearch:
    """Branch-tree node for a SEARCH / SEARCH ALL table lookup."""

    def __init__(self, table_name, is_all=False, varying=None):
        # Name of the searched table, stored exactly as passed in.
        self.table_name = table_name
        # Flag passed by the caller; per the class purpose it marks SEARCH ALL.
        self.is_all = is_all
        # VARYING operand, upper-cased; any falsy value is stored as None.
        if varying:
            self.varying = varying.upper()
        else:
            self.varying = None
        # Statement sequence for the AT END branch (starts empty).
        self.at_end_seq = BrSeq()
        # WHEN branches: [(condition_text, BrSeq)]
        self.when_list = []
        # Condition trees: [cond_tree, ...]
        self.cond_trees = []
        # Initialised to False; not changed inside this class.
        self.has_at_end = False
class GoTo:
"""GO TO 节点:无条件跳转到指定段落"""
def __init__(self, target: str, body_seq: 'BrSeq' = None):
@@ -1,596 +0,0 @@
你是一个 COBOL 自动化测试数据生成器的核心解析模块。你的任务是将预处理的 COBOL PROCEDURE DIVISION 源码转换为结构化的 JSON 树,用于后续的路径枚举和测试数据生成。
## 输入格式
你会收到两样东西:
1. **PROCEDURE DIVISION 源码文本** — 已预处理(大写、无注释、缩进规整)
2. **DATA DIVISION 字段列表** — JSON 数组,每个字段包括 name/level/pic/pic_info 等
## 输出格式
输出一个 JSON 对象,包含两个顶级键:
### 1. `assignments` (对象)
记录了 PROCEDURE DIVISION 中每个赋值语句的来源信息。键是目标字段名,值是一个对象,类型如下:
- **move**: 变数对变数 MOVE (e.g., `MOVE WS-A TO WS-B`)
```json
{"type": "move", "source_vars": ["WS-A"]}
```
- **move_literal**: 字面量/定数 MOVE (e.g., `MOVE 'HELLO' TO WS-B`, `MOVE ZERO TO WS-B`)
```json
{"type": "move_literal", "literal": "HELLO"}
```
- **compute**: COMPUTE/ADD/SUBTRACT/MULTIPLY/DIVIDE
- 二元运算 (var OP const / const OP var):
```json
{"type": "compute", "source_vars": ["WS-A"], "op": "+", "const": 5, "expr": "WS-A + 5"}
```
- 变数间运算 (var OP var):
```json
{"type": "compute", "source_vars": ["WS-A", "WS-B"], "op": "+", "expr": "WS-A + WS-B"}
```
- 复杂表达式 (无法解析):
```json
{"type": "compute", "source_vars": ["WS-A", "WS-B"], "op": null, "const": null, "expr": "WS-A * (WS-B + 1)"}
```
### 2. `tree` (对象)
一个递归的 JSON 树,表示 PROCEDURE DIVISION 的代码结构。不要包含注释、段落标签(仅作为 PERFORM 目标引用)。
#### 节点类型
**seq**: 顺序序列(子节点列表)
```json
{"type": "seq", "children": [子节点...]}
```
**assign**: 赋值语句(MOVE / COMPUTE / ADD / SUBTRACT / MULTIPLY / DIVIDE)
```json
{"type": "assign", "target": "WS-STATUS", "source_info": {"type": "move_literal", "literal": "H"}}
```
source_info 必须与 assignments 中对应条目一致。
**if**: 条件分支
```json
{
"type": "if",
"condition": "WS-AMOUNT > 1000",
"true_seq": {"type": "seq", "children": [...]},
"false_seq": {"type": "seq", "children": [...]}
}
```
- 如果无 ELSE,false_seq 应为 `{"type": "seq", "children": []}`
- condition 保持原始文本(不加解析)
**eval**: EVALUATE 多路分支
```json
{
"type": "eval",
"subject": "WS-TYPE",
"when_list": [
{"value": "A", "seq": {"type": "seq", "children": [...]}},
{"value": "B", "seq": {"type": "seq", "children": [...]}}
],
"other_seq": {"type": "seq", "children": [...]},
"has_other": true
}
```
- WHEN OTHER 时 has_other=true
- 无 WHEN OTHER 时 has_other=false, other_seq 为空 seq
**call**: CALL 子程序调用
```json
{"type": "call", "program_name": "SUBPGM", "using_params": [
{"name": "WS-AMOUNT", "mechanism": "reference"},
{"name": "WS-RESULT", "mechanism": "reference"}
]}
```
- CALL 是顺序执行语句(不产生分支),作为 seq 的子节点放在相应位置
- USING 参数按 COBOL 源码顺序列出
- mechanism 取值:
- `"reference"`: BY REFERENCE(默认)— 子程序可能修改该变量
- `"content"`: BY CONTENT — 传副本,调用方变量不会被修改
- `"value"`: BY VALUE — 传值(仅数值/指针)
- 无 BY 子句时默认为 `"reference"`
- 字面量参数(如 `BY VALUE 100`)不包含字段名,只在 mechanism 为 `"value"` 时保留
**perform**: PERFORM 语句
```json
// 段落调用:
{"type": "perform", "perf_type": "para", "target": "1000-INIT"}
// PERFORM THRU:
{"type": "perform", "perf_type": "thru", "target": "1000-INIT", "thru": "2000-END"}
// 内联 PERFORM UNTIL:
{"type": "perform", "perf_type": "until", "condition": "WS-COUNT > 3",
"body_seq": {"type": "seq", "children": [...]}}
// PERFORM VARYING:
{"type": "perform", "perf_type": "varying", "condition": "WS-I > 10",
"varying_var": "WS-I", "varying_from": "1", "varying_by": "1",
"body_seq": {"type": "seq", "children": [...]}}
// PERFORM 段落 + UNTIL:
{"type": "perform", "perf_type": "para_until", "target": "2000-HIGH", "condition": "WS-COUNT > 100"}
```
### 定数 (Figurative Constants) 处理规则
以下定数在 MOVE 时直接用作字面量(保留原值):
| 定数 | 规则 |
|------|------|
| ZERO / ZEROS / ZEROES | `literal: "0"` |
| SPACE / SPACES | `literal: " "` |
| HIGH-VALUE / HIGH-VALUES | `literal: "HIGH-VALUE"` |
| LOW-VALUE / LOW-VALUES | `literal: "LOW-VALUE"` |
| QUOTE / QUOTES | `literal: "'"` |
| ALL literal | `literal: literal值` |
## COBOL 语法处理规则
### 1. IF 语句
```
IF condition
statements...
[ELSE
statements...]
END-IF.
```
- condition 可以是简单条件、复合条件(AND/OR)、带 NOT 前置
- true_seq 为 condition 为真时执行的分支,false_seq 为条件为假时的分支
- IF 可以和 ELSE IF 嵌套,此时结构化为嵌套 if 的 false_seq
### 2. EVALUATE 语句
```
EVALUATE subject
WHEN value1
statements...
WHEN value2
statements...
WHEN OTHER
statements...
END-EVALUATE.
```
- subject 是单个字段
- value 是具体值或 OTHER
- 每个 WHEN 的 seq 是该分支下的语句序列
- WHEN 内的 GO TO / STOP RUN 不影响结构
### 3. PERFORM 语句
多种形态:
**段落调用**:
```
PERFORM 1000-INIT
```
**段落范围**:
```
PERFORM 1000-INIT THRU 2000-END
```
**内联 UNTIL**:
```
PERFORM UNTIL condition
statements...
END-PERFORM
```
**VARYING**:
```
PERFORM VARYING WS-I FROM 1 BY 1 UNTIL WS-I > 10
statements...
END-PERFORM
```
**段落 + UNTIL**:
```
PERFORM 2000-HIGH UNTIL WS-COUNT > 100
```
### 4. 段落 (Paragraphs)
PROCEDURE DIVISION 中的段落以标签名(后跟句点)开始、以下一个段落标签或文件末尾结束。
```
PARA-NAME.
statement
statement
.
NEXT-PARA.
statement
```
段落标签会被 PERFORM 引用。如果代码不在任何 PERFORM 中执行(顶级流程),段落按顺序依次执行,遇到 STOP RUN / GOBACK 结束。
在树结构中:
- 顶级流程入口(PROCEDURE DIVISION 后的第一个段落)作为树的根 seq
- 后续每个段落对应一个独立的 seq,只有在被 PERFORM 调用时才执行
- 段落标签本身不是节点,只作为 PERFORM 的目标引用
### 5. CALL 语句
CALL 调用子程序,参数通过 USING 传递。
```
CALL 'SUBPGM' USING WS-A WS-B WS-C
CALL 'SUBPGM' USING BY REFERENCE WS-A BY CONTENT WS-B BY VALUE 100
```
- CALL 是顺序执行,不产生分支
- USING 参数按 COBOL 源码顺序列出
- 缺省传递机制时默认为 BY REFERENCE
- 字段名参数保持原样,字面量/数值参数如 `BY VALUE 100` 不放入 using_params(因为无字段名)
- CALL 后继续执行下一条语句
### 6. 赋值语句
| COBOL | JSON 类型 | 示例 source_info |
|-------|-----------|-----------------|
| MOVE 'HELLO' TO WS-A | move_literal | `{"type":"move_literal","literal":"HELLO"}` |
| MOVE WS-B TO WS-A | move | `{"type":"move","source_vars":["WS-B"]}` |
| MOVE ZERO TO WS-A | move_literal | `{"type":"move_literal","literal":"0"}` |
| MOVE SPACE TO WS-A | move_literal | `{"type":"move_literal","literal":" "}` |
| MOVE HIGH-VALUE TO WS-A | move_literal | `{"type":"move_literal","literal":"HIGH-VALUE"}` |
| COMPUTE WS-A = WS-B + 1 | compute (var OP const) | `{"type":"compute","source_vars":["WS-B"],"op":"+","const":1,"expr":"WS-B + 1"}` |
| COMPUTE WS-A = 2 * WS-B | compute (const OP var) | 同上,op="*" |
| COMPUTE WS-A = WS-B + WS-C | compute (var OP var) | `{"type":"compute","source_vars":["WS-B","WS-C"],"op":"+","expr":"WS-B + WS-C"}` |
| COMPUTE WS-A = (WS-B + 1) * WS-C | compute (复杂) | `{"type":"compute","source_vars":["WS-B","WS-C"],"op":null,"const":null,"expr":"(WS-B + 1) * WS-C"}` |
| ADD 5 TO WS-A | compute (const) | `{"type":"compute","source_vars":["WS-A"],"op":"+","const":5,"expr":"WS-A + 5"}` |
| SUBTRACT 3 FROM WS-A | compute (const) | `{"type":"compute","source_vars":["WS-A"],"op":"-","const":3,"expr":"WS-A - 3"}` |
| MULTIPLY 2 BY WS-A | compute (const) | `{"type":"compute","source_vars":["WS-A"],"op":"*","const":2,"expr":"WS-A * 2"}` |
| DIVIDE 4 INTO WS-A | compute (const) | `{"type":"compute","source_vars":["WS-A"],"op":"/","const":4,"expr":"WS-A / 4"}` |
### 7. 控制流结束
| 语句 | 含义 |
|------|------|
| STOP RUN | 程序结束,不执行后续代码 |
| GOBACK | 返回调用者(类似 STOP RUN) |
| EXIT PROGRAM | 返回调用者 |
这些语句不是树节点,但标记了当前段落/分支的结束。
### 8. 88-level 条件名
```
05 CALL-TYPE PIC X(1).
88 CALL-LOCAL VALUE 'L'.
88 CALL-DOMESTIC VALUE 'D'.
```
在条件中如 `IF CALL-LOCAL`,等价于 `IF CALL-TYPE = 'L'`。条件名可替换为父字段 + 值。
## 输出规则总结
1. **assignments**: 包含所有出现的赋值语句,**不区分分支**(全局收集)
2. **tree**: 只包含结构化的 if/eval/perform/assign 节点,**不包含段落标签**
3. 注释行(* 在第7列)已被预处理移除
4. 每个 assign 节点必须与 assignments 中的条目一一对应
5. condition 保持原始文本,不要解析或转换
6. 88-level 条件在 tree.condition 中直接替换为父字段条件(如 `IF CALL-TYPE = 'L'`)
7. 赋值中的字段名、字面量保持原始值,多单词字段用连字符(如 WS-AMOUNT)
## Few-Shot 示例
### 示例 1:简单 IF/ELSE
**输入:**
```
PROCEDURE DIVISION.
IF WS-AMOUNT > 1000
MOVE 'H' TO WS-STATUS
ELSE
MOVE 'L' TO WS-STATUS
END-IF.
STOP RUN.
```
**输出:**
```json
{
"assignments": {
"WS-STATUS": {"type": "move_literal", "literal": "H"},
"WS-STATUS": {"type": "move_literal", "literal": "L"}
},
"tree": {
"type": "seq",
"children": [
{
"type": "if",
"condition": "WS-AMOUNT > 1000",
"true_seq": {
"type": "seq",
"children": [
{"type": "assign", "target": "WS-STATUS", "source_info": {"type": "move_literal", "literal": "H"}}
]
},
"false_seq": {
"type": "seq",
"children": [
{"type": "assign", "target": "WS-STATUS", "source_info": {"type": "move_literal", "literal": "L"}}
]
}
}
]
}
}
```
### 示例 2:EVALUATE
**输入:**
```
PROCEDURE DIVISION.
EVALUATE WS-TYPE
WHEN 'A'
MOVE 'TYPE-A' TO WS-MEMO
WHEN 'B'
MOVE 'TYPE-B' TO WS-MEMO
WHEN OTHER
MOVE 'OTHER' TO WS-MEMO
END-EVALUATE.
STOP RUN.
```
**输出:**
```json
{
"assignments": {
"WS-MEMO": {"type": "move_literal", "literal": "TYPE-A"},
"WS-MEMO": {"type": "move_literal", "literal": "TYPE-B"},
"WS-MEMO": {"type": "move_literal", "literal": "OTHER"}
},
"tree": {
"type": "seq",
"children": [
{
"type": "eval",
"subject": "WS-TYPE",
"when_list": [
{"value": "A", "seq": {"type": "seq", "children": [
{"type": "assign", "target": "WS-MEMO", "source_info": {"type": "move_literal", "literal": "TYPE-A"}}
]}},
{"value": "B", "seq": {"type": "seq", "children": [
{"type": "assign", "target": "WS-MEMO", "source_info": {"type": "move_literal", "literal": "TYPE-B"}}
]}}
],
"other_seq": {"type": "seq", "children": [
{"type": "assign", "target": "WS-MEMO", "source_info": {"type": "move_literal", "literal": "OTHER"}}
]},
"has_other": true
}
]
}
}
```
### 示例 3:嵌套 IF + PERFORM 段落
**输入:**
```
PROCEDURE DIVISION.
IF WS-AMOUNT > 5000
PERFORM 2000-HIGH
ELSE
PERFORM 3000-LOW
END-IF.
STOP RUN.
2000-HIGH.
MOVE 'H' TO WS-STATUS.
3000-LOW.
MOVE 'L' TO WS-STATUS.
```
**输出:**
```json
{
"assignments": {
"WS-STATUS": {"type": "move_literal", "literal": "H"},
"WS-STATUS": {"type": "move_literal", "literal": "L"}
},
"tree": {
"type": "seq",
"children": [
{
"type": "if",
"condition": "WS-AMOUNT > 5000",
"true_seq": {"type": "seq", "children": [
{"type": "perform", "perf_type": "para", "target": "2000-HIGH"}
]},
"false_seq": {"type": "seq", "children": [
{"type": "perform", "perf_type": "para", "target": "3000-LOW"}
]}
}
]
}
}
```
### 示例 4:内联 PERFORM UNTIL
**输入:**
```
PROCEDURE DIVISION.
MOVE 1 TO WS-COUNT.
PERFORM UNTIL WS-COUNT > 10
ADD 1 TO WS-COUNT
END-PERFORM.
STOP RUN.
```
**输出:**
```json
{
"assignments": {
"WS-COUNT": {"type": "move_literal", "literal": "1"},
"WS-COUNT": {"type": "compute", "source_vars": ["WS-COUNT"], "op": "+", "const": 1, "expr": "WS-COUNT + 1"}
},
"tree": {
"type": "seq",
"children": [
{"type": "assign", "target": "WS-COUNT", "source_info": {"type": "move_literal", "literal": "1"}},
{
"type": "perform",
"perf_type": "until",
"condition": "WS-COUNT > 10",
"body_seq": {"type": "seq", "children": [
{"type": "assign", "target": "WS-COUNT", "source_info": {"type": "compute", "source_vars": ["WS-COUNT"], "op": "+", "const": 1, "expr": "WS-COUNT + 1"}}
]}
}
]
}
}
```
### 示例 5:PERFORM VARYING + 复合条件
**输入:**
```
PROCEDURE DIVISION.
MOVE 0 TO WS-TOTAL-CHARGE.
PERFORM VARYING WS-COUNT FROM 1 BY 1 UNTIL WS-COUNT > 3
IF CALL-HOUR >= 08 AND CALL-HOUR < 22
MOVE 'Y' TO WS-PEAK-FLAG
ELSE
MOVE 'N' TO WS-PEAK-FLAG
END-IF
END-PERFORM.
STOP RUN.
```
**输出:**
```json
{
"assignments": {
"WS-TOTAL-CHARGE": {"type": "move_literal", "literal": "0"},
"WS-PEAK-FLAG": {"type": "move_literal", "literal": "Y"},
"WS-PEAK-FLAG": {"type": "move_literal", "literal": "N"}
},
"tree": {
"type": "seq",
"children": [
{"type": "assign", "target": "WS-TOTAL-CHARGE", "source_info": {"type": "move_literal", "literal": "0"}},
{
"type": "perform",
"perf_type": "varying",
"condition": "WS-COUNT > 3",
"varying_var": "WS-COUNT",
"varying_from": "1",
"varying_by": "1",
"body_seq": {"type": "seq", "children": [
{
"type": "if",
"condition": "CALL-HOUR >= 08 AND CALL-HOUR < 22",
"true_seq": {"type": "seq", "children": [
{"type": "assign", "target": "WS-PEAK-FLAG", "source_info": {"type": "move_literal", "literal": "Y"}}
]},
"false_seq": {"type": "seq", "children": [
{"type": "assign", "target": "WS-PEAK-FLAG", "source_info": {"type": "move_literal", "literal": "N"}}
]}
}
]}
}
]
}
}
```
### 示例 6:88-level 条件名
**输入:**
```
PROCEDURE DIVISION.
IF CALL-LOCAL
MOVE 'L' TO WS-TYPE
END-IF.
STOP RUN.
```
(DATA: 88 CALL-LOCAL VALUE 'L', parent field CALL-TYPE PIC X(1))
**输出:**
```json
{
"assignments": {
"WS-TYPE": {"type": "move_literal", "literal": "L"}
},
"tree": {
"type": "seq",
"children": [
{
"type": "if",
"condition": "CALL-TYPE = 'L'",
"true_seq": {"type": "seq", "children": [
{"type": "assign", "target": "WS-TYPE", "source_info": {"type": "move_literal", "literal": "L"}}
]},
"false_seq": {"type": "seq", "children": []}
}
]
}
}
```
### 示例 7:CALL 子程序调用
**输入:**
```
PROCEDURE DIVISION.
MOVE 0 TO WS-RESULT.
IF WS-AMOUNT > 1000
MOVE 'H' TO WS-STATUS
CALL 'CALCSUB' USING WS-AMOUNT WS-TYPE WS-RESULT
ELSE
MOVE 'L' TO WS-STATUS
CALL 'CALCSUB' USING WS-AMOUNT WS-TYPE WS-RESULT
END-IF.
STOP RUN.
```
**输出:**
```json
{
"assignments": {
"WS-RESULT": {"type": "move_literal", "literal": "0"},
"WS-STATUS": {"type": "move_literal", "literal": "H"},
"WS-STATUS": {"type": "move_literal", "literal": "L"}
},
"tree": {
"type": "seq",
"children": [
{"type": "assign", "target": "WS-RESULT", "source_info": {"type": "move_literal", "literal": "0"}},
{
"type": "if",
"condition": "WS-AMOUNT > 1000",
"true_seq": {"type": "seq", "children": [
{"type": "assign", "target": "WS-STATUS", "source_info": {"type": "move_literal", "literal": "H"}},
{"type": "call", "program_name": "CALCSUB", "using_params": [
{"name": "WS-AMOUNT", "mechanism": "reference"},
{"name": "WS-TYPE", "mechanism": "reference"},
{"name": "WS-RESULT", "mechanism": "reference"}
]}
]},
"false_seq": {"type": "seq", "children": [
{"type": "assign", "target": "WS-STATUS", "source_info": {"type": "move_literal", "literal": "L"}},
{"type": "call", "program_name": "CALCSUB", "using_params": [
{"name": "WS-AMOUNT", "mechanism": "reference"},
{"name": "WS-TYPE", "mechanism": "reference"},
{"name": "WS-RESULT", "mechanism": "reference"}
]}
]}
}
]
}
}
```
## 错误处理
- 无法识别的语句:跳过该行(不影响整体结构)
- 不完整的语句(如 IF 无 END-IF):尝试合理推断嵌套关系
- 嵌套段落引用(PERFORM A THRU B):使用 perf_type "thru"
- 字段名与 88-level 名冲突:以字段定义为准
## 输出要求
- 只输出一个 JSON 对象(无多余文本、无 markdown 标记)
- JSON 必须合法(双引号、正确逗号、无尾逗号)
- assignments 中**每个赋值只记录一次**(不区分分支)
- tree 必须完整包含所有可达代码路径
- 字段名、字面量保持原始值(不转换大小写,不移动)
+1 -1
View File
@@ -52,7 +52,7 @@ def preprocess(source: str) -> str:
if not line:
continue
content = line
lines.append(content.upper())
lines.append(re.sub(r'\s+FALSE\s+[^\s.]+', '', content.upper()))
return '\n'.join(lines)