From 0b0a013f5194e827ebba0a74238d70d6eef18a13 Mon Sep 17 00:00:00 2001
From: NB-076
Date: Sun, 21 Jun 2026 12:52:04 +0800
Subject: [PATCH] fix: 3 critical parsing bugs found through statement benchmark testing

Bug 1: ELSE IF breaks IF false_seq parsing (core.py)
- _parse_if checked self.clean() == 'ELSE' which fails on 'ELSE IF ...'
- Fix: use startswith('ELSE'), reinsert IF portion for recursive parse
- Impact: ALL ELSE IF chains were silently dropped (huge branch loss)

Bug 2: READ skip loop greedily consumes subsequent statements (core.py)
- READ's AT END / NOT AT END skip loop used bare advance() with no
  statement boundary detection
- Fix: add _stmt_boundary regex that stops on IF/PERFORM/READ/etc.
- Impact: everything after first READ was consumed as 'AT END' lines

Bug 3: _walk() in extract_structure doesn't descend into BrPerform (__init__.py)
- Branch counting _walk() only handled BrIf/BrEval/BrSeq
- IF statements inside PERFORM bodies were never counted
- Fix: add BrPerform.body_seq and BrSearch descent

Combined impact: matching programs (MT01-33) now correctly report their
branches instead of 0.

Full regression: 749 passed (unchanged).
--- cobol_testgen/__init__.py | 8 +++++++- cobol_testgen/core.py | 19 +++++++++++++++++-- 2 files changed, 24 insertions(+), 3 deletions(-) diff --git a/cobol_testgen/__init__.py b/cobol_testgen/__init__.py index d8e2814..e16aab1 100644 --- a/cobol_testgen/__init__.py +++ b/cobol_testgen/__init__.py @@ -372,7 +372,7 @@ def extract_structure(cobol_source: str) -> dict: file_sec = parse_file_section(preprocessed) open_dir = scan_open_statements(proc_div) if proc_div else {} - from .models import BrIf, BrEval, BrSeq, BrPerform, Assign, CondAnd, CondOr + from .models import BrIf, BrEval, BrSeq, BrPerform, BrSearch, Assign, CondAnd, CondOr decision_points = [] total_branches = 0 @@ -403,6 +403,12 @@ def extract_structure(cobol_source: str) -> dict: elif isinstance(node, BrSeq): for child in node.children: _walk(child, counter) + elif isinstance(node, BrPerform): + _walk(node.body_seq, counter) + elif isinstance(node, BrSearch): + _walk(node.at_end_seq, counter) + for _, seq in node.when_list: + _walk(seq, counter) if branch_tree: _walk(branch_tree, [0]) diff --git a/cobol_testgen/core.py b/cobol_testgen/core.py index 5f6a0f3..29430f2 100644 --- a/cobol_testgen/core.py +++ b/cobol_testgen/core.py @@ -211,11 +211,21 @@ class _BrParser: seq.add(Assign(tgt, info)) self.advance() # 跳过 READ 语句剩余行(AT END / NOT AT END / END-READ) + # 遇到新的语句关键词时停止,避免贪婪吞咽后续内容 + _stmt_boundary = re.compile( + r'^(IF |EVALUATE |PERFORM |SEARCH |INITIALIZE |STRING |' + r'UNSTRING |CALL |ACCEPT |READ |WRITE |REWRITE |SET |' + r'INSPECT |MOVE |COMPUTE |ADD |SUBTRACT |MULTIPLY |DIVIDE |' + r'GO\s+TO |GOBACK |STOP\s+RUN|EXIT\s|CLOSE |OPEN |DISPLAY |' + r'DELETE |START |' + r'END-IF|END-PERFORM|END-EVALUATE|END-READ)', re.IGNORECASE) while self.pos < len(self.lines): cl = self.clean() if cl in ('END-READ', 'END-READ.'): self.advance() break + if _stmt_boundary.match(cl): + break self.advance() continue m_set_false = re.match(r'^SET\s+(\w[\w-]*)\s+TO\s+FALSE\s*$', line, re.IGNORECASE) @@ -658,8 +668,13 @@ 
class _BrParser: node = BrIf(cond_text) node.cond_tree = parse_compound_condition(node.condition, self.fields) node.true_seq = self.parse_seq(['ELSE', 'END-IF']) - if self.clean() == 'ELSE': - self.advance() + clean = self.clean() + if clean.startswith('ELSE'): + self.advance() # consume ELSE keyword + rest = clean[4:].strip() if len(clean) > 4 else '' + # ELSE IF → reinsert IF statement as next line for recursive parse + if rest.upper().startswith('IF '): + self.lines.insert(self.pos, rest) node.false_seq = self.parse_seq(['END-IF']) if self.clean() == 'END-IF': self.advance()