merge local cobol_testgen improvements into v3 shared modules

- cond.py: SQLCODE/SQLSTATE handling, alphanumeric >/< boundary fix
- output.py: termination tracking, db_input support, _is_field_assigned filter
- coverage.py: mark_from_gcov, THRU support, KeyError protection
- gcov.py: new file (dependency for coverage.py)
- grammar.lark: multi-segment PIC support
- read.py: SQL INCLUDE resolution, DECLARE TABLE parsing, * comment fix
- core.py: SQL parsing, blocked_names, keyword list
- design.py: multi-sentinel, THRU ranges, PERFORM VARYING last iteration
- __init__.py: local main() + v3 API functions, guarded imports

All 6 ZAN programs verified passing through v3 pipeline.
2026-06-23 22:38:17 +08:00
parent e5ab3baa46
commit 7fb9304212
9 changed files with 1595 additions and 326 deletions
@@ -1,9 +1,12 @@
-"""??????? + COPYBOOK + DATA DIVISION?? + PIC"""
+"""Preprocessor + COPYBOOK + DATA DIVISION parse + PIC"""

 import re
+import logging
 from pathlib import Path
 from lark import Lark, Transformer, v_args

+logger = logging.getLogger(__name__)
+
 from .models import FieldDef, PicInfo


@@ -85,6 +88,8 @@ def preprocess(source: str) -> str:
            if len(line) >= 7 and line[6].upper() == 'D':
                continue
            content = line[6:] if len(line) >= 7 else line
+            if content.strip().startswith('*'):
+                continue
        else:
            comment_pos = line.find('*>')
            if comment_pos >= 0:
@@ -192,6 +197,125 @@ def resolve_copybooks(source: str, source_dir: str, _recursion_depth: int = 0,
    return '\n'.join(result)


+# ── EXEC SQL INCLUDE Resolution ──
+
+_RE_SQL_INC = re.compile(
+    r'EXEC\s+SQL\s+INCLUDE\s+(\w[\w-]*)\s+END-EXEC\.',
+    re.IGNORECASE | re.DOTALL
+)
+
+_BUILTIN_SQLCA = """\
+        01 SQLCA.
+            05 SQLCAID      PIC X(8).
+            05 SQLCABC      PIC S9(9) COMP.
+            05 SQLCODE      PIC S9(9) COMP.
+            05 SQLERRM.
+                10 SQLERRML PIC S9(4) COMP.
+                10 SQLERRMC PIC X(70).
+            05 SQLERRP      PIC X(8).
+            05 SQLERRD      OCCURS 6 TIMES PIC S9(9) COMP.
+            05 SQLWARN.
+                10 SQLWARN0 PIC X.
+                10 SQLWARN1 PIC X.
+                10 SQLWARN2 PIC X.
+                10 SQLWARN3 PIC X.
+                10 SQLWARN4 PIC X.
+                10 SQLWARN5 PIC X.
+                10 SQLWARN6 PIC X.
+                10 SQLWARN7 PIC X.
+            05 SQLSTATE     PIC X(5).
+"""
+
+
+def resolve_sql_includes(source: str, source_dir: str) -> str:
+    """Resolve EXEC SQL INCLUDE name END-EXEC. like COPY. Injects built-in SQLCA if not found."""
+    def _resolve_one(m):
+        name = m.group(1).upper()
+        for ext in ('', '.cpy', '.CPY', '.cbl', '.CBL'):
+            p = Path(source_dir) / f"{name}{ext}"
+            if p.exists():
+                return p.read_text(encoding='utf-8')
+        if name == 'SQLCA':
+            return _BUILTIN_SQLCA
+        logger.warning(f"SQL INCLUDE {name} not found, injecting as comment")
+        return f"      * SQL INCLUDE {name} NOT RESOLVED\n"
+    while True:
+        new_source = _RE_SQL_INC.sub(_resolve_one, source)
+        if new_source == source:
+            break
+        source = new_source
+    return source
+
+
+_RE_SQL_BLOCK = re.compile(
+    r'EXEC\s+SQL\s+(.*?)\s+END-EXEC\.?',
+    re.IGNORECASE | re.DOTALL
+)
+
+_RE_DECLARE_TABLE = re.compile(
+    r'EXEC\s+SQL\s+DECLARE\s+(\w[\w-]*)\s+TABLE\s*\((.*?)\)\s+END-EXEC\.?',
+    re.IGNORECASE | re.DOTALL
+)
+
+
+def strip_exec_sql_from_data_div(source: str) -> tuple:
+    """Strip EXEC SQL blocks from DATA DIVISION. Returns (cleaned_source, declared_columns)."""
+    declared_columns = {}
+    def _repl(m):
+        full = m.group(0)
+        dm = _RE_DECLARE_TABLE.match(full)
+        if dm:
+            table_name = dm.group(1).upper()
+            col_text = dm.group(2)
+            cols = _parse_declare_table_columns(col_text)
+            declared_columns[table_name] = cols
+            return f"      *> DECLARE {table_name} TABLE ({len(cols)} cols)\n"
+        return "      *> SKIPPED EXEC SQL\n"
+    cleaned = _RE_SQL_BLOCK.sub(_repl, source)
+    return cleaned, declared_columns
+
+
+def _parse_declare_table_columns(col_text: str) -> list[dict]:
+    """Parse 'CUST_ID CHAR(5) NOT NULL, BALANCE PIC 9(6)' into column list."""
+    cols = []
+    for part in re.split(r',\s*', col_text):
+        part = part.strip()
+        if not part:
+            continue
+        m = re.match(
+            r'(\w[\w-]*)\s+(CHAR\s*\(\s*(\d+)\s*\)'
+            r'|VARCHAR\s*\(\s*(\d+)\s*\)'
+            r'|INTEGER|SMALLINT'
+            r'|DECIMAL\s*\(\s*(\d+)\s*(?:,\s*(\d+))?\s*\)'
+            r'|DATE'
+            r'|PIC\s+([\w().]+))'
+            r'(?:\s+NOT\s+NULL|\s+NULL)?',
+            part, re.IGNORECASE
+        )
+        if m:
+            name = m.group(1).upper()
+            if m.group(3):
+                col_type = {'db_type': 'CHAR', 'size': int(m.group(3))}
+            elif m.group(4):
+                col_type = {'db_type': 'VARCHAR', 'size': int(m.group(4))}
+            elif m.group(2).upper() == 'INTEGER':
+                col_type = {'db_type': 'INTEGER'}
+            elif m.group(2).upper() == 'SMALLINT':
+                col_type = {'db_type': 'SMALLINT'}
+            elif m.group(5):
+                prec = int(m.group(5)) if m.group(5) else 0
+                scale = int(m.group(6)) if m.group(6) else 0
+                col_type = {'db_type': 'DECIMAL', 'precision': prec, 'scale': scale}
+            elif m.group(2).upper() == 'DATE':
+                col_type = {'db_type': 'DATE'}
+            elif m.group(7):
+                col_type = {'db_type': 'PIC', 'pic': m.group(7).upper()}
+            else:
+                col_type = {'db_type': 'CHAR', 'size': 1}
+            cols.append({'name': name, **col_type})
+    return cols
+
+
 # ── Lark Grammar ──

 _GRAMMAR_CACHE = None
@@ -464,7 +588,7 @@ def parse_file_control(source: str) -> dict:
    """Parse FILE-CONTROL paragraph.

    Returns dict:
-        {filename: {"assign_to": str, "organization": str | None}}
+        {filename: {"assign": str, "organization": str, "recording_mode": str}}
    """
    m = re.search(r'FILE-CONTROL\.(.*?)(?=DATA\s+DIVISION|\Z)', source, re.DOTALL | re.IGNORECASE)
    if not m:
@@ -472,21 +596,39 @@ def parse_file_control(source: str) -> dict:
    fc = m.group(1)
    result = {}
    for sel_m in re.finditer(
-        r'SELECT\s+(\w[\w-]*)\s+[^.]*?\bASSIGN\s+TO\s+(["\'])(.*?)\2',
+        r'SELECT\s+(\w[\w-]*)\s+[^.]*?\bASSIGN\s+TO\s+'
+        r'(?:(["\'])(.*?)\2|(\w[\w-]*))'
+        r'[^.]*\.',
        fc, re.IGNORECASE
    ):
-        fname = sel_m.group(1).upper()
-        assign_to = sel_m.group(3).upper()
-        # Extract ORGANIZATION clause within this SELECT statement
-        org_m = re.search(
-            r'ORGANIZATION\s+(?:IS\s+)?(\w[\w-]*)',
-            sel_m.group(0), re.IGNORECASE
-        )
-        org = org_m.group(1).upper() if org_m else None
-        result[fname] = {
-            "assign_to": assign_to,
-            "organization": org,
-        }
+        name = sel_m.group(1).upper()
+        if sel_m.group(2):
+            assign_to = sel_m.group(3).upper()
+        else:
+            assign_to = sel_m.group(4).upper()
+        clause = sel_m.group(0)
+        org_m = re.search(r'ORGANIZATION\s+(LINE\s+)?SEQUENTIAL', clause, re.IGNORECASE)
+        if org_m and org_m.group(1):
+            org = 'LINE SEQUENTIAL'
+        elif org_m:
+            org = 'SEQUENTIAL'
+        else:
+            org = 'SEQUENTIAL'
+        result[name] = {'assign': assign_to, 'organization': org, 'recording_mode': 'F'}
+    # Extract RECORDING MODE from FD blocks in FILE SECTION
+    fd_sec_m = re.search(r'FILE\s+SECTION\.(.*?)(?=WORKING-STORAGE\s+SECTION|LINKAGE\s+SECTION|\Z)',
+                          source, re.DOTALL | re.IGNORECASE)
+    if fd_sec_m:
+        fs = fd_sec_m.group(1)
+        for block in re.split(r'\n\s*(?=FD\s+)', fs.strip()):
+            fd_m = re.match(r'FD\s+(\w[\w-]*)', block, re.IGNORECASE)
+            if not fd_m:
+                continue
+            fd_name = fd_m.group(1).upper()
+            if fd_name in result:
+                rm_m = re.search(r'RECORDING\s+MODE\s+IS\s+(\w)', block, re.IGNORECASE)
+                if rm_m:
+                    result[fd_name]['recording_mode'] = rm_m.group(1).upper()
    return result


@@ -499,14 +641,12 @@ def parse_file_section(source: str) -> dict:
    fs = m.group(1)
    result = {}
    # FD 和 SD 条目
-    blocks = re.split(r'\n\s*(?=(?:FD|SD)\s+)', fs.strip())
-    for block in blocks:
+    fd_blocks = re.split(r'\n\s*(?=(?:FD|SD)\s+)', fs.strip())
+    for block in fd_blocks:
        m = re.match(r'(FD|SD)\s+(\w[\w-]*)', block, re.IGNORECASE)
        if not m:
            continue
-        entry_type = m.group(1).upper()  # "FD" or "SD"
        name = m.group(2).upper()
-        # 找 01 层记录
        recs = re.findall(r'^\s*0{0,1}1\s+(\w[\w-]*)', block, re.MULTILINE)
        result[name] = [r.upper() for r in recs]
    return result
@@ -521,11 +661,15 @@ def scan_open_statements(source: str) -> dict:
        source, re.IGNORECASE
    ):
        full = m.group(1)
-        for seg_m in re.finditer(
-            r'(INPUT|OUTPUT|I-O)\s+([\w\s-]+)', full, re.IGNORECASE
-        ):
-            direction = seg_m.group(1).upper()
-            for fname in re.findall(r'\w[\w-]*', seg_m.group(2)):
-                if fname.upper() not in ('INPUT', 'OUTPUT', 'I-O'):
+        full = re.sub(r'\s+', ' ', full)
+        tokens = re.split(r'\s+(?=(?:INPUT|OUTPUT|I-O)\s)', full)
+        for seg in tokens:
+            seg = seg.strip()
+            if not seg:
+                continue
+            seg_m = re.match(r'(INPUT|OUTPUT|I-O)\s+([\w -]+)', seg, re.IGNORECASE)
+            if seg_m:
+                direction = seg_m.group(1).upper()
+                for fname in re.findall(r'\w[\w-]*', seg_m.group(2)):
                    dirs[fname.upper()] = direction
    return dirs