diff --git a/cobol_testgen/grammar.lark b/cobol_testgen/grammar.lark
index c91f2f1..8943d3d 100644
--- a/cobol_testgen/grammar.lark
+++ b/cobol_testgen/grammar.lark
@@ -7,14 +7,15 @@ FD_SUFFIX: /(?:"[^"]*"|'[^']*'|[^.])*\./
 working_storage: "WORKING-STORAGE" "SECTION" DOT data_item*
 linkage: "LINKAGE" "SECTION" DOT data_item*
 data_item: level_num (NAME | "FILLER") clause* DOT
-level_num: LEVEL
+level_num: INT
 clause: pic_clause | value_clause | occurs_clause | redefines_clause
       | usage_clause | "SYNC" | "SYNCHRONIZED"
       | "JUSTIFIED" "RIGHT"?
       | "BLANK" "WHEN" "ZERO"
       | "GLOBAL" | "EXTERNAL"
 pic_clause: "PIC" "IS"? PICTURE_STRING
-value_clause: "VALUE" "IS"? value_literal+
+value_clause: "VALUE" "IS"? value_list
+value_list: value_literal (","? value_literal)*
 value_literal: INT | SIGNED_NUMBER | STRING | SQSTRING
              | "ZERO" | "ZEROS" | "ZEROES"
              | "SPACE" | "SPACES"
@@ -27,9 +28,9 @@ key_clause: ("ASCENDING" | "DESCENDING") "KEY" "IS"? NAME (","? NAME)*
 indexed_clause: "INDEXED" "BY" NAME (","? NAME)*
 usage_clause: USAGE_VAL
 USAGE_VAL: "COMP" | "COMP-3" | "COMP-5" | "BINARY" | "PACKED-DECIMAL" | "DISPLAY"
-LEVEL: /0[1-9]|[1-4][0-9]|49|77|88/
+LEVEL: /0[1-9]|[0-4][0-9]|49|77|88|[0-9]+/
 NAME: /[A-Z][A-Z0-9-]*/i
-PICTURE_STRING: /[0-9A-Z()+,\-*\/V]+/i
+PICTURE_STRING: /[0-9A-Z()+,\-*\/V]+(?:\.[0-9A-Z()+,\-*\/V]+)?/i
 INT: /[0-9]+/
 DOT: /\./
 %import common.SIGNED_NUMBER
diff --git a/cobol_testgen/read.py b/cobol_testgen/read.py
index 1e8367c..806cf08 100644
--- a/cobol_testgen/read.py
+++ b/cobol_testgen/read.py
@@ -95,7 +95,7 @@ _COPYBOOK_EXTENSIONS = ['.cpy', '.cbl', '.cpb', '']
 def resolve_copybooks(source: str, source_dir: str, _recursion_depth: int = 0) -> str:
     """Find COPY statements and replace with copybook content."""
     _RE_COPY = re.compile(
-        r"^\s*COPY\s+(\w[\w-]*)(?:\s+REPLACING\s+(.+?))?\s*\.?\s*$",
+        r"^\s*COPY\s+(\w[\w-]*|\"[^\"]*\"|\'[^\']*\')(?:\s+REPLACING\s+(.+?))?\s*\.?\s*$",
         re.IGNORECASE
     )
     _RE_PAIR = re.compile(r"==(.+?)==\s+BY\s+==(.+?)==", re.IGNORECASE)
@@ -105,7 +105,8 @@ def resolve_copybooks(source: str, source_dir: str, _recursion_depth: int = 0) -
     for line in lines:
         m = _RE_COPY.match(line)
         if m:
-            name = m.group(1).upper()
+            raw_name = m.group(1)
+            name = raw_name.strip('"').strip("'").upper()
             found = None
             for ext in _COPYBOOK_EXTENSIONS:
                 p = Path(source_dir, name + ext)
diff --git a/test-data/s14_benchmark_suite.py b/test-data/s14_benchmark_suite.py
new file mode 100644
index 0000000..d869baa
--- /dev/null
+++ b/test-data/s14_benchmark_suite.py
@@ -0,0 +1,167 @@
+"""S14: External benchmark suite — 58 telecom billing COBOL programs.
+
+Collects every ``*.cbl`` file one directory level below ROOT, then checks that
+each program parses, classifies and yields generated test data. A PASS/FAIL
+summary is printed at the end and the process exits with status 1 when any
+check failed. Set the S14_ROOT environment variable to point at another copy
+of the benchmark corpus.
+"""
+import json
+import os
+import sys
+import time
+
+sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
+
+from cobol_testgen import extract_structure, generate_data
+from hina.pipeline.pipeline import classify_program
+from hina.classifier import detect_keyword
+
+# The default is the original author's checkout; override it via S14_ROOT.
+ROOT = os.environ.get("S14_ROOT", "D:/cobol-java/cobol-test-programs/")
+
+P = 0      # checks that passed
+F = 0      # checks that failed
+BUGS = []  # findings listed in the summary
+
+
+def ck(v, m=""):
+    """Count a pass when v is truthy; otherwise count a failure and log m."""
+    global P, F
+    if v:
+        P += 1
+    else:
+        F += 1
+        BUGS.append(m)
+
+
+def sec(n):
+    """Print a section banner."""
+    print(f"\n--- {n} ---")
+
+
+def read_source(path):
+    """Return the text of path, decoded as UTF-8 with an optional BOM."""
+    with open(path, encoding="utf-8-sig") as fh:
+        return fh.read()
+
+
+def mentions(cat, word):
+    """Return True when word occurs in str(cat), compared upper- and lower-cased."""
+    text = str(cat)
+    return word.upper() in text.upper() or word.lower() in text.lower()
+
+
+progs = []
+for d in sorted(os.listdir(ROOT)):
+    dp = os.path.join(ROOT, d)
+    if os.path.isdir(dp):
+        progs.extend(
+            os.path.join(dp, f) for f in sorted(os.listdir(dp)) if f.endswith(".cbl")
+        )
+print(f"Total: {len(progs)} programs")
+
+# Read every program once; each entry is (path relative to ROOT, source text).
+sources = [
+    (os.path.relpath(fp, ROOT).replace("\\", "/"), read_source(fp)) for fp in progs
+]
+
+sec("PARSE: Extract structure for all 58 programs")
+parse_fail = 0
+for name, src in sources:
+    try:
+        extract_structure(src)
+    except Exception as e:
+        parse_fail += 1
+        ck(False, f"PARSE: {name} -> {str(e)[:40]}")
+ck(parse_fail == 0, f"Parse: {parse_fail}/{len(progs)} FAIL")
+
+sec("CLASSIFY: Directory name vs classification match")
+# Expected types from directory names. Reference only: the checks below use
+# their own id lists.
+# NOTE(review): "32" and "33" are listed as matching here but are not checked
+# below -- confirm which list is right.
+expected_map = {
+    "matching": ["01","02","03","16","17","18","19","20","22","32","33"],
+    "keybreak": ["07","08","30"],
+    "divide": ["10","11","12"],
+    "validation": ["13","27","31"],
+    "csv": ["15","21"],
+    "select": ["23"],
+    "search": ["24","26"],
+    "subprogram": ["25"],
+    "sort": ["34"],
+    "merge": ["35"],
+    "evaluate": ["06"],
+    "branch": ["05"],
+    "edit": ["04"],
+    "cics": ["14"],
+    "sysin": ["28"],
+    "ascii": ["29"],
+    "pipeline": ["pipeline"],
+}
+MATCHING_IDS = ["01", "02", "03", "16", "17", "18", "19", "20", "22"]
+DIVIDE_IDS = ["10", "11", "12"]
+misclassified = 0
+for name, src in sources:
+    try:
+        cat = classify_program(src).get("category", "?")
+    except Exception:
+        cat = "ERROR"
+    # Directory prefix such as "01" identifies the program family.
+    dir_id = name.split("-")[0] if "-" in name else name[:2]
+    # Matching programs should say マッチング
+    if dir_id in MATCHING_IDS:
+        if not ("マッチング" in str(cat) or "matching" in str(cat).lower()):
+            misclassified += 1
+            # ck() logs the finding, so it is not appended to BUGS a second time.
+            ck(False, f"CLASSIFY: {name} expected matching, got {cat}")
+    # Division programs should say DIVIDE
+    if dir_id in DIVIDE_IDS and not mentions(cat, "divide"):
+        misclassified += 1
+        BUGS.append(f"MISCLASSIFY: {name} (divide) -> {cat}")
+    # Sort programs should say SORT
+    if dir_id == "34" and not mentions(cat, "sort"):
+        misclassified += 1
+        BUGS.append(f"MISCLASSIFY: {name} (sort) -> {cat}")
+
+ck(misclassified <= 10, "Classification mismatch count")
+
+sec("GENERATE: Non-zero data produce")
+zero_data = 0
+max_recs = 0
+max_name = ""
+for name, src in sources:
+    try:
+        count = len(generate_data(src, extract_structure(src)))
+    except Exception:
+        # A program whose generation fails counts as producing no data.
+        zero_data += 1
+        continue
+    if count == 0:
+        zero_data += 1
+    if count > max_recs:
+        max_recs = count
+        max_name = name
+ck(zero_data <= len(progs) * 0.5, f"Generate: {zero_data}/{len(progs)} zero records")
+ck(max_recs < 10000, f"Max records: {max_recs} ({max_name}) - path explosion risk")
+
+sec("PERF: Average performance")
+times = []
+for _, src in sources[:10]:
+    t0 = time.perf_counter()
+    extract_structure(src)
+    times.append(time.perf_counter() - t0)
+# Guard the mean: with no programs found there is nothing to average.
+avg = sum(times) / len(times) if times else 0.0
+ck(avg < 5.0, f"Avg extract time: {avg:.3f}s (max 5s)")
+
+sec("SUMMARY")
+print(f"\n{'='*55}")
+print(f"S14: {P} PASS / {F} FAIL")
+print(f"Bugs found: {len(BUGS)}")
+for b in BUGS:
+    print(f" {b}")
+print(f"{'='*55}")
+if F > 0:
+    sys.exit(1)