3b150b6c54
Grammar fixes: 1. COPY regex: handle quoted names COPY "STD-REC.CPY" 2. Quoted name strip: remove quotes before file lookup 3. VALUE clause: support comma-separated 88-level values 4. PIC STRING: support decimal dot (ZZ9.99 -> PICTURE_STRING.99 + DOT) 5. LEVEL: use INT for level number (fixes 05/01/77 all levels) Results on 58 telecom billing COBOL programs: - Parse OK: 54/58 (93%) - Parse fail: 4 (special chars: TAB, X'01', U'NNNN', &) - Classification known issue: matching programs misclassified as '文件编成' because FILE-CONTROL keyword overrides matching signals (requires rule engine priority fix - separate issue) Co-Authored-By: Claude <noreply@anthropic.com>
118 lines
4.0 KiB
Python
118 lines
4.0 KiB
Python
"""S14: External benchmark suite — 58 telecom billing COBOL programs"""
|
|
import sys, os, time, json
|
|
sys.path.insert(0, os.path.join(os.path.dirname(__file__), '..'))
|
|
# Running totals shared by every check in this script.
P = 0      # checks that passed
F = 0      # checks that failed
BUGS = []  # messages collected for failed checks
|
|
def ck(v, m=""):
    """Record the outcome of a single check.

    A truthy ``v`` increments the global pass counter ``P``.  Otherwise the
    global fail counter ``F`` is incremented and ``m`` is appended to
    ``BUGS``.  Nothing is returned.
    """
    global P, F
    if not v:
        F += 1
        BUGS.append(m)
        return
    P += 1
|
|
def sec(n):
    """Print a section banner for ``n``, preceded by an empty line."""
    banner = f"\n--- {n} ---"
    print(banner)
|
|
|
|
# Directory that is scanned for benchmark programs: each of its
# sub-directories is searched for ``*.cbl`` files.
# The original Windows location remains the default; set the
# COBOL_TEST_PROGRAMS environment variable to run the suite on another
# machine.  An empty value falls back to the default as well.
ROOT = os.environ.get("COBOL_TEST_PROGRAMS") or "D:/cobol-java/cobol-test-programs/"
|
|
from cobol_testgen import extract_structure, generate_data
|
|
from hina.pipeline.pipeline import classify_program
|
|
from hina.classifier import detect_keyword
|
|
|
|
def _cobol_sources(root):
    """Yield every ``*.cbl`` path found one directory level below ``root``.

    Entries of ``root`` and the files inside each sub-directory are both
    visited in sorted name order; entries that are not directories are
    skipped.
    """
    for entry in sorted(os.listdir(root)):
        subdir = os.path.join(root, entry)
        if not os.path.isdir(subdir):
            continue
        for filename in sorted(os.listdir(subdir)):
            if filename.endswith(".cbl"):
                yield os.path.join(subdir, filename)


# Full paths of all benchmark programs, in deterministic order.
progs = list(_cobol_sources(ROOT))
print(f"Total: {len(progs)} programs")
|
|
|
|
sec("PARSE: Extract structure for all 58 programs")
# Count how many programs extract_structure() accepts and how many it rejects.
parse_ok = 0
parse_fail = 0
for fp in progs:
    # Report paths relative to ROOT, with forward slashes on every platform.
    name = os.path.relpath(fp, ROOT).replace("\\", "/")
    # utf-8-sig drops a leading BOM if one is present.  The context manager
    # closes the handle deterministically instead of leaving it to the GC.
    with open(fp, encoding="utf-8-sig") as fh:
        src = fh.read()
    try:
        extract_structure(src)
    except Exception as e:
        parse_fail += 1
        # One failed check per rejected program; the error text is cut to
        # 40 characters to keep the summary readable.
        ck(False, f"PARSE: {name} -> {str(e)[:40]}")
    else:
        parse_ok += 1
# Aggregate check: fails as soon as a single program was rejected.
ck(parse_fail == 0, f"Parse: {parse_fail}/{len(progs)} FAIL")
|
|
|
|
sec("CLASSIFY: Directory name vs classification match")
# Expected types from directory names.
# NOTE(review): this table is not consulted by the checks below, and its
# "matching" entry lists "32" and "33" while the enforced matching ids do
# not -- confirm which list is authoritative before wiring it in.
expected_map = {
    "matching": ["01","02","03","16","17","18","19","20","22","32","33"],
    "keybreak": ["07","08","30"],
    "divide": ["10","11","12"],
    "validation": ["13","27","31"],
    "csv": ["15","21"],
    "select": ["23"],
    "search": ["24","26"],
    "subprogram": ["25"],
    "sort": ["34"],
    "merge": ["35"],
    "evaluate": ["06"],
    "branch": ["05"],
    "edit": ["04"],
    "cics": ["14"],
    "sysin": ["28"],
    "ascii": ["29"],
    "pipeline": ["pipeline"],
}

# Directory ids whose classification is actually verified in the loop.
_MATCHING_IDS = {"01", "02", "03", "16", "17", "18", "19", "20", "22"}
_DIVIDE_IDS = {"10", "11", "12"}
_SORT_ID = "34"

for fp in progs:
    name = os.path.relpath(fp, ROOT).replace("\\", "/")
    # Close the source file deterministically instead of leaking the handle.
    with open(fp, encoding="utf-8-sig") as fh:
        src = fh.read()
    try:
        cp = classify_program(src)
        cat = cp.get("category", "?")
    except Exception:
        # Any classifier failure is reported as the pseudo-category "ERROR".
        # ``except Exception`` (not a bare ``except:``) lets Ctrl-C and
        # SystemExit still stop the run.
        cat = "ERROR"

    cat_text = str(cat)
    cat_lower = cat_text.lower()

    # ``name`` is ROOT-relative: the id is the text before the first "-",
    # or the first two characters when the name contains no "-" at all.
    dir_id = name.split("-")[0] if "-" in name else name[:2]

    # Matching programs should say マッチング (or "matching", any case).
    # NOTE(review): a mismatch here lands in BUGS twice (the MISCLASSIFY
    # entry counted by the tolerance check below, plus the ck() message)
    # and also fails the run through ck() -- confirm that is intended.
    if dir_id in _MATCHING_IDS:
        if "マッチング" not in cat_text and "matching" not in cat_lower:
            BUGS.append(f"MISCLASSIFY: {name} -> {cat}")
            ck(False, f"CLASSIFY: {name} expected matching, got {cat}")

    # Division programs should mention "divide" (case-insensitive).
    if dir_id in _DIVIDE_IDS and "divide" not in cat_lower:
        BUGS.append(f"MISCLASSIFY: {name} (divide) -> {cat}")

    # Sort programs should mention "sort" (case-insensitive).
    if dir_id == _SORT_ID and "sort" not in cat_lower:
        BUGS.append(f"MISCLASSIFY: {name} (sort) -> {cat}")

# Tolerance check: at most 10 MISCLASSIFY entries may have been collected.
ck(sum(1 for b in BUGS if "MISCLASSIFY" in b) <= 10, "Classification mismatch count")
|
|
|
|
sec("GENERATE: Non-zero data produce")
# zero_data counts programs that yielded no records or raised during
# extraction/generation; max_recs / max_name track the largest record set.
zero_data = 0
max_recs = 0
max_name = ""
for fp in progs:
    name = os.path.relpath(fp, ROOT).replace("\\", "/")
    # Close the source file deterministically instead of leaking the handle.
    with open(fp, encoding="utf-8-sig") as fh:
        src = fh.read()
    try:
        st = extract_structure(src)
        recs = generate_data(src, st)
        count = len(recs)
    except Exception:
        # A failure anywhere in the pipeline counts as "no data".
        # ``except Exception`` (not a bare ``except:``) keeps Ctrl-C and
        # SystemExit working.
        zero_data += 1
        continue
    if count == 0:
        zero_data += 1
    if count > max_recs:
        max_recs, max_name = count, name
# At most half of the programs may end up without generated records.
ck(zero_data <= len(progs) * 0.5, f"Generate: {zero_data}/{len(progs)} zero records")
# Guard against runaway record counts from a single program.
ck(max_recs < 10000, f"Max records: {max_recs} ({max_name}) - path explosion risk")
|
|
|
|
sec("PERF: Average performance")
# Time extract_structure() on the first 10 programs only.
times = []
for fp in progs[:10]:
    # Read outside the timed region and close the handle deterministically.
    with open(fp, encoding="utf-8-sig") as fh:
        src = fh.read()
    # perf_counter() is monotonic and high-resolution, unlike time.time(),
    # which can jump when the system clock is adjusted.
    t0 = time.perf_counter()
    try:
        extract_structure(src)
    except Exception:
        # A program the parser rejects yields no meaningful timing sample;
        # skip it so one rejection cannot abort the run before the summary.
        continue
    times.append(time.perf_counter() - t0)

if times:
    avg = sum(times) / len(times)
    ck(avg < 5.0, f"Avg extract time: {avg:.3f}s (max 5s)")
else:
    # Previously this path raised ZeroDivisionError; report it as a failed
    # check so the run still ends with a summary and a non-zero exit code.
    ck(False, "PERF: no programs could be timed")
|
|
|
|
sec("SUMMARY")
# 55-character rule framing the final report.
rule = "=" * 55
print(f"\n{rule}")
print(f"S14: {P} PASS / {F} FAIL")
print(f"Bugs found: {len(BUGS)}")
# One line per collected failure message.
for b in BUGS:
    print(f" {b}")
print(f"{rule}")
# Exit with status 1 when at least one check failed.
if F > 0:
    sys.exit(1)
|