fix: Lark grammar + parse_file_section SD/ASCENDING KEY support
Bug fixes found through statement benchmark testing:
1. grammar.lark: add ASCENDING/DESCENDING KEY IS and INDEXED BY to occurs_clause — fixes the HINA024 (SEARCH ALL) parsing crash.
2. grammar.lark: add the SD (Sort Description) entry type to file_section — fixes the HINA034 (SORT), ST01, and ST02 parsing crashes.
3. read.py parse_file_section(): handle SD blocks alongside FD blocks — enables SORT/MERGE file structure extraction.
Four previously crashing files now parse successfully:
- HINA024.cbl (SEARCH ALL): paras=3, files=0
- HINA034.cbl (SORT): paras=1, files=3
- ST01_SORT.cbl: paras=2, files=3
- ST02_MERGE.cbl: paras=1, files=4
Regression: 749 passed (unchanged — classify_program internally caught the crashes, so the tests already 'passed'; the real improvement is in data quality: structure extraction now works for these programs).
This commit is contained in:
@@ -1,7 +1,8 @@
|
|||||||
start: data_div_content
|
start: data_div_content
|
||||||
data_div_content: (file_section | working_storage | linkage)*
|
data_div_content: (file_section | working_storage | linkage)*
|
||||||
file_section: "FILE" "SECTION" DOT fd+
|
file_section: "FILE" "SECTION" DOT (fd | sd)+
|
||||||
fd: "FD" NAME FD_SUFFIX data_item+
|
fd: "FD" NAME FD_SUFFIX data_item+
|
||||||
|
sd: "SD" NAME FD_SUFFIX data_item*
|
||||||
FD_SUFFIX: /(?:"[^"]*"|'[^']*'|[^.])*\./
|
FD_SUFFIX: /(?:"[^"]*"|'[^']*'|[^.])*\./
|
||||||
working_storage: "WORKING-STORAGE" "SECTION" DOT data_item*
|
working_storage: "WORKING-STORAGE" "SECTION" DOT data_item*
|
||||||
linkage: "LINKAGE" "SECTION" DOT data_item*
|
linkage: "LINKAGE" "SECTION" DOT data_item*
|
||||||
@@ -21,7 +22,9 @@ value_literal: INT | SIGNED_NUMBER | STRING | SQSTRING
|
|||||||
| "LOW-VALUE" | "LOW-VALUES"
|
| "LOW-VALUE" | "LOW-VALUES"
|
||||||
SQSTRING: /'[^']*'/
|
SQSTRING: /'[^']*'/
|
||||||
redefines_clause: "REDEFINES" NAME
|
redefines_clause: "REDEFINES" NAME
|
||||||
occurs_clause: "OCCURS" INT "TIMES"? ("DEPENDING" "ON" NAME)?
|
occurs_clause: "OCCURS" INT "TIMES"? ("DEPENDING" "ON" NAME)? key_clause? indexed_clause?
|
||||||
|
key_clause: ("ASCENDING" | "DESCENDING") "KEY" "IS"? NAME (","? NAME)*
|
||||||
|
indexed_clause: "INDEXED" "BY" NAME (","? NAME)*
|
||||||
usage_clause: USAGE_VAL
|
usage_clause: USAGE_VAL
|
||||||
USAGE_VAL: "COMP" | "COMP-3" | "COMP-5" | "BINARY" | "PACKED-DECIMAL" | "DISPLAY"
|
USAGE_VAL: "COMP" | "COMP-3" | "COMP-5" | "BINARY" | "PACKED-DECIMAL" | "DISPLAY"
|
||||||
LEVEL: /0[1-9]|[1-4][0-9]|49|77|88/
|
LEVEL: /0[1-9]|[1-4][0-9]|49|77|88/
|
||||||
|
|||||||
@@ -425,14 +425,15 @@ def parse_file_section(source: str) -> dict:
|
|||||||
return {}
|
return {}
|
||||||
fs = m.group(1)
|
fs = m.group(1)
|
||||||
result = {}
|
result = {}
|
||||||
# Split the section into one block per FD entry
|
# FD 和 SD 条目
|
||||||
fd_blocks = re.split(r'\n\s*(?=FD\s+)', fs.strip())
|
blocks = re.split(r'\n\s*(?=(?:FD|SD)\s+)', fs.strip())
|
||||||
for block in fd_blocks:
|
for block in blocks:
|
||||||
m = re.match(r'FD\s+(\w[\w-]*)', block, re.IGNORECASE)
|
m = re.match(r'(FD|SD)\s+(\w[\w-]*)', block, re.IGNORECASE)
|
||||||
if not m:
|
if not m:
|
||||||
continue
|
continue
|
||||||
name = m.group(1).upper()
|
entry_type = m.group(1).upper() # "FD" or "SD"
|
||||||
# Find the 01-level records
|
name = m.group(2).upper()
|
||||||
|
# 找 01 层记录
|
||||||
recs = re.findall(r'^\s*0{0,1}1\s+(\w[\w-]*)', block, re.MULTILINE)
|
recs = re.findall(r'^\s*0{0,1}1\s+(\w[\w-]*)', block, re.MULTILINE)
|
||||||
result[name] = [r.upper() for r in recs]
|
result[name] = [r.upper() for r in recs]
|
||||||
return result
|
return result
|
||||||
|
|||||||
Reference in New Issue
Block a user