COBOL layout parser in Python
up vote
3
down vote
favorite
I don't know anything about parsers but I had to write something to read COBOL for a work project. What are some things that I could improve with my Python coding and parser design?
Note: This isn't COBOL feature complete yet, just the things I need for the project.
"""
Parse COBOL copybook files to a Python list for EBCDIC reading
"""
#! /usr/bin/env python
import re
from os import path
# Reserved words the parser knows about. Item names and the values given to
# the REDEFINES and PIC clauses are checked against this set.
# Only the handful I care about really
COBOL_KEYWORDS = {"COMP-3", "PIC", "REDEFINES", "OCCURS", "TIMES", "SIGN", "IS"}
class COBOLParser:
    """
    Parse a COBOL copybook into a nested list of Python dictionaries.

    Takes a file path as an argument. Run the parse method on the returned
    object to convert the copybook to a python readable format.

    Every parsed item is a dictionary with a "Name" and a "Type" of either
    "Group" or "Field". Groups hold their members in a "Fields" list. Fields
    hold the PIC string in "Format" and, when given, the usage in "Usage".
    "Redefines" and "Occurs" are present when the matching clause was used.
    """

    # Level numbers are two digits between 01 and 49. COBOL also allows 66 and
    # 88 with specific rules; they are not supported yet, but if they are in
    # the future use this pattern instead: (?!00)(66|88|[0-4][0-9])
    _LEVEL = re.compile(r"(?!00)[0-4][0-9]")
    # An entry is closed by a period that directly follows a word character
    # or a closing parenthesis and is itself followed by whitespace or the
    # end of the line.
    _LINE_ENDED = re.compile(r"[\w)]\.(\s|$)")

    def __init__(self, file: str):
        """
        Remember which copybook to parse.

        Raises:
            FileNotFoundError: if file is not an existing regular file.
        """
        if not path.isfile(file):
            raise FileNotFoundError(file)
        self.file = file
        # Keeps track of the (name, level) of every group seen so far, in
        # order of appearance. This is not needed in the final data, but is
        # necessary for determining what level to place a new item on after
        # leaving a group, so it is stored separately.
        self.group_levels = []

    def parse(self) -> list:
        """
        Parse and return self.file as a list of python dictionaries.

        Raises:
            InvalidCOBOLError: if a line cannot be parsed or the layout is
                inconsistent (duplicate names in a group, an unended line
                that is not continued, a field with sub entries, or a group
                without any).
        """
        # Start from a clean slate so parse can be called more than once.
        self.group_levels = []
        parse_out = []
        previous_field_level = 0
        previous_group_level = 0
        # Number of the last line that was not a comment or blank, used to
        # point error messages at the line that actually caused them.
        previous_line_number = 0
        unfinished_item = {}
        with open(self.file, "rt") as cobol_file:
            for line_number, line in enumerate(cobol_file, start=1):
                stripped = line.strip()
                # Skip comment lines and empty lines.
                if not stripped or stripped.startswith("*"):
                    continue
                parse_out = self.parse_line(parse_out, line, line_number)
                newest_item = self._lowest_dict(parse_out)
                if unfinished_item:
                    # The previous line had no closing period, so this line
                    # has to continue that very item, not start a new one.
                    if newest_item is not unfinished_item:
                        raise InvalidCOBOLError(
                            previous_line_number,
                            "Unended line was not continued.",
                        )
                else:
                    first_token = stripped.split()[0]
                    if not self._LEVEL.fullmatch(first_token):
                        # A continuation line that follows a closed entry.
                        raise InvalidCOBOLError(
                            line_number,
                            "Input does not resemble COBOL at line {}",
                        )
                    level = int(first_token)
                    # Check for duplicate names among the new item's siblings.
                    siblings = self._lowest_siblings(parse_out)
                    if newest_item["Name"] != "FILLER" and any(
                        item["Name"] == newest_item["Name"]
                        for item in siblings[:-1]
                    ):
                        raise InvalidCOBOLError(
                            line_number, "Duplicate names in a group."
                        )
                    # A field cannot be followed by an item on a deeper level.
                    if previous_field_level and level > previous_field_level:
                        raise InvalidCOBOLError(
                            line_number, "Field has sub entries at line {}"
                        )
                    # The item after a group has to be a member of that group.
                    if previous_group_level and level <= previous_group_level:
                        raise InvalidCOBOLError(
                            previous_line_number,
                            "Group has no sub elements at line {}.",
                        )
                    # Save information about this entry for the checks on the
                    # next one.
                    if newest_item["Type"] == "Field":
                        previous_field_level = level
                        previous_group_level = 0
                    else:
                        previous_group_level = level
                        previous_field_level = 0
                # An entry without a closing period continues on the next line.
                if self._LINE_ENDED.search(line):
                    unfinished_item = {}
                else:
                    unfinished_item = newest_item
                previous_line_number = line_number
        return parse_out

    def parse_line(
        self, out_builder: list, line: str, line_number: int
    ) -> list:
        """
        Parses a COBOL line and creates a new item in the output builder. If
        the line is a continuation of a previous line, just add the new info
        to the previously added entry.

        Returns out_builder, which is modified in place.

        Raises:
            InvalidCOBOLError: if the line is neither a valid new entry nor a
                valid continuation of the previous one.
        """
        last_element = self._lowest_dict(out_builder) if out_builder else {}
        items, line_ended = self._split_items(line)
        if not items:
            raise InvalidCOBOLError(
                line_number, "Input does not resemble COBOL at line {}"
            )
        # Anything that does not start with a level number can only be a
        # continuation of the previous entry.
        if not self._LEVEL.fullmatch(items[0]):
            self._continue_item(last_element, items, line_number)
            return out_builder
        if len(items) < 2:
            # A level number without a name.
            raise InvalidCOBOLError(
                line_number, "Input does not resemble COBOL at line {}"
            )
        # At this point we know this is a new field or group.
        current_level = int(items[0])
        new_item = {"Name": items[1]}
        # Look the parent list up now: registering a new group below changes
        # the answer.
        current_group = self._item_level(out_builder, current_level)
        if new_item["Name"] in COBOL_KEYWORDS:
            raise InvalidCOBOLError(
                line_number,
                "Field or group name at line {} matches a COBOL keyword",
            )
        clause_error = InvalidCOBOLError(
            line_number, "A clause was declared but no definition was given."
        )
        if "REDEFINES" in items:
            try:
                new_item["Redefines"] = self._clause_value(items, "REDEFINES")
            except IndexError:
                raise clause_error from None
            if new_item["Redefines"] in COBOL_KEYWORDS:
                raise clause_error
        if "OCCURS" in items:
            occurs_index = items.index("OCCURS")
            # Only the fixed "OCCURS n TIMES" form is supported.
            if items[occurs_index + 2 : occurs_index + 3] != ["TIMES"]:
                raise clause_error
            try:
                new_item["Occurs"] = int(items[occurs_index + 1])
            except ValueError:
                raise InvalidCOBOLError(
                    line_number,
                    "Occurs clause must specify an integer value at line {}.",
                ) from None
        if "PIC" not in items and line_ended:
            # A closed entry without a picture is a group. Remember its level
            # so later items can be placed relative to it.
            self.group_levels.append((new_item["Name"], current_level))
            new_item["Type"] = "Group"
            new_item["Fields"] = []
        else:
            # Item is a field; its PIC may still follow on the next line.
            new_item["Type"] = "Field"
            if "PIC" in items:
                self._read_picture(new_item, items, line_number)
        current_group.append(new_item)
        return out_builder

    @staticmethod
    def _split_items(line: str) -> tuple:
        """
        Splits a line into its items and reports whether the entry ended.

        Returns (items, line_ended). The closing period is stripped from the
        item that carries it and everything after that item is dropped.
        """
        items = line.split()
        for index, item in enumerate(items):
            if item.endswith("."):
                # Stop at the first period; later items are not part of the
                # entry and must not influence the result.
                return items[:index] + [item[:-1]], True
        return items, False

    def _continue_item(
        self, last_element: dict, items: list, line_number: int
    ) -> None:
        """
        Adds the clauses of a continuation line to the previous field. As of
        right now only PIC and usage are allowed to continue onto another
        line, both of which only exist on fields.
        """
        is_field = bool(last_element) and last_element.get("Type") == "Field"
        if is_field and items[0] == "PIC" and "Format" not in last_element:
            self._read_picture(last_element, items, line_number)
        elif (
            is_field
            and "Usage" not in last_element
            and self._valid_usage(items[0])
        ):
            last_element["Usage"] = items[0]
        else:
            raise InvalidCOBOLError(
                line_number, "Input does not resemble COBOL at line {}"
            )

    def _read_picture(self, item: dict, items: list, line_number: int) -> None:
        """
        Stores the PIC string from items in item["Format"] and, if an item
        follows the PIC string, stores it as item["Usage"].

        Raises:
            InvalidCOBOLError: if PIC has no value or the usage is not one
                the reader supports.
        """
        pic_index = items.index("PIC")
        # PIC is always two items long; the usage, if any, comes right after.
        if len(items) <= pic_index + 1:
            raise InvalidCOBOLError(
                line_number,
                "A clause was declared but no definition was given.",
            )
        picture = items[pic_index + 1]
        if picture in COBOL_KEYWORDS:
            raise InvalidCOBOLError(
                line_number,
                "A clause was declared but no definition was given.",
            )
        item["Format"] = picture
        if len(items) > pic_index + 2:
            usage = items[pic_index + 2]
            if not self._valid_usage(usage):
                raise InvalidCOBOLError(
                    line_number,
                    (
                        "Usage clause does not match an existing "
                        "definition at line {}"
                    ),
                )
            item["Usage"] = usage

    def _item_level(self, struct: list, current_level: int) -> list:
        """
        Returns the list an item with the given level should be appended to.
        """
        if not struct or not self.group_levels:
            return struct
        # Deeper than the most recent group: the item belongs to the group
        # that is currently open.
        if current_level > self.group_levels[-1][1]:
            return self._lowest_list(struct)
        # Otherwise use the most recent group that has a lower level than the
        # current item, so check the groups in reverse.
        for name, level in reversed(self.group_levels):
            if level < current_level:
                return self._lowest_list(struct, name)
        return struct

    def _lowest_siblings(self, struct: list) -> list:
        """
        Returns the list whose last element is the deepest dictionary at the
        bottom of provided structure.
        """
        while struct[-1].get("Fields"):
            struct = struct[-1]["Fields"]
        return struct

    def _lowest_dict(self, struct: list) -> dict:
        """
        Returns the deepest dictionary at the bottom of provided structure.
        """
        return self._lowest_siblings(struct)[-1]

    def _lowest_list(self, struct: list, name: str = None) -> list:
        """
        Returns the deepest list at the bottom of provided structure. If a
        name parameter is provided, stop searching and return the list of the
        group with the matching name.
        """
        while struct:
            last_element = struct[-1]
            if "Fields" not in last_element:
                break
            struct = last_element["Fields"]
            # Keep comparing names on every level, otherwise a named group
            # below the top level would never be found.
            if name and last_element["Name"] == name:
                break
        return struct

    @staticmethod
    def _clause_value(items: list, clause: str) -> str:
        """
        Returns the item from a list of items following the provided clause.

        Raises IndexError if the clause is the last item.
        """
        return items[items.index(clause) + 1]

    @staticmethod
    def _valid_usage(usage: str) -> bool:
        """
        Returns bool indicating whether provided usage is valid or not.
        """
        # Not really an indication of valid usages as much as a list of what
        # usages the EBCDIC reader we use supports.
        valid_usages = ["COMP-3"]
        return usage in valid_usages
class InvalidCOBOLError(Exception):
    """
    Produces an error message with a line number showing which line of code
    contains the invalid COBOL. The msg parameter should contain a set of
    empty curly braces ("{}") marking where the line number goes; if it does
    not, " (Line {})." is appended to the end of the message.

    The line number is also available as the line attribute.
    """

    def __init__(self, line, msg=None):
        if msg is None:
            # Try to not let this happen
            msg = (
                "There was an unspecified error while parsing the COBOL at "
                "line {}. Please contact a developer for assistance."
            )
        elif "{}" not in msg:
            msg = msg + " (Line {})."
        super().__init__(msg.format(line))
        # Keep the raw number so callers do not have to parse the message.
        self.line = line
python parsing cobol
New contributor
Manderton is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
Check out our Code of Conduct.
add a comment |
up vote
3
down vote
favorite
I don't know anything about parsers but I had to write something to read COBOL for a work project. What are some things that I could improve with my Python coding and parser design?
Note: This isn't COBOL feature complete yet, just the things I need for the project.
"""
Parse COBOL copybook files to a Python list for EBCDIC reading
"""
#! /usr/bin/env python
import re
from os import path
# Reserved words the parser knows about. Item names and the values given to
# the REDEFINES and PIC clauses are checked against this set.
# Only the handful I care about really
COBOL_KEYWORDS = {"COMP-3", "PIC", "REDEFINES", "OCCURS", "TIMES", "SIGN", "IS"}
class COBOLParser:
    """
    Parse a COBOL copybook into a nested list of Python dictionaries.

    Takes a file path as an argument. Run the parse method on the returned
    object to convert the copybook to a python readable format.

    Every parsed item is a dictionary with a "Name" and a "Type" of either
    "Group" or "Field". Groups hold their members in a "Fields" list. Fields
    hold the PIC string in "Format" and, when given, the usage in "Usage".
    "Redefines" and "Occurs" are present when the matching clause was used.
    """

    # Level numbers are two digits between 01 and 49. COBOL also allows 66 and
    # 88 with specific rules; they are not supported yet, but if they are in
    # the future use this pattern instead: (?!00)(66|88|[0-4][0-9])
    _LEVEL = re.compile(r"(?!00)[0-4][0-9]")
    # An entry is closed by a period that directly follows a word character
    # or a closing parenthesis and is itself followed by whitespace or the
    # end of the line.
    _LINE_ENDED = re.compile(r"[\w)]\.(\s|$)")

    def __init__(self, file: str):
        """
        Remember which copybook to parse.

        Raises:
            FileNotFoundError: if file is not an existing regular file.
        """
        if not path.isfile(file):
            raise FileNotFoundError(file)
        self.file = file
        # Keeps track of the (name, level) of every group seen so far, in
        # order of appearance. This is not needed in the final data, but is
        # necessary for determining what level to place a new item on after
        # leaving a group, so it is stored separately.
        self.group_levels = []

    def parse(self) -> list:
        """
        Parse and return self.file as a list of python dictionaries.

        Raises:
            InvalidCOBOLError: if a line cannot be parsed or the layout is
                inconsistent (duplicate names in a group, an unended line
                that is not continued, a field with sub entries, or a group
                without any).
        """
        # Start from a clean slate so parse can be called more than once.
        self.group_levels = []
        parse_out = []
        previous_field_level = 0
        previous_group_level = 0
        # Number of the last line that was not a comment or blank, used to
        # point error messages at the line that actually caused them.
        previous_line_number = 0
        unfinished_item = {}
        with open(self.file, "rt") as cobol_file:
            for line_number, line in enumerate(cobol_file, start=1):
                stripped = line.strip()
                # Skip comment lines and empty lines.
                if not stripped or stripped.startswith("*"):
                    continue
                parse_out = self.parse_line(parse_out, line, line_number)
                newest_item = self._lowest_dict(parse_out)
                if unfinished_item:
                    # The previous line had no closing period, so this line
                    # has to continue that very item, not start a new one.
                    if newest_item is not unfinished_item:
                        raise InvalidCOBOLError(
                            previous_line_number,
                            "Unended line was not continued.",
                        )
                else:
                    first_token = stripped.split()[0]
                    if not self._LEVEL.fullmatch(first_token):
                        # A continuation line that follows a closed entry.
                        raise InvalidCOBOLError(
                            line_number,
                            "Input does not resemble COBOL at line {}",
                        )
                    level = int(first_token)
                    # Check for duplicate names among the new item's siblings.
                    siblings = self._lowest_siblings(parse_out)
                    if newest_item["Name"] != "FILLER" and any(
                        item["Name"] == newest_item["Name"]
                        for item in siblings[:-1]
                    ):
                        raise InvalidCOBOLError(
                            line_number, "Duplicate names in a group."
                        )
                    # A field cannot be followed by an item on a deeper level.
                    if previous_field_level and level > previous_field_level:
                        raise InvalidCOBOLError(
                            line_number, "Field has sub entries at line {}"
                        )
                    # The item after a group has to be a member of that group.
                    if previous_group_level and level <= previous_group_level:
                        raise InvalidCOBOLError(
                            previous_line_number,
                            "Group has no sub elements at line {}.",
                        )
                    # Save information about this entry for the checks on the
                    # next one.
                    if newest_item["Type"] == "Field":
                        previous_field_level = level
                        previous_group_level = 0
                    else:
                        previous_group_level = level
                        previous_field_level = 0
                # An entry without a closing period continues on the next line.
                if self._LINE_ENDED.search(line):
                    unfinished_item = {}
                else:
                    unfinished_item = newest_item
                previous_line_number = line_number
        return parse_out

    def parse_line(
        self, out_builder: list, line: str, line_number: int
    ) -> list:
        """
        Parses a COBOL line and creates a new item in the output builder. If
        the line is a continuation of a previous line, just add the new info
        to the previously added entry.

        Returns out_builder, which is modified in place.

        Raises:
            InvalidCOBOLError: if the line is neither a valid new entry nor a
                valid continuation of the previous one.
        """
        last_element = self._lowest_dict(out_builder) if out_builder else {}
        items, line_ended = self._split_items(line)
        if not items:
            raise InvalidCOBOLError(
                line_number, "Input does not resemble COBOL at line {}"
            )
        # Anything that does not start with a level number can only be a
        # continuation of the previous entry.
        if not self._LEVEL.fullmatch(items[0]):
            self._continue_item(last_element, items, line_number)
            return out_builder
        if len(items) < 2:
            # A level number without a name.
            raise InvalidCOBOLError(
                line_number, "Input does not resemble COBOL at line {}"
            )
        # At this point we know this is a new field or group.
        current_level = int(items[0])
        new_item = {"Name": items[1]}
        # Look the parent list up now: registering a new group below changes
        # the answer.
        current_group = self._item_level(out_builder, current_level)
        if new_item["Name"] in COBOL_KEYWORDS:
            raise InvalidCOBOLError(
                line_number,
                "Field or group name at line {} matches a COBOL keyword",
            )
        clause_error = InvalidCOBOLError(
            line_number, "A clause was declared but no definition was given."
        )
        if "REDEFINES" in items:
            try:
                new_item["Redefines"] = self._clause_value(items, "REDEFINES")
            except IndexError:
                raise clause_error from None
            if new_item["Redefines"] in COBOL_KEYWORDS:
                raise clause_error
        if "OCCURS" in items:
            occurs_index = items.index("OCCURS")
            # Only the fixed "OCCURS n TIMES" form is supported.
            if items[occurs_index + 2 : occurs_index + 3] != ["TIMES"]:
                raise clause_error
            try:
                new_item["Occurs"] = int(items[occurs_index + 1])
            except ValueError:
                raise InvalidCOBOLError(
                    line_number,
                    "Occurs clause must specify an integer value at line {}.",
                ) from None
        if "PIC" not in items and line_ended:
            # A closed entry without a picture is a group. Remember its level
            # so later items can be placed relative to it.
            self.group_levels.append((new_item["Name"], current_level))
            new_item["Type"] = "Group"
            new_item["Fields"] = []
        else:
            # Item is a field; its PIC may still follow on the next line.
            new_item["Type"] = "Field"
            if "PIC" in items:
                self._read_picture(new_item, items, line_number)
        current_group.append(new_item)
        return out_builder

    @staticmethod
    def _split_items(line: str) -> tuple:
        """
        Splits a line into its items and reports whether the entry ended.

        Returns (items, line_ended). The closing period is stripped from the
        item that carries it and everything after that item is dropped.
        """
        items = line.split()
        for index, item in enumerate(items):
            if item.endswith("."):
                # Stop at the first period; later items are not part of the
                # entry and must not influence the result.
                return items[:index] + [item[:-1]], True
        return items, False

    def _continue_item(
        self, last_element: dict, items: list, line_number: int
    ) -> None:
        """
        Adds the clauses of a continuation line to the previous field. As of
        right now only PIC and usage are allowed to continue onto another
        line, both of which only exist on fields.
        """
        is_field = bool(last_element) and last_element.get("Type") == "Field"
        if is_field and items[0] == "PIC" and "Format" not in last_element:
            self._read_picture(last_element, items, line_number)
        elif (
            is_field
            and "Usage" not in last_element
            and self._valid_usage(items[0])
        ):
            last_element["Usage"] = items[0]
        else:
            raise InvalidCOBOLError(
                line_number, "Input does not resemble COBOL at line {}"
            )

    def _read_picture(self, item: dict, items: list, line_number: int) -> None:
        """
        Stores the PIC string from items in item["Format"] and, if an item
        follows the PIC string, stores it as item["Usage"].

        Raises:
            InvalidCOBOLError: if PIC has no value or the usage is not one
                the reader supports.
        """
        pic_index = items.index("PIC")
        # PIC is always two items long; the usage, if any, comes right after.
        if len(items) <= pic_index + 1:
            raise InvalidCOBOLError(
                line_number,
                "A clause was declared but no definition was given.",
            )
        picture = items[pic_index + 1]
        if picture in COBOL_KEYWORDS:
            raise InvalidCOBOLError(
                line_number,
                "A clause was declared but no definition was given.",
            )
        item["Format"] = picture
        if len(items) > pic_index + 2:
            usage = items[pic_index + 2]
            if not self._valid_usage(usage):
                raise InvalidCOBOLError(
                    line_number,
                    (
                        "Usage clause does not match an existing "
                        "definition at line {}"
                    ),
                )
            item["Usage"] = usage

    def _item_level(self, struct: list, current_level: int) -> list:
        """
        Returns the list an item with the given level should be appended to.
        """
        if not struct or not self.group_levels:
            return struct
        # Deeper than the most recent group: the item belongs to the group
        # that is currently open.
        if current_level > self.group_levels[-1][1]:
            return self._lowest_list(struct)
        # Otherwise use the most recent group that has a lower level than the
        # current item, so check the groups in reverse.
        for name, level in reversed(self.group_levels):
            if level < current_level:
                return self._lowest_list(struct, name)
        return struct

    def _lowest_siblings(self, struct: list) -> list:
        """
        Returns the list whose last element is the deepest dictionary at the
        bottom of provided structure.
        """
        while struct[-1].get("Fields"):
            struct = struct[-1]["Fields"]
        return struct

    def _lowest_dict(self, struct: list) -> dict:
        """
        Returns the deepest dictionary at the bottom of provided structure.
        """
        return self._lowest_siblings(struct)[-1]

    def _lowest_list(self, struct: list, name: str = None) -> list:
        """
        Returns the deepest list at the bottom of provided structure. If a
        name parameter is provided, stop searching and return the list of the
        group with the matching name.
        """
        while struct:
            last_element = struct[-1]
            if "Fields" not in last_element:
                break
            struct = last_element["Fields"]
            # Keep comparing names on every level, otherwise a named group
            # below the top level would never be found.
            if name and last_element["Name"] == name:
                break
        return struct

    @staticmethod
    def _clause_value(items: list, clause: str) -> str:
        """
        Returns the item from a list of items following the provided clause.

        Raises IndexError if the clause is the last item.
        """
        return items[items.index(clause) + 1]

    @staticmethod
    def _valid_usage(usage: str) -> bool:
        """
        Returns bool indicating whether provided usage is valid or not.
        """
        # Not really an indication of valid usages as much as a list of what
        # usages the EBCDIC reader we use supports.
        valid_usages = ["COMP-3"]
        return usage in valid_usages
class InvalidCOBOLError(Exception):
    """
    Produces an error message with a line number showing which line of code
    contains the invalid COBOL. The msg parameter should contain a set of
    empty curly braces ("{}") marking where the line number goes; if it does
    not, " (Line {})." is appended to the end of the message.

    The line number is also available as the line attribute.
    """

    def __init__(self, line, msg=None):
        if msg is None:
            # Try to not let this happen
            msg = (
                "There was an unspecified error while parsing the COBOL at "
                "line {}. Please contact a developer for assistance."
            )
        elif "{}" not in msg:
            msg = msg + " (Line {})."
        super().__init__(msg.format(line))
        # Keep the raw number so callers do not have to parse the message.
        self.line = line
python parsing cobol
New contributor
Manderton is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
Check out our Code of Conduct.
1
An alternative approach is to look at cb2xml (sourceforge.net/projects/cb2xml); it will convert the COBOL to XML. cb2xml is written in Java, but there is an example of reading the XML in Python: cb2xml. Alternatively, cb2xml is written using SableCC; SableCC can generate Python as well as Java.
– Bruce Martin
2 days ago
There is also stingray (sourceforge.net/projects/stingrayreader).
– Bruce Martin
2 days ago
1
There are a lot of quirks in COBOL; the existing parsers will deal with many of them. Take comp-3: it can also be written as computational-3, and it can be specified with or without usage, at either field or group level.
– Bruce Martin
2 days ago
add a comment |
up vote
3
down vote
favorite
up vote
3
down vote
favorite
I don't know anything about parsers but I had to write something to read COBOL for a work project. What are some things that I could improve with my Python coding and parser design?
Note: This isn't COBOL feature complete yet, just the things I need for the project.
"""
Parse COBOL copybook files to a Python list for EBCDIC reading
"""
#! /usr/bin/env python
import re
from os import path
# Reserved words the parser knows about. Item names and the values given to
# the REDEFINES and PIC clauses are checked against this set.
# Only the handful I care about really
COBOL_KEYWORDS = {"COMP-3", "PIC", "REDEFINES", "OCCURS", "TIMES", "SIGN", "IS"}
class COBOLParser:
    """
    Parse a COBOL copybook into a nested list of Python dictionaries.

    Takes a file path as an argument. Run the parse method on the returned
    object to convert the copybook to a python readable format.

    Every parsed item is a dictionary with a "Name" and a "Type" of either
    "Group" or "Field". Groups hold their members in a "Fields" list. Fields
    hold the PIC string in "Format" and, when given, the usage in "Usage".
    "Redefines" and "Occurs" are present when the matching clause was used.
    """

    # Level numbers are two digits between 01 and 49. COBOL also allows 66 and
    # 88 with specific rules; they are not supported yet, but if they are in
    # the future use this pattern instead: (?!00)(66|88|[0-4][0-9])
    _LEVEL = re.compile(r"(?!00)[0-4][0-9]")
    # An entry is closed by a period that directly follows a word character
    # or a closing parenthesis and is itself followed by whitespace or the
    # end of the line.
    _LINE_ENDED = re.compile(r"[\w)]\.(\s|$)")

    def __init__(self, file: str):
        """
        Remember which copybook to parse.

        Raises:
            FileNotFoundError: if file is not an existing regular file.
        """
        if not path.isfile(file):
            raise FileNotFoundError(file)
        self.file = file
        # Keeps track of the (name, level) of every group seen so far, in
        # order of appearance. This is not needed in the final data, but is
        # necessary for determining what level to place a new item on after
        # leaving a group, so it is stored separately.
        self.group_levels = []

    def parse(self) -> list:
        """
        Parse and return self.file as a list of python dictionaries.

        Raises:
            InvalidCOBOLError: if a line cannot be parsed or the layout is
                inconsistent (duplicate names in a group, an unended line
                that is not continued, a field with sub entries, or a group
                without any).
        """
        # Start from a clean slate so parse can be called more than once.
        self.group_levels = []
        parse_out = []
        previous_field_level = 0
        previous_group_level = 0
        # Number of the last line that was not a comment or blank, used to
        # point error messages at the line that actually caused them.
        previous_line_number = 0
        unfinished_item = {}
        with open(self.file, "rt") as cobol_file:
            for line_number, line in enumerate(cobol_file, start=1):
                stripped = line.strip()
                # Skip comment lines and empty lines.
                if not stripped or stripped.startswith("*"):
                    continue
                parse_out = self.parse_line(parse_out, line, line_number)
                newest_item = self._lowest_dict(parse_out)
                if unfinished_item:
                    # The previous line had no closing period, so this line
                    # has to continue that very item, not start a new one.
                    if newest_item is not unfinished_item:
                        raise InvalidCOBOLError(
                            previous_line_number,
                            "Unended line was not continued.",
                        )
                else:
                    first_token = stripped.split()[0]
                    if not self._LEVEL.fullmatch(first_token):
                        # A continuation line that follows a closed entry.
                        raise InvalidCOBOLError(
                            line_number,
                            "Input does not resemble COBOL at line {}",
                        )
                    level = int(first_token)
                    # Check for duplicate names among the new item's siblings.
                    siblings = self._lowest_siblings(parse_out)
                    if newest_item["Name"] != "FILLER" and any(
                        item["Name"] == newest_item["Name"]
                        for item in siblings[:-1]
                    ):
                        raise InvalidCOBOLError(
                            line_number, "Duplicate names in a group."
                        )
                    # A field cannot be followed by an item on a deeper level.
                    if previous_field_level and level > previous_field_level:
                        raise InvalidCOBOLError(
                            line_number, "Field has sub entries at line {}"
                        )
                    # The item after a group has to be a member of that group.
                    if previous_group_level and level <= previous_group_level:
                        raise InvalidCOBOLError(
                            previous_line_number,
                            "Group has no sub elements at line {}.",
                        )
                    # Save information about this entry for the checks on the
                    # next one.
                    if newest_item["Type"] == "Field":
                        previous_field_level = level
                        previous_group_level = 0
                    else:
                        previous_group_level = level
                        previous_field_level = 0
                # An entry without a closing period continues on the next line.
                if self._LINE_ENDED.search(line):
                    unfinished_item = {}
                else:
                    unfinished_item = newest_item
                previous_line_number = line_number
        return parse_out

    def parse_line(
        self, out_builder: list, line: str, line_number: int
    ) -> list:
        """
        Parses a COBOL line and creates a new item in the output builder. If
        the line is a continuation of a previous line, just add the new info
        to the previously added entry.

        Returns out_builder, which is modified in place.

        Raises:
            InvalidCOBOLError: if the line is neither a valid new entry nor a
                valid continuation of the previous one.
        """
        last_element = self._lowest_dict(out_builder) if out_builder else {}
        items, line_ended = self._split_items(line)
        if not items:
            raise InvalidCOBOLError(
                line_number, "Input does not resemble COBOL at line {}"
            )
        # Anything that does not start with a level number can only be a
        # continuation of the previous entry.
        if not self._LEVEL.fullmatch(items[0]):
            self._continue_item(last_element, items, line_number)
            return out_builder
        if len(items) < 2:
            # A level number without a name.
            raise InvalidCOBOLError(
                line_number, "Input does not resemble COBOL at line {}"
            )
        # At this point we know this is a new field or group.
        current_level = int(items[0])
        new_item = {"Name": items[1]}
        # Look the parent list up now: registering a new group below changes
        # the answer.
        current_group = self._item_level(out_builder, current_level)
        if new_item["Name"] in COBOL_KEYWORDS:
            raise InvalidCOBOLError(
                line_number,
                "Field or group name at line {} matches a COBOL keyword",
            )
        clause_error = InvalidCOBOLError(
            line_number, "A clause was declared but no definition was given."
        )
        if "REDEFINES" in items:
            try:
                new_item["Redefines"] = self._clause_value(items, "REDEFINES")
            except IndexError:
                raise clause_error from None
            if new_item["Redefines"] in COBOL_KEYWORDS:
                raise clause_error
        if "OCCURS" in items:
            occurs_index = items.index("OCCURS")
            # Only the fixed "OCCURS n TIMES" form is supported.
            if items[occurs_index + 2 : occurs_index + 3] != ["TIMES"]:
                raise clause_error
            try:
                new_item["Occurs"] = int(items[occurs_index + 1])
            except ValueError:
                raise InvalidCOBOLError(
                    line_number,
                    "Occurs clause must specify an integer value at line {}.",
                ) from None
        if "PIC" not in items and line_ended:
            # A closed entry without a picture is a group. Remember its level
            # so later items can be placed relative to it.
            self.group_levels.append((new_item["Name"], current_level))
            new_item["Type"] = "Group"
            new_item["Fields"] = []
        else:
            # Item is a field; its PIC may still follow on the next line.
            new_item["Type"] = "Field"
            if "PIC" in items:
                self._read_picture(new_item, items, line_number)
        current_group.append(new_item)
        return out_builder

    @staticmethod
    def _split_items(line: str) -> tuple:
        """
        Splits a line into its items and reports whether the entry ended.

        Returns (items, line_ended). The closing period is stripped from the
        item that carries it and everything after that item is dropped.
        """
        items = line.split()
        for index, item in enumerate(items):
            if item.endswith("."):
                # Stop at the first period; later items are not part of the
                # entry and must not influence the result.
                return items[:index] + [item[:-1]], True
        return items, False

    def _continue_item(
        self, last_element: dict, items: list, line_number: int
    ) -> None:
        """
        Adds the clauses of a continuation line to the previous field. As of
        right now only PIC and usage are allowed to continue onto another
        line, both of which only exist on fields.
        """
        is_field = bool(last_element) and last_element.get("Type") == "Field"
        if is_field and items[0] == "PIC" and "Format" not in last_element:
            self._read_picture(last_element, items, line_number)
        elif (
            is_field
            and "Usage" not in last_element
            and self._valid_usage(items[0])
        ):
            last_element["Usage"] = items[0]
        else:
            raise InvalidCOBOLError(
                line_number, "Input does not resemble COBOL at line {}"
            )

    def _read_picture(self, item: dict, items: list, line_number: int) -> None:
        """
        Stores the PIC string from items in item["Format"] and, if an item
        follows the PIC string, stores it as item["Usage"].

        Raises:
            InvalidCOBOLError: if PIC has no value or the usage is not one
                the reader supports.
        """
        pic_index = items.index("PIC")
        # PIC is always two items long; the usage, if any, comes right after.
        if len(items) <= pic_index + 1:
            raise InvalidCOBOLError(
                line_number,
                "A clause was declared but no definition was given.",
            )
        picture = items[pic_index + 1]
        if picture in COBOL_KEYWORDS:
            raise InvalidCOBOLError(
                line_number,
                "A clause was declared but no definition was given.",
            )
        item["Format"] = picture
        if len(items) > pic_index + 2:
            usage = items[pic_index + 2]
            if not self._valid_usage(usage):
                raise InvalidCOBOLError(
                    line_number,
                    (
                        "Usage clause does not match an existing "
                        "definition at line {}"
                    ),
                )
            item["Usage"] = usage

    def _item_level(self, struct: list, current_level: int) -> list:
        """
        Returns the list an item with the given level should be appended to.
        """
        if not struct or not self.group_levels:
            return struct
        # Deeper than the most recent group: the item belongs to the group
        # that is currently open.
        if current_level > self.group_levels[-1][1]:
            return self._lowest_list(struct)
        # Otherwise use the most recent group that has a lower level than the
        # current item, so check the groups in reverse.
        for name, level in reversed(self.group_levels):
            if level < current_level:
                return self._lowest_list(struct, name)
        return struct

    def _lowest_siblings(self, struct: list) -> list:
        """
        Returns the list whose last element is the deepest dictionary at the
        bottom of provided structure.
        """
        while struct[-1].get("Fields"):
            struct = struct[-1]["Fields"]
        return struct

    def _lowest_dict(self, struct: list) -> dict:
        """
        Returns the deepest dictionary at the bottom of provided structure.
        """
        return self._lowest_siblings(struct)[-1]

    def _lowest_list(self, struct: list, name: str = None) -> list:
        """
        Returns the deepest list at the bottom of provided structure. If a
        name parameter is provided, stop searching and return the list of the
        group with the matching name.
        """
        while struct:
            last_element = struct[-1]
            if "Fields" not in last_element:
                break
            struct = last_element["Fields"]
            # Keep comparing names on every level, otherwise a named group
            # below the top level would never be found.
            if name and last_element["Name"] == name:
                break
        return struct

    @staticmethod
    def _clause_value(items: list, clause: str) -> str:
        """
        Returns the item from a list of items following the provided clause.

        Raises IndexError if the clause is the last item.
        """
        return items[items.index(clause) + 1]

    @staticmethod
    def _valid_usage(usage: str) -> bool:
        """
        Returns bool indicating whether provided usage is valid or not.
        """
        # Not really an indication of valid usages as much as a list of what
        # usages the EBCDIC reader we use supports.
        valid_usages = ["COMP-3"]
        return usage in valid_usages
class InvalidCOBOLError(Exception):
    """
    Produces an error message with a line number showing which line of code
    contains the invalid COBOL. The msg parameter should contain a set of
    empty curly braces ("{}") marking where the line number goes; if it does
    not, " (Line {})." is appended to the end of the message.

    The line number is also available as the line attribute.
    """

    def __init__(self, line, msg=None):
        if msg is None:
            # Try to not let this happen
            msg = (
                "There was an unspecified error while parsing the COBOL at "
                "line {}. Please contact a developer for assistance."
            )
        elif "{}" not in msg:
            msg = msg + " (Line {})."
        super().__init__(msg.format(line))
        # Keep the raw number so callers do not have to parse the message.
        self.line = line
python parsing cobol
New contributor
Manderton is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
Check out our Code of Conduct.
I don't know anything about parsers but I had to write something to read COBOL for a work project. What are some things that I could improve with my Python coding and parser design?
Note: This isn't COBOL feature complete yet, just the things I need for the project.
"""
Parse COBOL copybook files to a Python list for EBCDIC reading
"""
#! /usr/bin/env python
import re
from os import path
# Reserved words the parser knows about. Item names and the values given to
# the REDEFINES and PIC clauses are checked against this set.
# Only the handful I care about really
COBOL_KEYWORDS = {"COMP-3", "PIC", "REDEFINES", "OCCURS", "TIMES", "SIGN", "IS"}
class COBOLParser:
"""
Takes a file path as an argument. Run the parse method on returned object
to convert to a python readable format.
"""
def __init__(self, file: str):
if not path.isfile(file):
raise FileNotFoundError
self.file = file
# This here object keeps track of what level ID groups are in in. This
# is not needed in the final data, but is nescessary for determining
# what level to place a new item on after leaving a group so it is
# stored seperately
self.group_levels =
def parse(self) -> list:
"""
Parse and return self.file as a list of python dictionaries
"""
parse_out =
with open(self.file, "rt") as cobol_file:
line_number = 0
previous_field_level = 0
previous_group_level = 0
unfinished_item = {}
for line in cobol_file:
line_number += 1
# skip comment lines and empty lines
if not line.strip() or line.strip()[0] == "*":
continue
parse_out = self.parse_line(parse_out, line, line_number)
# Check for duplicate names
group = (
self._item_level(parse_out, self.group_levels[-1][1])
if self.group_levels
else parse_out
)
if (
group
and not unfinished_item
and group[-1:][0]["Name"] != "FILLER"
and group[-1:][0]["Name"]
in [item["Name"] for item in group[:-1]]
):
raise InvalidCOBOLError(
line_number, "Duplicate names in a group."
)
# Check for proper line ending
if (
unfinished_item
and unfinished_item["Name"]
!= self._lowest_dict(parse_out)["Name"]
):
raise InvalidCOBOLError(
line_number - 1, "Unended line was not continued."
)
# Make sure that the level is lesser than or equal to the last
# field level if previous level is a field. Then set a new
# value for previous_line_level.
if (
not unfinished_item
and previous_field_level
and int(line.strip()[:2]) > previous_field_level
):
raise InvalidCOBOLError(
line_number, "Field has sub entries at line {}"
)
# If a group was created the item after the group needs to be a
# member of said group, otherwise, raise error
if (
not unfinished_item
and previous_group_level
and int(line.strip()[:2]) <= previous_group_level
):
raise InvalidCOBOLError(
line_number - 1, "Group has no sub elements at line {}."
)
# Save information about last read line for easier error
# checking.
if (
not unfinished_item
and self._lowest_dict(parse_out)["Type"] == "Field"
):
previous_field_level = int(line.strip()[:2])
previous_group_level = 0
elif not unfinished_item:
previous_group_level = int(line.strip()[:2])
previous_field_level = 0
# Check for EOL character in string
line_ended_check = r"(w|d|))+.(s|$)"
if not re.search(line_ended_check, line):
unfinished_item = self._lowest_dict(parse_out)
else:
unfinished_item = {}
return parse_out
def parse_line(
    self, out_builder: list, line: str, line_number: int
) -> list:
    """
    Parse one COBOL line into the output structure.

    A line that starts with a two-digit level number creates a new group or
    field, which is appended to the list of the group it belongs to. A line
    that starts with ``PIC`` or with a supported usage is treated as a
    continuation of the most recently added entry and only updates it.

    Args:
        out_builder: The nested list of item dictionaries built so far. It
            is modified in place.
        line: The raw text of the line being parsed.
        line_number: Line number used when reporting errors.

    Returns:
        ``out_builder`` itself, after the update.

    Raises:
        InvalidCOBOLError: If the line is neither a valid new entry nor a
            valid continuation, if a name or clause value is a COBOL
            keyword, if a clause has no value, or if OCCURS is not given an
            integer.
    """
    last_element = self._lowest_dict(out_builder) if out_builder else {}
    items = line.split()
    # The first token ending in a period closes the statement. Everything
    # after it is discarded and the period itself is removed. A period that
    # stands alone (separated by whitespace) leaves no token behind.
    line_ended = False
    for index, item in enumerate(items):
        if item.endswith("."):
            line_ended = True
            stripped = item[:-1]
            items = items[:index] + ([stripped] if stripped else [])
            break
    try:
        # Check first item (should be level if not a line continuation)
        # Level should be two digits, representing a number between 01 and
        # 49. COBOL standards also allow 66 and 88 as levels with specific
        # rules. These are not supported yet but if they are in the future
        # replace below regex with the following: (?!00)(66|88|[0-4][0-9])
        # fullmatch is used so that tokens such as "051" are rejected.
        if re.fullmatch(r"(?!00)[0-4][0-9]", items[0]):
            current_level = items[0]
        # Check whether line is a continuation of previous line
        # As of right now, only PIC and Usage are allowed to continue
        # onto another line (both only existing on fields)
        elif (
            items[0] == "PIC"
            and last_element
            and "Format" not in last_element
        ):
            last_element["Format"] = self._clause_value(items, "PIC")
            last_element["Type"] = "Field"
            # Check if continued line also has usage clause
            # PIC is always two items long, check after
            if len(items) == 3 and self._valid_usage(items[2]):
                last_element["Usage"] = items[2]
            return out_builder
        elif (
            last_element
            and last_element["Type"] == "Field"
            and self._valid_usage(items[0])
            and "Usage" not in last_element
        ):
            last_element["Usage"] = items[0]
            return out_builder
        else:
            raise InvalidCOBOLError(
                line_number, "Input does not resemble COBOL at line {}"
            )
        # At this point we know this is a new field or group. Get the group
        # name and save it in a dictionary representing the new item. Also
        # check for invalid names
        new_item = {"Name": items[1]}
        # Get the list of fields for the group the current item belongs to.
        # This must happen before a new group is pushed on group_levels.
        current_group = self._item_level(out_builder, int(current_level))
        if new_item["Name"] in COBOL_KEYWORDS:
            raise InvalidCOBOLError(
                line_number,
                "Field or group name at line {} matches a COBOL keyword",
            )
        clause_error = InvalidCOBOLError(
            line_number,
            "A clause was declared but no definition was given.",
        )
        try:
            if "REDEFINES" in items:
                new_item["Redefines"] = self._clause_value(
                    items, "REDEFINES"
                )
                if new_item["Redefines"] in COBOL_KEYWORDS:
                    raise clause_error
            if "OCCURS" in items:
                if items[items.index("OCCURS") + 2] != "TIMES":
                    raise clause_error
                try:
                    new_item["Occurs"] = int(
                        self._clause_value(items, "OCCURS")
                    )
                except ValueError as err:
                    raise InvalidCOBOLError(
                        line_number,
                        "Occurs clause must specify an integer value at line {}.",
                    ) from err
            if "PIC" not in items and line_ended:
                # A finished statement without PIC is a group. Remember its
                # level so later items can be placed relative to it.
                self.group_levels.append(
                    (new_item["Name"], int(current_level))
                )
                new_item["Type"] = "Group"
                new_item["Fields"] = []
                current_group.append(new_item)
                return out_builder
            # Item is field
            new_item["Type"] = "Field"
            if "PIC" in items:
                new_item["Format"] = self._clause_value(items, "PIC")
                if new_item["Format"] in COBOL_KEYWORDS:
                    raise clause_error
                # Check for usage clause.
                usage_index = items.index("PIC") + 2
                if len(items) > usage_index:
                    if self._valid_usage(items[usage_index]):
                        new_item["Usage"] = items[usage_index]
                    else:
                        raise InvalidCOBOLError(
                            line_number,
                            (
                                "Usage clause does not match an existing "
                                "definition at line {}"
                            ),
                        )
        except IndexError:
            # A clause keyword was the last token, so its value is missing.
            raise clause_error
        current_group.append(new_item)
        return out_builder
    except IndexError:
        # Raised when the line has too few tokens to be a valid entry.
        raise InvalidCOBOLError(
            line_number, "Input does not resemble COBOL at line {}"
        )
def _item_level(self, struct: list, current_level: int) -> list:
"""
Returns a list corresponding what group an item should belong to.
"""
if not struct or not self.group_levels:
return struct
# We only care about the last level of a matching group. Check groups
# in reverse.
if current_level > self.group_levels[-1][1]:
return self._lowest_list(struct)
for group in self.group_levels[::-1]:
# Return the fields of the first group that has a lower level than
# the current item's level.
if group[1] < current_level:
return self._lowest_list(struct, group[0])
return struct
def _lowest_dict(self, struct: list) -> dict:
"""
Returns the deepest dictionary at the bottom of provided structure.
"""
last_element = struct[-1:][0]
if "Fields" in last_element.keys() and last_element["Fields"]:
return self._lowest_dict(last_element["Fields"])
return last_element
def _lowest_list(self, struct: list, name: str = None) -> list:
"""
Returns the deepest list at the bottom of provided stucture. If a name
parameter is provided, stop searching and return list with matching
name.
"""
if not struct:
return struct
last_element = struct[-1]
if (
name
and last_element["Name"] == name
and "Fields" in last_element.keys()
):
return last_element["Fields"]
if "Fields" in last_element.keys():
return self._lowest_list(last_element["Fields"])
return struct
@staticmethod
def _clause_value(items: list, clause: str) -> str:
"""
Returns the item from a list of items following the provided clause.
"""
value = items[items.index(clause) + 1]
return value
@staticmethod
def _valid_usage(usage: str) -> bool:
"""
Returns bool indicating whether provided usage is valid or not.
"""
# Not really an indication of valid usages as much as a list of what
# usages the EBCDIC reader we use supports.
valid_usages = ["COMP-3"]
return usage in valid_usages
class InvalidCOBOLError(Exception):
    """
    Error raised for invalid COBOL, with the offending line number included
    in the message.

    ``msg`` should contain a ``{}`` placeholder (a pair of empty curly
    braces) where the line number is to be inserted. If it does not,
    " (Line {})." is appended to the message so the line number is still
    reported. When ``msg`` is omitted a generic message is used.

    The line number is also kept on the ``line`` attribute so callers can
    read it without parsing the message.
    """

    def __init__(self, line, msg=None):
        if msg is None:
            # Try to not let this happen
            msg = (
                "There was an unspecified error while parsing the COBOL at "
                "line {}. Please contact a developer for assistance."
            )
        elif "{}" not in msg:
            msg = msg + " (Line {})."
        self.line = line
        super().__init__(msg.format(line))
python parsing cobol
python parsing cobol
New contributor
Manderton is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
Check out our Code of Conduct.
New contributor
Manderton is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
Check out our Code of Conduct.
New contributor
Manderton is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
Check out our Code of Conduct.
asked 2 days ago
Manderton
161
161
New contributor
Manderton is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
Check out our Code of Conduct.
New contributor
Manderton is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
Check out our Code of Conduct.
Manderton is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
Check out our Code of Conduct.
1
An alternative approach is to look at cb2xml (sourceforge.net/projects/cb2xml); it will convert the COBOL to XML. cb2xml is written in Java, but there is an example of reading the XML in Python. Alternatively, cb2xml is written using SableCC; SableCC can generate Python as well as Java.
– Bruce Martin
2 days ago
there is also stingray sourceforge.net/projects/stingrayreader
– Bruce Martin
2 days ago
1
There are a lot of quirks in COBOL; the existing parsers will deal with many of them. Take comp-3: it can also be written as computational-3, and it can be specified with or without usage and at either field or group level.
– Bruce Martin
2 days ago
add a comment |
1
An alternative approach is to look at cb2xml sourceforge.net/projects/cb2xml) it will convert the Cobol to Xml. cb2xml is written java but there is an example of reading the Xml in python. cb2xml. Alternative cb2xml is written using sablecc; sablecc can generate pyhtonas well as java.
– Bruce Martin
2 days ago
there is also stingray sourceforge.net/projects/stingrayreader
– Bruce Martin
2 days ago
1
There are a lot of quirks inCOBOL, the existing parsers will deal with many of them. Takecomp-3it can also be written ascomputational-3, it can be specified with/without usage and at either field or group level.
– Bruce Martin
2 days ago
1
1
An alternative approach is to look at cb2xml sourceforge.net/projects/cb2xml) it will convert the Cobol to Xml. cb2xml is written java but there is an example of reading the Xml in python. cb2xml. Alternative cb2xml is written using sablecc; sablecc can generate pyhtonas well as java.
– Bruce Martin
2 days ago
An alternative approach is to look at cb2xml sourceforge.net/projects/cb2xml) it will convert the Cobol to Xml. cb2xml is written java but there is an example of reading the Xml in python. cb2xml. Alternative cb2xml is written using sablecc; sablecc can generate pyhtonas well as java.
– Bruce Martin
2 days ago
there is also stingray sourceforge.net/projects/stingrayreader
– Bruce Martin
2 days ago
there is also stingray sourceforge.net/projects/stingrayreader
– Bruce Martin
2 days ago
1
1
There are a lot of quirks in
COBOL, the existing parsers will deal with many of them. Take comp-3 it can also be written as computational-3, it can be specified with/without usage and at either field or group level.– Bruce Martin
2 days ago
There are a lot of quirks in
COBOL, the existing parsers will deal with many of them. Take comp-3 it can also be written as computational-3, it can be specified with/without usage and at either field or group level.– Bruce Martin
2 days ago
add a comment |
1 Answer
1
active
oldest
votes
up vote
0
down vote
group[-1:][0]["Name"]
This expression is reused, so assign it to a variable name.
in [item["Name"] for item in group[:-1]]
For a membership test, a set is a better idea than a list.
int(line.strip()[:2])
This is reused a bunch of times, so make a variable.
line_ended_check = r"(w|d|))+.(s|$)"
You shouldn't initialize this regex where it is. It needs to be compiled once, outside of all of your parsing loops, using re.compile.
elif not "{}" in msg:
You should probably use elif "{}" not in msg: .
add a comment |
1 Answer
1
active
oldest
votes
1 Answer
1
active
oldest
votes
active
oldest
votes
active
oldest
votes
up vote
0
down vote
group[-1:][0]["Name"]
This expression is reused, so assign it to a variable name.
in [item["Name"] for item in group[:-1]]
For a membership test, a set is a better idea than a list.
int(line.strip()[:2])
This is reused a bunch of times, so make a variable.
line_ended_check = r"(w|d|))+.(s|$)"
You shouldn't initialize this regex where it is. It needs to be compiled once, outside of all of your parsing loops, using re.compile.
elif not "{}" in msg:
You should probably use elif "{}" not in msg: .
add a comment |
up vote
0
down vote
group[-1:][0]["Name"]
This expression is reused, so assign it to a variable name.
in [item["Name"] for item in group[:-1]]
For a membership test, a set is a better idea than a list.
int(line.strip()[:2])
This is reused a bunch of times, so make a variable.
line_ended_check = r"(w|d|))+.(s|$)"
You shouldn't initialize this regex where it is. It needs to be compiled once, outside of all of your parsing loops, using re.compile.
elif not "{}" in msg:
You should probably use elif "{}" not in msg: .
add a comment |
up vote
0
down vote
up vote
0
down vote
group[-1:][0]["Name"]
This expression is reused, so assign it to a variable name.
in [item["Name"] for item in group[:-1]]
For a membership test, a set is a better idea than a list.
int(line.strip()[:2])
This is reused a bunch of times, so make a variable.
line_ended_check = r"(w|d|))+.(s|$)"
You shouldn't initialize this regex where it is. It needs to be compiled once, outside of all of your parsing loops, using re.compile.
elif not "{}" in msg:
You should probably use elif "{}" not in msg: .
group[-1:][0]["Name"]
This expression is reused, so assign it to a variable name.
in [item["Name"] for item in group[:-1]]
For a membership test, a set is a better idea than a list.
int(line.strip()[:2])
This is reused a bunch of times, so make a variable.
line_ended_check = r"(w|d|))+.(s|$)"
You shouldn't initialize this regex where it is. It needs to be compiled once, outside of all of your parsing loops, using re.compile.
elif not "{}" in msg:
You should probably use elif "{}" not in msg: .
answered yesterday
Reinderien
1,482616
1,482616
add a comment |
add a comment |
Manderton is a new contributor. Be nice, and check out our Code of Conduct.
Manderton is a new contributor. Be nice, and check out our Code of Conduct.
Manderton is a new contributor. Be nice, and check out our Code of Conduct.
Manderton is a new contributor. Be nice, and check out our Code of Conduct.
Thanks for contributing an answer to Code Review Stack Exchange!
- Please be sure to answer the question. Provide details and share your research!
But avoid …
- Asking for help, clarification, or responding to other answers.
- Making statements based on opinion; back them up with references or personal experience.
Use MathJax to format equations. MathJax reference.
To learn more, see our tips on writing great answers.
Some of your past answers have not been well-received, and you're in danger of being blocked from answering.
Please pay close attention to the following guidance:
- Please be sure to answer the question. Provide details and share your research!
But avoid …
- Asking for help, clarification, or responding to other answers.
- Making statements based on opinion; back them up with references or personal experience.
To learn more, see our tips on writing great answers.
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
StackExchange.ready(
function () {
StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fcodereview.stackexchange.com%2fquestions%2f208789%2fcobol-layout-parser-in-python%23new-answer', 'question_page');
}
);
Post as a guest
Required, but never shown
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
Sign up or log in
StackExchange.ready(function () {
StackExchange.helpers.onClickDraftSave('#login-link');
});
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Sign up using Google
Sign up using Facebook
Sign up using Email and Password
Post as a guest
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
Required, but never shown
1
An alternative approach is to look at cb2xml sourceforge.net/projects/cb2xml) it will convert the Cobol to Xml. cb2xml is written java but there is an example of reading the Xml in python. cb2xml. Alternative cb2xml is written using sablecc; sablecc can generate pyhtonas well as java.
– Bruce Martin
2 days ago
there is also stingray sourceforge.net/projects/stingrayreader
– Bruce Martin
2 days ago
1
There are a lot of quirks in
COBOL, the existing parsers will deal with many of them. Takecomp-3it can also be written ascomputational-3, it can be specified with/without usage and at either field or group level.– Bruce Martin
2 days ago