COBOL layout parser in Python











up vote
3
down vote

favorite












I don't know anything about parsers but I had to write something to read COBOL for a work project. What are some things that I could improve with my Python coding and parser design?



Note: This isn't COBOL feature complete yet, just the things I need for the project.



"""
Parse COBOL copybook files to a Python list for EBCDIC reading
"""

#! /usr/bin/env python

import re
from os import path

# Words the parser refuses as a field or group name, as a REDEFINES target
# and as a PIC value. Only the handful this parser cares about, not the full
# COBOL reserved-word list.
COBOL_KEYWORDS = {"COMP-3", "PIC", "REDEFINES", "OCCURS", "TIMES", "SIGN", "IS"}


class COBOLParser:
    """
    Parse a COBOL copybook into nested Python lists and dictionaries.

    Takes a file path as an argument. Run the parse method on the returned
    object to convert the copybook to a Python readable format. Every entry
    is a dictionary with a "Name" and a "Type" ("Group" or "Field"). Groups
    hold their children in a "Fields" list; fields hold their picture string
    in "Format" and may carry "Usage". "Redefines" and "Occurs" are added
    when the matching clauses are present.
    """

    # A statement is finished when a word character or ")" is directly
    # followed by a period that ends the line or precedes whitespace.
    _STATEMENT_END = re.compile(r"[\w)]\.(\s|$)")

    # Level should be two digits, representing a number between 01 and 49.
    # COBOL also allows 66 and 88 as levels with specific rules. These are
    # not supported yet; if they are in the future replace the pattern with
    # (?!00)(66|88|[0-4][0-9]).
    _LEVEL = re.compile(r"(?!00)[0-4][0-9]")

    def __init__(self, file: str):
        """
        Remember the copybook path.

        Raises FileNotFoundError when file is not an existing regular file.
        """
        if not path.isfile(file):
            raise FileNotFoundError("No such copybook file: {}".format(file))
        self.file = file

        # Keeps track of which level each declared group is on, as a list
        # of (name, level) tuples in declaration order. This is not needed
        # in the final data, but is necessary for determining what level to
        # place a new item on after leaving a group, so it is stored
        # separately.
        self.group_levels = []

    def parse(self) -> list:
        """
        Parse and return self.file as a list of python dictionaries.

        Raises InvalidCOBOLError when a statement is malformed, when an
        unended statement is not continued, when a name is repeated inside
        one group, when a field is followed by a deeper level, or when a
        group is followed by an item that is not inside it.
        """
        # Start from a clean slate so parse() can be called more than once.
        self.group_levels = []
        parse_out = []
        previous_field_level = 0
        previous_group_level = 0
        previous_line_number = 0
        unfinished_item = None

        with open(self.file, "rt") as cobol_file:
            for line_number, line in enumerate(cobol_file, start=1):
                stripped = line.strip()

                # Skip comment lines and empty lines
                if not stripped or stripped[0] == "*":
                    continue

                parse_out = self.parse_line(parse_out, line, line_number)
                latest = self._lowest_dict(parse_out)

                if unfinished_item is not None:
                    # A continuation line updates the unfinished item in
                    # place, so anything else means a new entry was started
                    # before the previous one was ended.
                    if unfinished_item is not latest:
                        raise InvalidCOBOLError(
                            previous_line_number,
                            "Unended line was not continued.",
                        )
                else:
                    # Names must be unique among the entries of the list the
                    # new item was added to; FILLER may repeat freely.
                    siblings = self._lowest_siblings(parse_out)
                    if latest["Name"] != "FILLER" and any(
                        item["Name"] == latest["Name"]
                        for item in siblings[:-1]
                    ):
                        raise InvalidCOBOLError(
                            line_number, "Duplicate names in a group."
                        )

                    level = int(stripped[:2])

                    # A field cannot own entries, so the level after a field
                    # must be lesser than or equal to the field's level.
                    if previous_field_level and level > previous_field_level:
                        raise InvalidCOBOLError(
                            line_number, "Field has sub entries at line {}"
                        )

                    # The item after a newly created group needs to be a
                    # member of said group.
                    if previous_group_level and level <= previous_group_level:
                        raise InvalidCOBOLError(
                            previous_line_number,
                            "Group has no sub elements at line {}.",
                        )

                    # Save information about this entry for the checks on
                    # the next one.
                    if latest["Type"] == "Field":
                        previous_field_level = level
                        previous_group_level = 0
                    else:
                        previous_group_level = level
                        previous_field_level = 0

                # Without a closing period the entry continues on the next
                # non-comment line.
                if self._STATEMENT_END.search(line):
                    unfinished_item = None
                else:
                    unfinished_item = latest
                previous_line_number = line_number

        return parse_out

    def parse_line(
        self, out_builder: list, line: str, line_number: int
    ) -> list:
        """
        Parses a COBOL line and creates a new item in the output builder. If
        the line is a continuation of a previous line, just add the new info
        to the previously added entry.

        Returns out_builder, which is modified in place. Raises
        InvalidCOBOLError when the line cannot be understood.
        """
        last_element = self._lowest_dict(out_builder) if out_builder else {}

        # Keep the tokens up to and including the first one closed by a
        # period, drop that period, and ignore whatever follows it.
        items = []
        line_ended = False
        for token in line.split():
            if token.endswith("."):
                items.append(token[:-1])
                line_ended = True
                break
            items.append(token)

        try:
            # First item should be the level if not a line continuation
            if self._LEVEL.fullmatch(items[0]):
                current_level = int(items[0])

            # As of right now, only PIC and usage are allowed to continue
            # onto another line, both only existing on fields.
            elif (
                items[0] == "PIC"
                and last_element
                and last_element["Type"] == "Field"
                and "Format" not in last_element
            ):
                last_element["Format"] = self._clause_value(items, "PIC")

                # PIC is always two items long, so a usage clause on the
                # same line is the third item.
                if len(items) == 3 and self._valid_usage(items[2]):
                    last_element["Usage"] = items[2]
                return out_builder
            elif (
                last_element
                and last_element["Type"] == "Field"
                and self._valid_usage(items[0])
                and "Usage" not in last_element
            ):
                last_element["Usage"] = items[0]
                return out_builder
            else:
                raise InvalidCOBOLError(
                    line_number, "Input does not resemble COBOL at line {}"
                )

            # At this point we know this is a new field or group.
            new_item = {"Name": items[1]}
            if new_item["Name"] in COBOL_KEYWORDS:
                raise InvalidCOBOLError(
                    line_number,
                    "Field or group name at line {} matches a COBOL keyword",
                )

            # Find the list the item belongs to before group_levels changes,
            # because the lookup depends on the groups declared so far.
            current_group = self._item_level(out_builder, current_level)

            clause_message = (
                "A clause was declared but no definition was given."
            )
            try:
                if "REDEFINES" in items:
                    new_item["Redefines"] = self._clause_value(
                        items, "REDEFINES"
                    )
                    if new_item["Redefines"] in COBOL_KEYWORDS:
                        raise InvalidCOBOLError(line_number, clause_message)

                if "OCCURS" in items:
                    if items[items.index("OCCURS") + 2] != "TIMES":
                        raise InvalidCOBOLError(line_number, clause_message)
                    try:
                        new_item["Occurs"] = int(
                            self._clause_value(items, "OCCURS")
                        )
                    except ValueError:
                        raise InvalidCOBOLError(
                            line_number,
                            "Occurs clause must specify an integer value at line {}.",
                        )

                if "PIC" not in items and line_ended:
                    # An ended entry without a picture is a group
                    self.group_levels.append(
                        (new_item["Name"], current_level)
                    )
                    new_item["Type"] = "Group"
                    new_item["Fields"] = []
                else:
                    # Item is a field; its PIC may still follow on the
                    # next line.
                    new_item["Type"] = "Field"
                    if "PIC" in items:
                        new_item["Format"] = self._clause_value(items, "PIC")
                        if new_item["Format"] in COBOL_KEYWORDS:
                            raise InvalidCOBOLError(
                                line_number, clause_message
                            )

                        # Anything after the picture must be a usage clause
                        usage_index = items.index("PIC") + 2
                        if len(items) > usage_index:
                            if not self._valid_usage(items[usage_index]):
                                raise InvalidCOBOLError(
                                    line_number,
                                    (
                                        "Usage clause does not match an existing "
                                        "definition at line {}"
                                    ),
                                )
                            new_item["Usage"] = items[usage_index]

            except IndexError:
                # A clause keyword was the last item on the line
                raise InvalidCOBOLError(line_number, clause_message)

            current_group.append(new_item)
            return out_builder

        except IndexError:
            # Too few items on the line to hold what was expected
            raise InvalidCOBOLError(
                line_number, "Input does not resemble COBOL at line {}"
            )

    def _item_level(self, struct: list, current_level: int) -> list:
        """
        Returns the list that an item of current_level should be appended to.
        """
        if not struct or not self.group_levels:
            return struct

        # Deeper than the most recent group: the item goes into the deepest
        # list of the structure.
        if current_level > self.group_levels[-1][1]:
            return self._lowest_list(struct)

        # Otherwise walk the groups from newest to oldest and use the first
        # one that has a lower level than the item.
        for name, level in reversed(self.group_levels):
            if level < current_level:
                return self._lowest_list(struct, name)
        return struct

    def _lowest_siblings(self, struct: list) -> list:
        """
        Returns the list holding the deepest dictionary of the structure,
        found by following the last element of every non-empty "Fields" list.
        """
        while struct and struct[-1].get("Fields"):
            struct = struct[-1]["Fields"]
        return struct

    def _lowest_dict(self, struct: list) -> dict:
        """
        Returns the deepest dictionary at the bottom of provided structure.
        Raises IndexError when struct is empty.
        """
        return self._lowest_siblings(struct)[-1]

    def _lowest_list(self, struct: list, name: str = None) -> list:
        """
        Returns the deepest list at the bottom of provided structure. If a
        name parameter is provided, stop searching at the group with that
        name and return its "Fields" list.
        """
        if not struct:
            return struct
        last_element = struct[-1]
        if "Fields" not in last_element:
            return struct
        if name and last_element["Name"] == name:
            return last_element["Fields"]
        # Keep looking for the same name further down
        return self._lowest_list(last_element["Fields"], name)

    @staticmethod
    def _clause_value(items: list, clause: str) -> str:
        """
        Returns the item from a list of items following the provided clause.
        Raises IndexError when the clause is the last item.
        """
        return items[items.index(clause) + 1]

    @staticmethod
    def _valid_usage(usage: str) -> bool:
        """
        Returns bool indicating whether provided usage is valid or not.
        """
        # Not really an indication of valid usages as much as a list of what
        # usages the EBCDIC reader we use supports.
        return usage in ("COMP-3",)

class InvalidCOBOLError(Exception):
    """
    Produces an error message with a line number showing which line of code
    contains the invalid COBOL.

    The msg parameter should contain a set of empty curly braces ("{}")
    marking where the line number goes; if it does not, " (Line {})." is
    appended to the end of the message. The line number is also kept on the
    ``line`` attribute.
    """

    def __init__(self, line, msg=None):
        if msg is None:
            # Try to not let this happen
            msg = (
                "There was an unspecified error while parsing the COBOL at "
                "line {}. Please contact a developer for assistance."
            )
        elif "{}" not in msg:
            msg = msg + " (Line {})."
        self.line = line
        super().__init__(msg.format(line))









share|improve this question







New contributor




Manderton is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
Check out our Code of Conduct.
















  • 1




    An alternative approach is to look at cb2xml (sourceforge.net/projects/cb2xml); it will convert the COBOL to XML. cb2xml is written in Java, but there is an example of reading the XML in Python. Alternatively, cb2xml is written using SableCC, and SableCC can generate Python as well as Java.
    – Bruce Martin
    2 days ago










  • there is also stingray sourceforge.net/projects/stingrayreader
    – Bruce Martin
    2 days ago






  • 1




    There are a lot of quirks in COBOL, and the existing parsers will deal with many of them. Take COMP-3: it can also be written as COMPUTATIONAL-3, and it can be specified with or without the USAGE keyword, at either field or group level.
    – Bruce Martin
    2 days ago















up vote
3
down vote

favorite












I don't know anything about parsers but I had to write something to read COBOL for a work project. What are some things that I could improve with my Python coding and parser design?



Note: This isn't COBOL feature complete yet, just the things I need for the project.



"""
Parse COBOL copybook files to a Python list for EBCDIC reading
"""

#! /usr/bin/env python

import re
from os import path

# Words the parser refuses as a field or group name, as a REDEFINES target
# and as a PIC value. Only the handful this parser cares about, not the full
# COBOL reserved-word list.
COBOL_KEYWORDS = {"COMP-3", "PIC", "REDEFINES", "OCCURS", "TIMES", "SIGN", "IS"}


class COBOLParser:
    """
    Parse a COBOL copybook into nested Python lists and dictionaries.

    Takes a file path as an argument. Run the parse method on the returned
    object to convert the copybook to a Python readable format. Every entry
    is a dictionary with a "Name" and a "Type" ("Group" or "Field"). Groups
    hold their children in a "Fields" list; fields hold their picture string
    in "Format" and may carry "Usage". "Redefines" and "Occurs" are added
    when the matching clauses are present.
    """

    # A statement is finished when a word character or ")" is directly
    # followed by a period that ends the line or precedes whitespace.
    _STATEMENT_END = re.compile(r"[\w)]\.(\s|$)")

    # Level should be two digits, representing a number between 01 and 49.
    # COBOL also allows 66 and 88 as levels with specific rules. These are
    # not supported yet; if they are in the future replace the pattern with
    # (?!00)(66|88|[0-4][0-9]).
    _LEVEL = re.compile(r"(?!00)[0-4][0-9]")

    def __init__(self, file: str):
        """
        Remember the copybook path.

        Raises FileNotFoundError when file is not an existing regular file.
        """
        if not path.isfile(file):
            raise FileNotFoundError("No such copybook file: {}".format(file))
        self.file = file

        # Keeps track of which level each declared group is on, as a list
        # of (name, level) tuples in declaration order. This is not needed
        # in the final data, but is necessary for determining what level to
        # place a new item on after leaving a group, so it is stored
        # separately.
        self.group_levels = []

    def parse(self) -> list:
        """
        Parse and return self.file as a list of python dictionaries.

        Raises InvalidCOBOLError when a statement is malformed, when an
        unended statement is not continued, when a name is repeated inside
        one group, when a field is followed by a deeper level, or when a
        group is followed by an item that is not inside it.
        """
        # Start from a clean slate so parse() can be called more than once.
        self.group_levels = []
        parse_out = []
        previous_field_level = 0
        previous_group_level = 0
        previous_line_number = 0
        unfinished_item = None

        with open(self.file, "rt") as cobol_file:
            for line_number, line in enumerate(cobol_file, start=1):
                stripped = line.strip()

                # Skip comment lines and empty lines
                if not stripped or stripped[0] == "*":
                    continue

                parse_out = self.parse_line(parse_out, line, line_number)
                latest = self._lowest_dict(parse_out)

                if unfinished_item is not None:
                    # A continuation line updates the unfinished item in
                    # place, so anything else means a new entry was started
                    # before the previous one was ended.
                    if unfinished_item is not latest:
                        raise InvalidCOBOLError(
                            previous_line_number,
                            "Unended line was not continued.",
                        )
                else:
                    # Names must be unique among the entries of the list the
                    # new item was added to; FILLER may repeat freely.
                    siblings = self._lowest_siblings(parse_out)
                    if latest["Name"] != "FILLER" and any(
                        item["Name"] == latest["Name"]
                        for item in siblings[:-1]
                    ):
                        raise InvalidCOBOLError(
                            line_number, "Duplicate names in a group."
                        )

                    level = int(stripped[:2])

                    # A field cannot own entries, so the level after a field
                    # must be lesser than or equal to the field's level.
                    if previous_field_level and level > previous_field_level:
                        raise InvalidCOBOLError(
                            line_number, "Field has sub entries at line {}"
                        )

                    # The item after a newly created group needs to be a
                    # member of said group.
                    if previous_group_level and level <= previous_group_level:
                        raise InvalidCOBOLError(
                            previous_line_number,
                            "Group has no sub elements at line {}.",
                        )

                    # Save information about this entry for the checks on
                    # the next one.
                    if latest["Type"] == "Field":
                        previous_field_level = level
                        previous_group_level = 0
                    else:
                        previous_group_level = level
                        previous_field_level = 0

                # Without a closing period the entry continues on the next
                # non-comment line.
                if self._STATEMENT_END.search(line):
                    unfinished_item = None
                else:
                    unfinished_item = latest
                previous_line_number = line_number

        return parse_out

    def parse_line(
        self, out_builder: list, line: str, line_number: int
    ) -> list:
        """
        Parses a COBOL line and creates a new item in the output builder. If
        the line is a continuation of a previous line, just add the new info
        to the previously added entry.

        Returns out_builder, which is modified in place. Raises
        InvalidCOBOLError when the line cannot be understood.
        """
        last_element = self._lowest_dict(out_builder) if out_builder else {}

        # Keep the tokens up to and including the first one closed by a
        # period, drop that period, and ignore whatever follows it.
        items = []
        line_ended = False
        for token in line.split():
            if token.endswith("."):
                items.append(token[:-1])
                line_ended = True
                break
            items.append(token)

        try:
            # First item should be the level if not a line continuation
            if self._LEVEL.fullmatch(items[0]):
                current_level = int(items[0])

            # As of right now, only PIC and usage are allowed to continue
            # onto another line, both only existing on fields.
            elif (
                items[0] == "PIC"
                and last_element
                and last_element["Type"] == "Field"
                and "Format" not in last_element
            ):
                last_element["Format"] = self._clause_value(items, "PIC")

                # PIC is always two items long, so a usage clause on the
                # same line is the third item.
                if len(items) == 3 and self._valid_usage(items[2]):
                    last_element["Usage"] = items[2]
                return out_builder
            elif (
                last_element
                and last_element["Type"] == "Field"
                and self._valid_usage(items[0])
                and "Usage" not in last_element
            ):
                last_element["Usage"] = items[0]
                return out_builder
            else:
                raise InvalidCOBOLError(
                    line_number, "Input does not resemble COBOL at line {}"
                )

            # At this point we know this is a new field or group.
            new_item = {"Name": items[1]}
            if new_item["Name"] in COBOL_KEYWORDS:
                raise InvalidCOBOLError(
                    line_number,
                    "Field or group name at line {} matches a COBOL keyword",
                )

            # Find the list the item belongs to before group_levels changes,
            # because the lookup depends on the groups declared so far.
            current_group = self._item_level(out_builder, current_level)

            clause_message = (
                "A clause was declared but no definition was given."
            )
            try:
                if "REDEFINES" in items:
                    new_item["Redefines"] = self._clause_value(
                        items, "REDEFINES"
                    )
                    if new_item["Redefines"] in COBOL_KEYWORDS:
                        raise InvalidCOBOLError(line_number, clause_message)

                if "OCCURS" in items:
                    if items[items.index("OCCURS") + 2] != "TIMES":
                        raise InvalidCOBOLError(line_number, clause_message)
                    try:
                        new_item["Occurs"] = int(
                            self._clause_value(items, "OCCURS")
                        )
                    except ValueError:
                        raise InvalidCOBOLError(
                            line_number,
                            "Occurs clause must specify an integer value at line {}.",
                        )

                if "PIC" not in items and line_ended:
                    # An ended entry without a picture is a group
                    self.group_levels.append(
                        (new_item["Name"], current_level)
                    )
                    new_item["Type"] = "Group"
                    new_item["Fields"] = []
                else:
                    # Item is a field; its PIC may still follow on the
                    # next line.
                    new_item["Type"] = "Field"
                    if "PIC" in items:
                        new_item["Format"] = self._clause_value(items, "PIC")
                        if new_item["Format"] in COBOL_KEYWORDS:
                            raise InvalidCOBOLError(
                                line_number, clause_message
                            )

                        # Anything after the picture must be a usage clause
                        usage_index = items.index("PIC") + 2
                        if len(items) > usage_index:
                            if not self._valid_usage(items[usage_index]):
                                raise InvalidCOBOLError(
                                    line_number,
                                    (
                                        "Usage clause does not match an existing "
                                        "definition at line {}"
                                    ),
                                )
                            new_item["Usage"] = items[usage_index]

            except IndexError:
                # A clause keyword was the last item on the line
                raise InvalidCOBOLError(line_number, clause_message)

            current_group.append(new_item)
            return out_builder

        except IndexError:
            # Too few items on the line to hold what was expected
            raise InvalidCOBOLError(
                line_number, "Input does not resemble COBOL at line {}"
            )

    def _item_level(self, struct: list, current_level: int) -> list:
        """
        Returns the list that an item of current_level should be appended to.
        """
        if not struct or not self.group_levels:
            return struct

        # Deeper than the most recent group: the item goes into the deepest
        # list of the structure.
        if current_level > self.group_levels[-1][1]:
            return self._lowest_list(struct)

        # Otherwise walk the groups from newest to oldest and use the first
        # one that has a lower level than the item.
        for name, level in reversed(self.group_levels):
            if level < current_level:
                return self._lowest_list(struct, name)
        return struct

    def _lowest_siblings(self, struct: list) -> list:
        """
        Returns the list holding the deepest dictionary of the structure,
        found by following the last element of every non-empty "Fields" list.
        """
        while struct and struct[-1].get("Fields"):
            struct = struct[-1]["Fields"]
        return struct

    def _lowest_dict(self, struct: list) -> dict:
        """
        Returns the deepest dictionary at the bottom of provided structure.
        Raises IndexError when struct is empty.
        """
        return self._lowest_siblings(struct)[-1]

    def _lowest_list(self, struct: list, name: str = None) -> list:
        """
        Returns the deepest list at the bottom of provided structure. If a
        name parameter is provided, stop searching at the group with that
        name and return its "Fields" list.
        """
        if not struct:
            return struct
        last_element = struct[-1]
        if "Fields" not in last_element:
            return struct
        if name and last_element["Name"] == name:
            return last_element["Fields"]
        # Keep looking for the same name further down
        return self._lowest_list(last_element["Fields"], name)

    @staticmethod
    def _clause_value(items: list, clause: str) -> str:
        """
        Returns the item from a list of items following the provided clause.
        Raises IndexError when the clause is the last item.
        """
        return items[items.index(clause) + 1]

    @staticmethod
    def _valid_usage(usage: str) -> bool:
        """
        Returns bool indicating whether provided usage is valid or not.
        """
        # Not really an indication of valid usages as much as a list of what
        # usages the EBCDIC reader we use supports.
        return usage in ("COMP-3",)

class InvalidCOBOLError(Exception):
    """
    Produces an error message with a line number showing which line of code
    contains the invalid COBOL.

    The msg parameter should contain a set of empty curly braces ("{}")
    marking where the line number goes; if it does not, " (Line {})." is
    appended to the end of the message. The line number is also kept on the
    ``line`` attribute.
    """

    def __init__(self, line, msg=None):
        if msg is None:
            # Try to not let this happen
            msg = (
                "There was an unspecified error while parsing the COBOL at "
                "line {}. Please contact a developer for assistance."
            )
        elif "{}" not in msg:
            msg = msg + " (Line {})."
        self.line = line
        super().__init__(msg.format(line))









share|improve this question







New contributor




Manderton is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
Check out our Code of Conduct.
















  • 1




    An alternative approach is to look at cb2xml (sourceforge.net/projects/cb2xml); it will convert the COBOL to XML. cb2xml is written in Java, but there is an example of reading the XML in Python. Alternatively, cb2xml is written using SableCC, and SableCC can generate Python as well as Java.
    – Bruce Martin
    2 days ago










  • there is also stingray sourceforge.net/projects/stingrayreader
    – Bruce Martin
    2 days ago






  • 1




    There are a lot of quirks in COBOL, and the existing parsers will deal with many of them. Take COMP-3: it can also be written as COMPUTATIONAL-3, and it can be specified with or without the USAGE keyword, at either field or group level.
    – Bruce Martin
    2 days ago













up vote
3
down vote

favorite









up vote
3
down vote

favorite











I don't know anything about parsers but I had to write something to read COBOL for a work project. What are some things that I could improve with my Python coding and parser design?



Note: This isn't COBOL feature complete yet, just the things I need for the project.



"""
Parse COBOL copybook files to a Python list for EBCDIC reading
"""

#! /usr/bin/env python

import re
from os import path

# Words the parser refuses as a field or group name, as a REDEFINES target
# and as a PIC value. Only the handful this parser cares about, not the full
# COBOL reserved-word list.
COBOL_KEYWORDS = {"COMP-3", "PIC", "REDEFINES", "OCCURS", "TIMES", "SIGN", "IS"}


class COBOLParser:
    """
    Parse a COBOL copybook into nested Python lists and dictionaries.

    Takes a file path as an argument. Run the parse method on the returned
    object to convert the copybook to a Python readable format. Every entry
    is a dictionary with a "Name" and a "Type" ("Group" or "Field"). Groups
    hold their children in a "Fields" list; fields hold their picture string
    in "Format" and may carry "Usage". "Redefines" and "Occurs" are added
    when the matching clauses are present.
    """

    # A statement is finished when a word character or ")" is directly
    # followed by a period that ends the line or precedes whitespace.
    _STATEMENT_END = re.compile(r"[\w)]\.(\s|$)")

    # Level should be two digits, representing a number between 01 and 49.
    # COBOL also allows 66 and 88 as levels with specific rules. These are
    # not supported yet; if they are in the future replace the pattern with
    # (?!00)(66|88|[0-4][0-9]).
    _LEVEL = re.compile(r"(?!00)[0-4][0-9]")

    def __init__(self, file: str):
        """
        Remember the copybook path.

        Raises FileNotFoundError when file is not an existing regular file.
        """
        if not path.isfile(file):
            raise FileNotFoundError("No such copybook file: {}".format(file))
        self.file = file

        # Keeps track of which level each declared group is on, as a list
        # of (name, level) tuples in declaration order. This is not needed
        # in the final data, but is necessary for determining what level to
        # place a new item on after leaving a group, so it is stored
        # separately.
        self.group_levels = []

    def parse(self) -> list:
        """
        Parse and return self.file as a list of python dictionaries.

        Raises InvalidCOBOLError when a statement is malformed, when an
        unended statement is not continued, when a name is repeated inside
        one group, when a field is followed by a deeper level, or when a
        group is followed by an item that is not inside it.
        """
        # Start from a clean slate so parse() can be called more than once.
        self.group_levels = []
        parse_out = []
        previous_field_level = 0
        previous_group_level = 0
        previous_line_number = 0
        unfinished_item = None

        with open(self.file, "rt") as cobol_file:
            for line_number, line in enumerate(cobol_file, start=1):
                stripped = line.strip()

                # Skip comment lines and empty lines
                if not stripped or stripped[0] == "*":
                    continue

                parse_out = self.parse_line(parse_out, line, line_number)
                latest = self._lowest_dict(parse_out)

                if unfinished_item is not None:
                    # A continuation line updates the unfinished item in
                    # place, so anything else means a new entry was started
                    # before the previous one was ended.
                    if unfinished_item is not latest:
                        raise InvalidCOBOLError(
                            previous_line_number,
                            "Unended line was not continued.",
                        )
                else:
                    # Names must be unique among the entries of the list the
                    # new item was added to; FILLER may repeat freely.
                    siblings = self._lowest_siblings(parse_out)
                    if latest["Name"] != "FILLER" and any(
                        item["Name"] == latest["Name"]
                        for item in siblings[:-1]
                    ):
                        raise InvalidCOBOLError(
                            line_number, "Duplicate names in a group."
                        )

                    level = int(stripped[:2])

                    # A field cannot own entries, so the level after a field
                    # must be lesser than or equal to the field's level.
                    if previous_field_level and level > previous_field_level:
                        raise InvalidCOBOLError(
                            line_number, "Field has sub entries at line {}"
                        )

                    # The item after a newly created group needs to be a
                    # member of said group.
                    if previous_group_level and level <= previous_group_level:
                        raise InvalidCOBOLError(
                            previous_line_number,
                            "Group has no sub elements at line {}.",
                        )

                    # Save information about this entry for the checks on
                    # the next one.
                    if latest["Type"] == "Field":
                        previous_field_level = level
                        previous_group_level = 0
                    else:
                        previous_group_level = level
                        previous_field_level = 0

                # Without a closing period the entry continues on the next
                # non-comment line.
                if self._STATEMENT_END.search(line):
                    unfinished_item = None
                else:
                    unfinished_item = latest
                previous_line_number = line_number

        return parse_out

    def parse_line(
        self, out_builder: list, line: str, line_number: int
    ) -> list:
        """
        Parses a COBOL line and creates a new item in the output builder. If
        the line is a continuation of a previous line, just add the new info
        to the previously added entry.

        Returns out_builder, which is modified in place. Raises
        InvalidCOBOLError when the line cannot be understood.
        """
        last_element = self._lowest_dict(out_builder) if out_builder else {}

        # Keep the tokens up to and including the first one closed by a
        # period, drop that period, and ignore whatever follows it.
        items = []
        line_ended = False
        for token in line.split():
            if token.endswith("."):
                items.append(token[:-1])
                line_ended = True
                break
            items.append(token)

        try:
            # First item should be the level if not a line continuation
            if self._LEVEL.fullmatch(items[0]):
                current_level = int(items[0])

            # As of right now, only PIC and usage are allowed to continue
            # onto another line, both only existing on fields.
            elif (
                items[0] == "PIC"
                and last_element
                and last_element["Type"] == "Field"
                and "Format" not in last_element
            ):
                last_element["Format"] = self._clause_value(items, "PIC")

                # PIC is always two items long, so a usage clause on the
                # same line is the third item.
                if len(items) == 3 and self._valid_usage(items[2]):
                    last_element["Usage"] = items[2]
                return out_builder
            elif (
                last_element
                and last_element["Type"] == "Field"
                and self._valid_usage(items[0])
                and "Usage" not in last_element
            ):
                last_element["Usage"] = items[0]
                return out_builder
            else:
                raise InvalidCOBOLError(
                    line_number, "Input does not resemble COBOL at line {}"
                )

            # At this point we know this is a new field or group.
            new_item = {"Name": items[1]}
            if new_item["Name"] in COBOL_KEYWORDS:
                raise InvalidCOBOLError(
                    line_number,
                    "Field or group name at line {} matches a COBOL keyword",
                )

            # Find the list the item belongs to before group_levels changes,
            # because the lookup depends on the groups declared so far.
            current_group = self._item_level(out_builder, current_level)

            clause_message = (
                "A clause was declared but no definition was given."
            )
            try:
                if "REDEFINES" in items:
                    new_item["Redefines"] = self._clause_value(
                        items, "REDEFINES"
                    )
                    if new_item["Redefines"] in COBOL_KEYWORDS:
                        raise InvalidCOBOLError(line_number, clause_message)

                if "OCCURS" in items:
                    if items[items.index("OCCURS") + 2] != "TIMES":
                        raise InvalidCOBOLError(line_number, clause_message)
                    try:
                        new_item["Occurs"] = int(
                            self._clause_value(items, "OCCURS")
                        )
                    except ValueError:
                        raise InvalidCOBOLError(
                            line_number,
                            "Occurs clause must specify an integer value at line {}.",
                        )

                if "PIC" not in items and line_ended:
                    # An ended entry without a picture is a group
                    self.group_levels.append(
                        (new_item["Name"], current_level)
                    )
                    new_item["Type"] = "Group"
                    new_item["Fields"] = []
                else:
                    # Item is a field; its PIC may still follow on the
                    # next line.
                    new_item["Type"] = "Field"
                    if "PIC" in items:
                        new_item["Format"] = self._clause_value(items, "PIC")
                        if new_item["Format"] in COBOL_KEYWORDS:
                            raise InvalidCOBOLError(
                                line_number, clause_message
                            )

                        # Anything after the picture must be a usage clause
                        usage_index = items.index("PIC") + 2
                        if len(items) > usage_index:
                            if not self._valid_usage(items[usage_index]):
                                raise InvalidCOBOLError(
                                    line_number,
                                    (
                                        "Usage clause does not match an existing "
                                        "definition at line {}"
                                    ),
                                )
                            new_item["Usage"] = items[usage_index]

            except IndexError:
                # A clause keyword was the last item on the line
                raise InvalidCOBOLError(line_number, clause_message)

            current_group.append(new_item)
            return out_builder

        except IndexError:
            # Too few items on the line to hold what was expected
            raise InvalidCOBOLError(
                line_number, "Input does not resemble COBOL at line {}"
            )

    def _item_level(self, struct: list, current_level: int) -> list:
        """
        Returns the list that an item of current_level should be appended to.
        """
        if not struct or not self.group_levels:
            return struct

        # Deeper than the most recent group: the item goes into the deepest
        # list of the structure.
        if current_level > self.group_levels[-1][1]:
            return self._lowest_list(struct)

        # Otherwise walk the groups from newest to oldest and use the first
        # one that has a lower level than the item.
        for name, level in reversed(self.group_levels):
            if level < current_level:
                return self._lowest_list(struct, name)
        return struct

    def _lowest_siblings(self, struct: list) -> list:
        """
        Returns the list holding the deepest dictionary of the structure,
        found by following the last element of every non-empty "Fields" list.
        """
        while struct and struct[-1].get("Fields"):
            struct = struct[-1]["Fields"]
        return struct

    def _lowest_dict(self, struct: list) -> dict:
        """
        Returns the deepest dictionary at the bottom of provided structure.
        Raises IndexError when struct is empty.
        """
        return self._lowest_siblings(struct)[-1]

    def _lowest_list(self, struct: list, name: str = None) -> list:
        """
        Returns the deepest list at the bottom of provided structure. If a
        name parameter is provided, stop searching at the group with that
        name and return its "Fields" list.
        """
        if not struct:
            return struct
        last_element = struct[-1]
        if "Fields" not in last_element:
            return struct
        if name and last_element["Name"] == name:
            return last_element["Fields"]
        # Keep looking for the same name further down
        return self._lowest_list(last_element["Fields"], name)

    @staticmethod
    def _clause_value(items: list, clause: str) -> str:
        """
        Returns the item from a list of items following the provided clause.
        Raises IndexError when the clause is the last item.
        """
        return items[items.index(clause) + 1]

    @staticmethod
    def _valid_usage(usage: str) -> bool:
        """
        Returns bool indicating whether provided usage is valid or not.
        """
        # Not really an indication of valid usages as much as a list of what
        # usages the EBCDIC reader we use supports.
        return usage in ("COMP-3",)

class InvalidCOBOLError(Exception):
    """
    Produces an error message with a line number showing which line of code
    contains the invalid COBOL.

    The msg parameter should contain a set of empty curly braces ("{}")
    marking where the line number goes; if it does not, " (Line {})." is
    appended to the end of the message. The line number is also kept on the
    ``line`` attribute.
    """

    def __init__(self, line, msg=None):
        if msg is None:
            # Try to not let this happen
            msg = (
                "There was an unspecified error while parsing the COBOL at "
                "line {}. Please contact a developer for assistance."
            )
        elif "{}" not in msg:
            msg = msg + " (Line {})."
        self.line = line
        super().__init__(msg.format(line))









share|improve this question







New contributor




Manderton is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
Check out our Code of Conduct.











I don't know anything about parsers but I had to write something to read COBOL for a work project. What are some things that I could improve with my Python coding and parser design?



Note: This isn't COBOL feature complete yet, just the things I need for the project.



"""
Parse COBOL copybook files to a Python list for EBCDIC reading
"""

#! /usr/bin/env python

import re
from os import path

# Words the parser refuses as a field or group name, as a REDEFINES target
# and as a PIC value. Only the handful this parser cares about, not the full
# COBOL reserved-word list.
COBOL_KEYWORDS = {"COMP-3", "PIC", "REDEFINES", "OCCURS", "TIMES", "SIGN", "IS"}


class COBOLParser:
    """
    Parse a COBOL copybook file into nested Python lists and dictionaries.

    Takes a file path as an argument. Run the parse method on the returned
    object to convert the copybook to a Python readable format.

    Every parsed item is a dictionary with a "Name" and a "Type" ("Group" or
    "Field"). Groups carry their children in "Fields"; fields may carry
    "Format" (the PIC value) and "Usage". "Redefines" and "Occurs" are added
    when the corresponding clause is present.
    """

    # A statement ends with a period that directly follows a word character
    # or a closing parenthesis and is followed by whitespace or end of line.
    # A decimal point inside a picture such as 9(3).99 therefore does not
    # count as a terminator.
    _LINE_END_RE = re.compile(r"[\w)]\.(?:\s|$)")

    # Level should be two digits, representing a number between 01 and 49.
    # COBOL standards also allow 66 and 88 as levels with specific rules.
    # These are not supported yet but if they are in the future replace the
    # pattern with the following: (?!00)(66|88|[0-4][0-9])
    _LEVEL_RE = re.compile(r"(?!00)[0-4][0-9]")

    def __init__(self, file: str):
        """
        Remember the copybook path.

        Raises FileNotFoundError if ``file`` is not an existing regular file.
        """
        if not path.isfile(file):
            raise FileNotFoundError("COBOL copybook not found: {}".format(file))
        self.file = file

        # Keeps track of which level each group was declared on, as
        # (name, level) tuples in declaration order. This is not needed in
        # the final data, but is necessary for determining where to place a
        # new item after leaving a group, so it is stored separately.
        self.group_levels = []

    def parse(self) -> list:
        """
        Parse and return self.file as a list of python dictionaries.

        Raises InvalidCOBOLError when the input breaks one of the structural
        rules checked below.
        """
        # Start from a clean slate so that calling parse() a second time does
        # not reuse the group bookkeeping of the previous run.
        self.group_levels = []
        parse_out = []
        previous_field_level = 0
        previous_group_level = 0
        unfinished_item = {}

        with open(self.file, "rt") as cobol_file:
            for line_number, line in enumerate(cobol_file, start=1):
                stripped = line.strip()

                # skip comment lines and empty lines
                if not stripped or stripped[0] == "*":
                    continue

                parse_out = self.parse_line(parse_out, line, line_number)
                lowest = self._lowest_dict(parse_out)

                if unfinished_item:
                    # The previous statement had no closing period, so this
                    # line must have continued it instead of adding an item.
                    if unfinished_item["Name"] != lowest["Name"]:
                        raise InvalidCOBOLError(
                            line_number - 1, "Unended line was not continued."
                        )
                else:
                    # Check for duplicate names
                    group = (
                        self._item_level(parse_out, self.group_levels[-1][1])
                        if self.group_levels
                        else parse_out
                    )
                    if group:
                        newest_name = group[-1]["Name"]
                        if newest_name != "FILLER" and any(
                            item["Name"] == newest_name for item in group[:-1]
                        ):
                            raise InvalidCOBOLError(
                                line_number, "Duplicate names in a group."
                            )

                    # parse_line accepts a bare usage continuation even after
                    # a terminated statement; such a line has no level.
                    try:
                        level = int(stripped[:2])
                    except ValueError:
                        raise InvalidCOBOLError(
                            line_number,
                            "Input does not resemble COBOL at line {}",
                        ) from None

                    # Make sure that the level is lesser than or equal to the
                    # last field level if the previous item is a field.
                    if previous_field_level and level > previous_field_level:
                        raise InvalidCOBOLError(
                            line_number, "Field has sub entries at line {}"
                        )

                    # If a group was created the item after the group needs
                    # to be a member of said group, otherwise, raise error
                    if previous_group_level and level <= previous_group_level:
                        raise InvalidCOBOLError(
                            line_number - 1,
                            "Group has no sub elements at line {}.",
                        )

                    # Save information about last read line for easier error
                    # checking.
                    if lowest["Type"] == "Field":
                        previous_field_level = level
                        previous_group_level = 0
                    else:
                        previous_group_level = level
                        previous_field_level = 0

                # Remember the item if its statement is still open.
                if self._LINE_END_RE.search(line):
                    unfinished_item = {}
                else:
                    unfinished_item = lowest

        return parse_out

    def parse_line(
        self, out_builder: list, line: str, line_number: int
    ) -> list:
        """
        Parses a COBOL line and creates a new item in the output builder. If
        the line is a continuation of a previous line, just add the new info to
        the previously added entry.

        Returns ``out_builder`` (mutated in place). Raises InvalidCOBOLError
        when the line is not a recognised entry or continuation.
        """
        if out_builder:
            last_element = self._lowest_dict(out_builder)
        else:
            last_element = {}

        items, line_ended = self._split_statement(line)

        try:
            # Check first item (should be level if not a line continuation).
            # fullmatch so that tokens such as "051" are rejected instead of
            # being accepted on their first two characters.
            if self._LEVEL_RE.fullmatch(items[0]):
                current_level = items[0]

            # Check whether line is a continuation of previous line
            # As of right now, only PIC and Usage are allowed to continue
            # onto another line (both only existing on fields)
            elif (
                items[0] == "PIC"
                and last_element
                and "Format" not in last_element
            ):
                last_element["Format"] = self._clause_value(items, "PIC")
                last_element["Type"] = "Field"

                # Check if continued line also has usage clause
                # PIC is always two items long, check after
                if len(items) == 3 and self._valid_usage(items[2]):
                    last_element["Usage"] = items[2]
                return out_builder
            elif (
                last_element
                and last_element["Type"] == "Field"
                and self._valid_usage(items[0])
                and "Usage" not in last_element
            ):
                last_element["Usage"] = items[0]
                return out_builder
            else:
                raise InvalidCOBOLError(
                    line_number, "Input does not resemble COBOL at line {}"
                )

            # At this point we know this is a new field or group. Get the
            # name and save it in a dictionary representing the new item.
            # Also check for invalid names
            new_item = {"Name": items[1]}

            # Get the list of fields for the group the current item belongs to
            current_group = self._item_level(out_builder, int(current_level))
            if new_item["Name"] in COBOL_KEYWORDS:
                raise InvalidCOBOLError(
                    line_number,
                    "Field or group name at line {} matches a COBOL keyword",
                )

            try:
                clause_error = InvalidCOBOLError(
                    line_number,
                    "A clause was declared but no definition was given.",
                )
                if "REDEFINES" in items:
                    new_item["Redefines"] = self._clause_value(
                        items, "REDEFINES"
                    )
                    if new_item["Redefines"] in COBOL_KEYWORDS:
                        raise clause_error

                if "OCCURS" in items:
                    if items[items.index("OCCURS") + 2] != "TIMES":
                        raise clause_error

                    try:
                        new_item["Occurs"] = int(
                            self._clause_value(items, "OCCURS")
                        )
                    except ValueError:
                        raise InvalidCOBOLError(
                            line_number,
                            "Occurs clause must specify an integer value at line {}.",
                        )

                # A terminated statement without a PIC clause is a group.
                if "PIC" not in items and line_ended:
                    # Append the newly added group to group_levels
                    self.group_levels.append(
                        (new_item["Name"], int(current_level))
                    )
                    new_item["Type"] = "Group"
                    new_item["Fields"] = []
                    current_group.append(new_item)
                    return out_builder

                # Item is field
                new_item["Type"] = "Field"
                if "PIC" in items:
                    new_item["Format"] = self._clause_value(items, "PIC")

                    if new_item["Format"] in COBOL_KEYWORDS:
                        raise clause_error

                    # Check for usage clause.
                    usage_index = items.index("PIC") + 2
                    if len(items) > usage_index:
                        if self._valid_usage(items[usage_index]):
                            new_item["Usage"] = items[usage_index]
                        else:
                            raise InvalidCOBOLError(
                                line_number,
                                (
                                    "Usage clause does not match an existing "
                                    "definition at line {}"
                                ),
                            )

            except IndexError:
                # A clause keyword was the last token on the line.
                raise clause_error

            current_group = self._item_level(out_builder, int(current_level))
            current_group.append(new_item)
            return out_builder

        except IndexError:
            # Too few tokens to hold a level and a name.
            raise InvalidCOBOLError(
                line_number, "Input does not resemble COBOL at line {}"
            )

    @staticmethod
    def _split_statement(line: str) -> tuple:
        """
        Split a line into whitespace separated tokens.

        Returns ``(items, line_ended)``. If a token ends with a period, the
        period is removed, every token after it is discarded and
        ``line_ended`` is True. Otherwise all tokens are returned and
        ``line_ended`` is False.
        """
        items = line.split()
        for index, item in enumerate(items):
            if item.endswith("."):
                items = items[: index + 1]
                items[index] = item[:-1]
                # Stop at the first terminator so trailing tokens (sequence
                # numbers, a second statement) cannot undo the result.
                return items, True
        return items, False

    def _item_level(self, struct: list, current_level: int) -> list:
        """
        Returns the list that an item declared on current_level should be
        appended to.
        """
        if not struct or not self.group_levels:
            return struct
        # Deeper than the most recent group: the item goes into the deepest
        # list at the end of the structure.
        if current_level > self.group_levels[-1][1]:
            return self._lowest_list(struct)
        # We only care about the last level of a matching group. Check groups
        # in reverse.
        for group_name, group_level in reversed(self.group_levels):
            # Return the fields of the first group that has a lower level than
            # the current item's level.
            if group_level < current_level:
                return self._lowest_list(struct, group_name)
        return struct

    def _lowest_dict(self, struct: list) -> dict:
        """
        Returns the deepest dictionary at the bottom of provided structure,
        always following the last element of each list.
        """
        last_element = struct[-1]
        if "Fields" in last_element and last_element["Fields"]:
            return self._lowest_dict(last_element["Fields"])
        return last_element

    def _lowest_list(self, struct: list, name: str = None) -> list:
        """
        Returns the deepest list at the bottom of provided structure, always
        following the last element of each list. If a name parameter is
        provided, stop searching and return the "Fields" list of the first
        group on that path with a matching name.
        """
        if not struct:
            return struct
        last_element = struct[-1]
        if "Fields" not in last_element:
            return struct
        if name and last_element["Name"] == name:
            return last_element["Fields"]
        # Keep looking for the named group further down; without passing the
        # name along only a top level group could ever be matched.
        return self._lowest_list(last_element["Fields"], name)

    @staticmethod
    def _clause_value(items: list, clause: str) -> str:
        """
        Returns the item from a list of items following the provided clause.

        Raises ValueError if the clause is not in items and IndexError if it
        is the last item.
        """
        return items[items.index(clause) + 1]

    @staticmethod
    def _valid_usage(usage: str) -> bool:
        """
        Returns bool indicating whether provided usage is valid or not.
        """
        # Not really an indication of valid usages as much as a list of what
        # usages the EBCDIC reader we use supports.
        valid_usages = ["COMP-3"]
        return usage in valid_usages

class InvalidCOBOLError(Exception):
    """
    Raised when a line of the COBOL input cannot be parsed.

    The final message always carries the offending line number. ``msg`` may
    contain a set of empty curly braces (``{}``) marking where the line
    number goes; if it does not, " (Line N)." is appended to the end of the
    message. When ``msg`` is omitted a generic message is used.

    The line number is also available afterwards as the ``line`` attribute.
    """

    def __init__(self, line, msg=None):
        if msg is None:
            # Try to not let this happen
            msg = (
                "There was an unspecified error while parsing the COBOL at "
                "line {}. Please contact a developer for assistance."
            )
        elif "{}" not in msg:
            msg = msg + " (Line {})."
        # Keep the raw line number so callers do not have to parse the text.
        self.line = line
        super().__init__(msg.format(line))






python parsing cobol






share|improve this question







New contributor




Manderton is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
Check out our Code of Conduct.











share|improve this question







New contributor




Manderton is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
Check out our Code of Conduct.









share|improve this question




share|improve this question






New contributor




Manderton is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
Check out our Code of Conduct.









asked 2 days ago









Manderton

161




161




New contributor




Manderton is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
Check out our Code of Conduct.





New contributor





Manderton is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
Check out our Code of Conduct.






Manderton is a new contributor to this site. Take care in asking for clarification, commenting, and answering.
Check out our Code of Conduct.








  • 1




    An alternative approach is to look at cb2xml (sourceforge.net/projects/cb2xml); it will convert the COBOL to XML. cb2xml is written in Java, but there is an example of reading the XML in Python in cb2xml. Alternatively, cb2xml is written using SableCC, and SableCC can generate Python as well as Java.
    – Bruce Martin
    2 days ago










  • there is also stingray sourceforge.net/projects/stingrayreader
    – Bruce Martin
    2 days ago






  • 1




    There are a lot of quirks in COBOL; the existing parsers will deal with many of them. Take COMP-3: it can also be written as COMPUTATIONAL-3, and it can be specified with or without USAGE, at either field or group level.
    – Bruce Martin
    2 days ago














  • 1




    An alternative approach is to look at cb2xml sourceforge.net/projects/cb2xml) it will convert the Cobol to Xml. cb2xml is written java but there is an example of reading the Xml in python. cb2xml. Alternative cb2xml is written using sablecc; sablecc can generate pyhtonas well as java.
    – Bruce Martin
    2 days ago










  • there is also stingray sourceforge.net/projects/stingrayreader
    – Bruce Martin
    2 days ago






  • 1




    There are a lot of quirks in COBOL, the existing parsers will deal with many of them. Take comp-3 it can also be written as computational-3, it can be specified with/without usage and at either field or group level.
    – Bruce Martin
    2 days ago








1




1




An alternative approach is to look at cb2xml sourceforge.net/projects/cb2xml) it will convert the Cobol to Xml. cb2xml is written java but there is an example of reading the Xml in python. cb2xml. Alternative cb2xml is written using sablecc; sablecc can generate pyhtonas well as java.
– Bruce Martin
2 days ago




An alternative approach is to look at cb2xml sourceforge.net/projects/cb2xml) it will convert the Cobol to Xml. cb2xml is written java but there is an example of reading the Xml in python. cb2xml. Alternative cb2xml is written using sablecc; sablecc can generate pyhtonas well as java.
– Bruce Martin
2 days ago












there is also stingray sourceforge.net/projects/stingrayreader
– Bruce Martin
2 days ago




there is also stingray sourceforge.net/projects/stingrayreader
– Bruce Martin
2 days ago




1




1




There are a lot of quirks in COBOL, the existing parsers will deal with many of them. Take comp-3 it can also be written as computational-3, it can be specified with/without usage and at either field or group level.
– Bruce Martin
2 days ago




There are a lot of quirks in COBOL, the existing parsers will deal with many of them. Take comp-3 it can also be written as computational-3, it can be specified with/without usage and at either field or group level.
– Bruce Martin
2 days ago










1 Answer
1






active

oldest

votes

















up vote
0
down vote













group[-1:][0]["Name"]



This expression is reused, so assign it to a variable name.



in [item["Name"] for item in group[:-1]]



For a membership test, a set is a better idea than a list.



int(line.strip()[:2])



This is reused a bunch of times, so make a variable.



line_ended_check = r"(\w|\d|\))+\.(\s|$)"



You shouldn't initialize this regex where it is. It needs to be compiled once, outside of all of your parsing loops, using re.compile.



elif not "{}" in msg:



You should probably use elif "{}" not in msg: .






share|improve this answer





















    Your Answer





    StackExchange.ifUsing("editor", function () {
    return StackExchange.using("mathjaxEditing", function () {
    StackExchange.MarkdownEditor.creationCallbacks.add(function (editor, postfix) {
    StackExchange.mathjaxEditing.prepareWmdForMathJax(editor, postfix, [["\$", "\$"]]);
    });
    });
    }, "mathjax-editing");

    StackExchange.ifUsing("editor", function () {
    StackExchange.using("externalEditor", function () {
    StackExchange.using("snippets", function () {
    StackExchange.snippets.init();
    });
    });
    }, "code-snippets");

    StackExchange.ready(function() {
    var channelOptions = {
    tags: "".split(" "),
    id: "196"
    };
    initTagRenderer("".split(" "), "".split(" "), channelOptions);

    StackExchange.using("externalEditor", function() {
    // Have to fire editor after snippets, if snippets enabled
    if (StackExchange.settings.snippets.snippetsEnabled) {
    StackExchange.using("snippets", function() {
    createEditor();
    });
    }
    else {
    createEditor();
    }
    });

    function createEditor() {
    StackExchange.prepareEditor({
    heartbeatType: 'answer',
    convertImagesToLinks: false,
    noModals: true,
    showLowRepImageUploadWarning: true,
    reputationToPostImages: null,
    bindNavPrevention: true,
    postfix: "",
    imageUploader: {
    brandingHtml: "Powered by u003ca class="icon-imgur-white" href="https://imgur.com/"u003eu003c/au003e",
    contentPolicyHtml: "User contributions licensed under u003ca href="https://creativecommons.org/licenses/by-sa/3.0/"u003ecc by-sa 3.0 with attribution requiredu003c/au003e u003ca href="https://stackoverflow.com/legal/content-policy"u003e(content policy)u003c/au003e",
    allowUrls: true
    },
    onDemand: true,
    discardSelector: ".discard-answer"
    ,immediatelyShowMarkdownHelp:true
    });


    }
    });






    Manderton is a new contributor. Be nice, and check out our Code of Conduct.










    draft saved

    draft discarded


















    StackExchange.ready(
    function () {
    StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fcodereview.stackexchange.com%2fquestions%2f208789%2fcobol-layout-parser-in-python%23new-answer', 'question_page');
    }
    );

    Post as a guest















    Required, but never shown

























    1 Answer
    1






    active

    oldest

    votes








    1 Answer
    1






    active

    oldest

    votes









    active

    oldest

    votes






    active

    oldest

    votes








    up vote
    0
    down vote













    group[-1:][0]["Name"]



    This expression is reused, so assign it to a variable name.



    in [item["Name"] for item in group[:-1]]



    For a membership test, a set is a better idea than a list.



    int(line.strip()[:2])



    This is reused a bunch of times, so make a variable.



    line_ended_check = r"(w|d|))+.(s|$)"



    You shouldn't initialize this regex where it is. It needs to be compiled once, outside of all of your parsing loops, using re.compile.



    elif not "{}" in msg:



    You should probably use elif "{}" not in msg: .






    share|improve this answer

























      up vote
      0
      down vote













      group[-1:][0]["Name"]



      This expression is reused, so assign it to a variable name.



      in [item["Name"] for item in group[:-1]]



      For a membership test, a set is a better idea than a list.



      int(line.strip()[:2])



      This is reused a bunch of times, so make a variable.



      line_ended_check = r"(w|d|))+.(s|$)"



      You shouldn't initialize this regex where it is. It needs to be compiled once, outside of all of your parsing loops, using re.compile.



      elif not "{}" in msg:



      You should probably use elif "{}" not in msg: .






      share|improve this answer























        up vote
        0
        down vote










        up vote
        0
        down vote









        group[-1:][0]["Name"]



        This expression is reused, so assign it to a variable name.



        in [item["Name"] for item in group[:-1]]



        For a membership test, a set is a better idea than a list.



        int(line.strip()[:2])



        This is reused a bunch of times, so make a variable.



        line_ended_check = r"(w|d|))+.(s|$)"



        You shouldn't initialize this regex where it is. It needs to be compiled once, outside of all of your parsing loops, using re.compile.



        elif not "{}" in msg:



        You should probably use elif "{}" not in msg: .






        share|improve this answer












        group[-1:][0]["Name"]



        This expression is reused, so assign it to a variable name.



        in [item["Name"] for item in group[:-1]]



        For a membership test, a set is a better idea than a list.



        int(line.strip()[:2])



        This is reused a bunch of times, so make a variable.



        line_ended_check = r"(w|d|))+.(s|$)"



        You shouldn't initialize this regex where it is. It needs to be compiled once, outside of all of your parsing loops, using re.compile.



        elif not "{}" in msg:



        You should probably use elif "{}" not in msg: .







        share|improve this answer












        share|improve this answer



        share|improve this answer










        answered yesterday









        Reinderien

        1,482616




        1,482616






















            Manderton is a new contributor. Be nice, and check out our Code of Conduct.










            draft saved

            draft discarded


















            Manderton is a new contributor. Be nice, and check out our Code of Conduct.













            Manderton is a new contributor. Be nice, and check out our Code of Conduct.












            Manderton is a new contributor. Be nice, and check out our Code of Conduct.
















            Thanks for contributing an answer to Code Review Stack Exchange!


            • Please be sure to answer the question. Provide details and share your research!

            But avoid



            • Asking for help, clarification, or responding to other answers.

            • Making statements based on opinion; back them up with references or personal experience.


            Use MathJax to format equations. MathJax reference.


            To learn more, see our tips on writing great answers.





            Some of your past answers have not been well-received, and you're in danger of being blocked from answering.


            Please pay close attention to the following guidance:


            • Please be sure to answer the question. Provide details and share your research!

            But avoid



            • Asking for help, clarification, or responding to other answers.

            • Making statements based on opinion; back them up with references or personal experience.


            To learn more, see our tips on writing great answers.




            draft saved


            draft discarded














            StackExchange.ready(
            function () {
            StackExchange.openid.initPostLogin('.new-post-login', 'https%3a%2f%2fcodereview.stackexchange.com%2fquestions%2f208789%2fcobol-layout-parser-in-python%23new-answer', 'question_page');
            }
            );

            Post as a guest















            Required, but never shown





















































            Required, but never shown














            Required, but never shown












            Required, but never shown







            Required, but never shown

































            Required, but never shown














            Required, but never shown












            Required, but never shown







            Required, but never shown







            Popular posts from this blog

            Mont Emei

            Province de Neuquén

            Journaliste