Lift some code out of parse_identifiers

Make parse_identifiers less complex. Pylint was complaining that it had too
many local variables, and it had a point.

* Lift the constants identifier_regex and exclusion_lines to class
  constants (renamed to uppercase because they're constants).
* Lift the per-file loop into a new function parse_identifiers_in_file.

No intended behavior change.

Signed-off-by: Gilles Peskine <Gilles.Peskine@arm.com>
This commit is contained in:
Gilles Peskine 2021-11-16 20:56:47 +01:00
parent 7493c4017a
commit b3f4dd5c81

View file

@ -457,6 +457,105 @@ class CodeParser():
return enum_consts
IDENTIFIER_REGEX = re.compile(
# Match " something(a" or " *something(a". Functions.
# Assumptions:
# - function definition from return type to one of its arguments is
# all on one line
# - function definition line only contains alphanumeric, asterisk,
# underscore, and open bracket
r".* \**(\w+) *\( *\w|"
# Match "(*something)(".
r".*\( *\* *(\w+) *\) *\(|"
# Match names of named data structures.
r"(?:typedef +)?(?:struct|union|enum) +(\w+)(?: *{)?$|"
# Match names of typedef instances, after closing bracket.
r"}? *(\w+)[;[].*"
)
# The regex below is indented for clarity.
EXCLUSION_LINES = re.compile(
r"^("
r"extern +\"C\"|" # pylint: disable=bad-continuation
r"(typedef +)?(struct|union|enum)( *{)?$|"
r"} *;?$|"
r"$|"
r"//|"
r"#"
r")"
)
def parse_identifiers_in_file(self, header_file, identifiers):
"""
Parse all lines of a header where a function/enum/struct/union/typedef
identifier is declared, based on some regex and heuristics. Highly
dependent on formatting style.
Append found matches to the list ``identifiers``.
"""
with open(header_file, "r", encoding="utf-8") as header:
in_block_comment = False
# The previous line variable is used for concatenating lines
# when identifiers are formatted and spread across multiple
# lines.
previous_line = ""
for line_no, line in enumerate(header):
# Terminate current comment?
if in_block_comment:
line = re.sub(r".*?\*/", r"", line, 1)
in_block_comment = False
# Remove full comments and string literals
line = re.sub(r'/\*.*?\*/|(")(?:[^\\\"]|\\.)*"',
lambda s: '""' if s.group(1) else ' ',
line)
# Start an unfinished comment?
m = re.match(r"/\*", line)
if m:
in_block_comment = True
line = line[:m.end(0)]
if self.EXCLUSION_LINES.search(line):
previous_line = ""
continue
# If the line contains only space-separated alphanumeric
# characters (or underscore, asterisk, or, open bracket),
# and nothing else, high chance it's a declaration that
# continues on the next line
if re.search(r"^([\w\*\(]+\s+)+$", line):
previous_line += line
continue
# If previous line seemed to start an unfinished declaration
# (as above), concat and treat them as one.
if previous_line:
line = previous_line.strip() + " " + line.strip() + "\n"
previous_line = ""
# Skip parsing if line has a space in front = heuristic to
# skip function argument lines (highly subject to formatting
# changes)
if line[0] == " ":
continue
identifier = self.IDENTIFIER_REGEX.search(line)
if not identifier:
continue
# Find the group that matched, and append it
for group in identifier.groups():
if not group:
continue
identifiers.append(Match(
header_file,
line,
line_no,
identifier.span(),
group))
def parse_identifiers(self, include, exclude=None):
"""
Parse all lines of a header where a function/enum/struct/union/typedef
@ -469,100 +568,13 @@ class CodeParser():
Returns a List of Match objects with identifiers.
"""
identifier_regex = re.compile(
# Match " something(a" or " *something(a". Functions.
# Assumptions:
# - function definition from return type to one of its arguments is
# all on one line
# - function definition line only contains alphanumeric, asterisk,
# underscore, and open bracket
r".* \**(\w+) *\( *\w|"
# Match "(*something)(".
r".*\( *\* *(\w+) *\) *\(|"
# Match names of named data structures.
r"(?:typedef +)?(?:struct|union|enum) +(\w+)(?: *{)?$|"
# Match names of typedef instances, after closing bracket.
r"}? *(\w+)[;[].*"
)
# The regex below is indented for clarity.
exclusion_lines = re.compile(
r"^("
r"extern +\"C\"|" # pylint: disable=bad-continuation
r"(typedef +)?(struct|union|enum)( *{)?$|"
r"} *;?$|"
r"$|"
r"//|"
r"#"
r")"
)
files = self.get_files(include, exclude)
self.log.debug("Looking for identifiers in {} files".format(len(files)))
identifiers = []
for header_file in files:
with open(header_file, "r", encoding="utf-8") as header:
in_block_comment = False
# The previous line variable is used for concatenating lines
# when identifiers are formatted and spread across multiple
# lines.
previous_line = ""
for line_no, line in enumerate(header):
# Terminate current comment?
if in_block_comment:
line = re.sub(r".*?\*/", r"", line, 1)
in_block_comment = False
# Remove full comments and string literals
line = re.sub(r'/\*.*?\*/|(")(?:[^\\\"]|\\.)*"',
lambda s: '""' if s.group(1) else ' ',
line)
# Start an unfinished comment?
m = re.match(r"/\*", line)
if m:
in_block_comment = True
line = line[:m.end(0)]
if exclusion_lines.search(line):
previous_line = ""
continue
# If the line contains only space-separated alphanumeric
# characters (or underscore, asterisk, or, open bracket),
# and nothing else, high chance it's a declaration that
# continues on the next line
if re.search(r"^([\w\*\(]+\s+)+$", line):
previous_line += line
continue
# If previous line seemed to start an unfinished declaration
# (as above), concat and treat them as one.
if previous_line:
line = previous_line.strip() + " " + line.strip() + "\n"
previous_line = ""
# Skip parsing if line has a space in front = heuristic to
# skip function argument lines (highly subject to formatting
# changes)
if line[0] == " ":
continue
identifier = identifier_regex.search(line)
if not identifier:
continue
# Find the group that matched, and append it
for group in identifier.groups():
if not group:
continue
identifiers.append(Match(
header_file,
line,
line_no,
identifier.span(),
group))
self.parse_identifiers_in_file(header_file, identifiers)
return identifiers