Support non-ASCII characters in headers

Filter out non-ASCII characters in automatically processed headers.

Do this in a way that minimizes the code change: keep manipulating
strings, but strip off non-ASCII characters when reading lines, which
should only remove characters in comments that we don't parse anyway.
This commit is contained in:
Gilles Peskine 2019-12-06 19:20:13 +01:00
parent 81f7909497
commit 49af2d3a4f
2 changed files with 16 additions and 7 deletions

View file

@@ -270,11 +270,16 @@ class MacroCollector:
# Other macro without parameter # Other macro without parameter
return return
_nonascii_re = re.compile(rb'[^\x00-\x7f]+')
_continued_line_re = re.compile(rb'\\\r?\n\Z')
def read_file(self, header_file): def read_file(self, header_file):
for line in header_file: for line in header_file:
while line.endswith('\\\n'): m = re.search(self._continued_line_re, line)
while m:
cont = next(header_file) cont = next(header_file)
line = line[:-2] + cont line = line[:m.start(0)] + cont
m = re.search(self._continued_line_re, line)
line = re.sub(self._nonascii_re, rb'', line).decode('ascii')
self.read_line(line) self.read_line(line)
@staticmethod @staticmethod
@@ -380,7 +385,7 @@ class MacroCollector:
def generate_psa_constants(header_file_names, output_file_name): def generate_psa_constants(header_file_names, output_file_name):
collector = MacroCollector() collector = MacroCollector()
for header_file_name in header_file_names: for header_file_name in header_file_names:
with open(header_file_name) as header_file: with open(header_file_name, 'rb') as header_file:
collector.read_file(header_file) collector.read_file(header_file)
temp_file_name = output_file_name + '.tmp' temp_file_name = output_file_name + '.tmp'
with open(temp_file_name, 'w') as output_file: with open(temp_file_name, 'w') as output_file:

View file

@@ -43,12 +43,14 @@ class read_file_lines:
except that if process(line) raises an exception, then the read_file_lines except that if process(line) raises an exception, then the read_file_lines
snippet annotates the exception with the file name and line number. snippet annotates the exception with the file name and line number.
""" """
def __init__(self, filename): def __init__(self, filename, binary=False):
self.filename = filename self.filename = filename
self.line_number = 'entry' self.line_number = 'entry'
self.generator = None self.generator = None
self.binary = binary
def __enter__(self): def __enter__(self):
self.generator = enumerate(open(self.filename, 'r')) self.generator = enumerate(open(self.filename,
'rb' if self.binary else 'r'))
return self return self
def __iter__(self): def __iter__(self):
for line_number, content in self.generator: for line_number, content in self.generator:
@@ -224,13 +226,15 @@ class Inputs:
if m.group(3): if m.group(3):
self.argspecs[name] = self._argument_split(m.group(3)) self.argspecs[name] = self._argument_split(m.group(3))
_nonascii_re = re.compile(rb'[^\x00-\x7f]+')
def parse_header(self, filename): def parse_header(self, filename):
"""Parse a C header file, looking for "#define PSA_xxx".""" """Parse a C header file, looking for "#define PSA_xxx"."""
with read_file_lines(filename) as lines: with read_file_lines(filename, binary=True) as lines:
for line in lines: for line in lines:
line = re.sub(self._nonascii_re, rb'', line).decode('ascii')
self.parse_header_line(line) self.parse_header_line(line)
_macro_identifier_re = r'[A-Z]\w+' _macro_identifier_re = re.compile(r'[A-Z]\w+')
def generate_undeclared_names(self, expr): def generate_undeclared_names(self, expr):
for name in re.findall(self._macro_identifier_re, expr): for name in re.findall(self._macro_identifier_re, expr):
if name not in self.all_declared: if name not in self.all_declared: