From 55c6c87d951823b31e353a6bad5d7cc2e7957557 Mon Sep 17 00:00:00 2001
From: Yuto Takano <yuto.takano@arm.com>
Date: Mon, 9 Aug 2021 15:35:19 +0100
Subject: [PATCH] Separate code parsing and name checking in two classes

Signed-off-by: Yuto Takano <yuto.takano@arm.com>
---
 tests/scripts/check_names.py               | 191 +++++++++++----------
 tests/scripts/list_internal_identifiers.py |   5 +-
 2 files changed, 99 insertions(+), 97 deletions(-)

diff --git a/tests/scripts/check_names.py b/tests/scripts/check_names.py
index 9a7f3918e..957701433 100755
--- a/tests/scripts/check_names.py
+++ b/tests/scripts/check_names.py
@@ -20,16 +20,26 @@ This script confirms that the naming of all symbols and identifiers in Mbed TLS
 are consistent with the house style and are also self-consistent. It only runs
 on Linux and macOS since it depends on nm.
 
-The script performs the following checks:
+It contains two major Python classes, CodeParser and NameChecker. They both have
+a comprehensive "run-all" function (comprehensive_parse() and perform_checks())
+but the individual functions can also be used for specific needs.
+
+CodeParser makes heavy use of regular expressions to parse the code, and is
+dependent on the current code formatting. Many Python C parser libraries require
+preprocessed C code, which means no macro parsing. Compiler tools are also not
+very helpful when we want the exact location in the original source (which
+becomes impossible when e.g. comments are stripped).
+
+NameChecker performs the following checks:
 
 - All exported and available symbols in the library object files, are explicitly
   declared in the header files. This uses the nm command.
 - All macros, constants, and identifiers (function names, struct names, etc)
-  follow the required pattern.
+  follow the required regex pattern.
 - Typo checking: All words that begin with MBED exist as macros or constants.
 
-Returns 0 on success, 1 on test failure, and 2 if there is a script error or a
-subprocess error. Must be run from Mbed TLS root.
+The script returns 0 on success, 1 on test failure, and 2 if there is a script
+error error. Must be run from Mbed TLS root.
 """
 
 import argparse
@@ -168,16 +178,15 @@ class Typo(Problem): # pylint: disable=too-few-public-methods
             .format(self.match.filename, self.match.pos[0], self.match.name)
         ) + "\n" + str(self.match)
 
-class NameCheck():
+class CodeParser():
     """
-    Representation of the core name checking operation performed by this script.
-    Shares a common logger, and a shared return code.
+    Class for retrieving files and parsing the code. This can be used
+    independently of the checks that NameChecker performs, for example for
+    list_internal_identifiers.py.
     """
-    def __init__(self, verbose=False):
-        self.log = None
+    def __init__(self, log):
+        self.log = log
         self.check_repo_path()
-        self.return_code = 0
-        self.setup_logger(verbose)
 
         # Memo for storing "glob expression": set(filepaths)
         self.files = {}
@@ -185,9 +194,6 @@ class NameCheck():
         # Globally excluded filenames
         self.excluded_files = ["**/bn_mul", "**/compat-2.x.h"]
 
-        # Will contain the parse result after a comprehensive parse
-        self.parse_result = {}
-
     @staticmethod
     def check_repo_path():
         """
@@ -197,71 +203,12 @@ class NameCheck():
         if not all(os.path.isdir(d) for d in ["include", "library", "tests"]):
             raise Exception("This script must be run from Mbed TLS root")
 
-    def set_return_code(self, return_code):
-        if return_code > self.return_code:
-            self.log.debug("Setting new return code to {}".format(return_code))
-            self.return_code = return_code
-
-    def setup_logger(self, verbose=False):
+    def comprehensive_parse(self):
         """
-        Set up a logger and set the change the default logging level from
-        WARNING to INFO. Loggers are better than print statements since their
-        verbosity can be controlled.
-        """
-        self.log = logging.getLogger()
-        if verbose:
-            self.log.setLevel(logging.DEBUG)
-        else:
-            self.log.setLevel(logging.INFO)
-        self.log.addHandler(logging.StreamHandler())
+        Comprehensive ("default") function to call each parsing function and
+        retrieve various elements of the code, together with the source location.
 
-    def get_files(self, include_wildcards, exclude_wildcards):
-        """
-        Get all files that match any of the UNIX-style wildcards. While the
-        check_names script is designed only for use on UNIX/macOS (due to nm),
-        this function alone would work fine on Windows even with forward slashes
-        in the wildcard.
-
-        Args:
-        * include_wildcards: a List of shell-style wildcards to match filepaths.
-        * exclude_wildacrds: a List of shell-style wildcards to exclude.
-
-        Returns a List of relative filepaths.
-        """
-        accumulator = set()
-
-        # exclude_wildcards may be None. Also, consider the global exclusions.
-        exclude_wildcards = (exclude_wildcards or []) + self.excluded_files
-
-        # Perform set union on the glob results. Memoise individual sets.
-        for include_wildcard in include_wildcards:
-            if include_wildcard not in self.files:
-                self.files[include_wildcard] = set(glob.glob(
-                    include_wildcard,
-                    recursive=True
-                ))
-
-            accumulator = accumulator.union(self.files[include_wildcard])
-
-        # Perform set difference to exclude. Also use the same memo since their
-        # behaviour is pretty much identical and it can benefit from the cache.
-        for exclude_wildcard in exclude_wildcards:
-            if exclude_wildcard not in self.files:
-                self.files[exclude_wildcard] = set(glob.glob(
-                    exclude_wildcard,
-                    recursive=True
-                ))
-
-            accumulator = accumulator.difference(self.files[exclude_wildcard])
-
-        return list(accumulator)
-
-    def parse_names_in_source(self):
-        """
-        Comprehensive function to call each parsing function and retrieve
-        various elements of the code, together with their source location.
-        Puts the parsed values in the internal variable self.parse_result, so
-        they can be used from perform_checks().
+        Returns a dict of parsed item key to the corresponding List of Matches.
         """
         self.log.info("Parsing source code...")
         self.log.debug(
@@ -315,8 +262,7 @@ class NameCheck():
         self.log.debug("  {} Enum Constants".format(len(enum_consts)))
         self.log.debug("  {} Identifiers".format(len(identifiers)))
         self.log.debug("  {} Exported Symbols".format(len(symbols)))
-        self.log.info("Analysing...")
-        self.parse_result = {
+        return {
             "macros": actual_macros,
             "enum_consts": enum_consts,
             "identifiers": identifiers,
@@ -324,6 +270,47 @@ class NameCheck():
             "mbed_words": mbed_words
         }
 
+    def get_files(self, include_wildcards, exclude_wildcards):
+        """
+        Get all files that match any of the UNIX-style wildcards. While the
+        check_names script is designed only for use on UNIX/macOS (due to nm),
+        this function alone would work fine on Windows even with forward slashes
+        in the wildcard.
+
+        Args:
+        * include_wildcards: a List of shell-style wildcards to match filepaths.
+        * exclude_wildcards: a List of shell-style wildcards to exclude.
+
+        Returns a List of relative filepaths.
+        """
+        accumulator = set()
+
+        # exclude_wildcards may be None. Also, consider the global exclusions.
+        exclude_wildcards = (exclude_wildcards or []) + self.excluded_files
+
+        # Perform set union on the glob results. Memoise individual sets.
+        for include_wildcard in include_wildcards:
+            if include_wildcard not in self.files:
+                self.files[include_wildcard] = set(glob.glob(
+                    include_wildcard,
+                    recursive=True
+                ))
+
+            accumulator = accumulator.union(self.files[include_wildcard])
+
+        # Perform set difference to exclude. Also use the same memo since their
+        # behaviour is pretty much identical and it can benefit from the cache.
+        for exclude_wildcard in exclude_wildcards:
+            if exclude_wildcard not in self.files:
+                self.files[exclude_wildcard] = set(glob.glob(
+                    exclude_wildcard,
+                    recursive=True
+                ))
+
+            accumulator = accumulator.difference(self.files[exclude_wildcard])
+
+        return list(accumulator)
+
     def parse_macros(self, include, exclude=None):
         """
         Parse all macros defined by #define preprocessor directives.
@@ -456,11 +443,11 @@ class NameCheck():
             # Match " something(a" or " *something(a". Functions.
             # Assumptions:
             # - function definition from return type to one of its arguments is
-            #   all on one line (enforced by the previous_line concat below)
+            #   all on one line
             # - function definition line only contains alphanumeric, asterisk,
             #   underscore, and open bracket
             r".* \**(\w+) *\( *\w|"
-            # Match "(*something)(". Flexible with spaces.
+            # Match "(*something)(".
             r".*\( *\* *(\w+) *\) *\(|"
             # Match names of named data structures.
             r"(?:typedef +)?(?:struct|union|enum) +(\w+)(?: *{)?$|"
@@ -485,7 +472,7 @@ class NameCheck():
         for header_file in files:
             with open(header_file, "r", encoding="utf-8") as header:
                 in_block_comment = False
-                # The previous line varibale is used for concatenating lines
+                # The previous line variable is used for concatenating lines
                 # when identifiers are formatted and spread across multiple.
                 previous_line = ""
 
@@ -596,7 +583,6 @@ class NameCheck():
             )
         except subprocess.CalledProcessError as error:
             self.log.debug(error.output)
-            self.set_return_code(2)
             raise error
         finally:
             # Put back the original config regardless of there being errors.
@@ -614,7 +600,7 @@ class NameCheck():
         Does not return the position data since it is of no use.
 
         Args:
-        * object_files: a List of compiled object files to search through.
+        * object_files: a List of compiled object filepaths to search through.
 
         Returns a List of unique symbols defined and used in any of the object
         files.
@@ -646,18 +632,24 @@ class NameCheck():
 
         return symbols
 
+class NameChecker():
+    """
+    Representation of the core name checking operation performed by this script.
+    """
+    def __init__(self, parse_result, log):
+        self.parse_result = parse_result
+        self.log = log
+
     def perform_checks(self, quiet=False):
         """
-        Perform each check in order, output its PASS/FAIL status. Maintain an
-        overall test status, and output that at the end.
-        Assumes parse_names_in_source() was called before this.
+        A comprehensive checker that performs each check in order, and outputs
+        a final verdict.
 
         Args:
         * quiet: whether to hide detailed problem explanation.
         """
         self.log.info("=============")
         problems = 0
-
         problems += self.check_symbols_declared_in_header(quiet)
 
         pattern_checks = [
@@ -677,8 +669,10 @@ class NameCheck():
                 self.log.info("Remove --quiet to see explanations.")
             else:
                 self.log.info("Use --quiet for minimal output.")
+            return 1
         else:
             self.log.info("PASS")
+            return 0
 
     def check_symbols_declared_in_header(self, quiet):
         """
@@ -782,7 +776,6 @@ class NameCheck():
         * problems: a List of encountered Problems
         """
         if problems:
-            self.set_return_code(1)
             self.log.info("{}: FAIL\n".format(name))
             for problem in problems:
                 problem.quiet = quiet
@@ -792,8 +785,8 @@ class NameCheck():
 
 def main():
     """
-    Perform argument parsing, and create an instance of NameCheck to begin the
-    core operation.
+    Perform argument parsing, and create an instance of CodeParser and
+    NameChecker to begin the core operation.
     """
     parser = argparse.ArgumentParser(
         formatter_class=argparse.RawDescriptionHelpFormatter,
@@ -816,14 +809,22 @@ def main():
 
     args = parser.parse_args()
 
+    # Configure the global logger, which is then passed to the classes below
+    log = logging.getLogger()
+    log.setLevel(logging.DEBUG if args.verbose else logging.INFO)
+    log.addHandler(logging.StreamHandler())
+
     try:
-        name_check = NameCheck(verbose=args.verbose)
-        name_check.parse_names_in_source()
-        name_check.perform_checks(quiet=args.quiet)
-        sys.exit(name_check.return_code)
+        code_parser = CodeParser(log)
+        parse_result = code_parser.comprehensive_parse()
     except Exception: # pylint: disable=broad-except
         traceback.print_exc()
         sys.exit(2)
 
+    name_checker = NameChecker(parse_result, log)
+    return_code = name_checker.perform_checks(quiet=args.quiet)
+
+    sys.exit(return_code)
+
 if __name__ == "__main__":
     main()
diff --git a/tests/scripts/list_internal_identifiers.py b/tests/scripts/list_internal_identifiers.py
index 75b1646aa..64a4c3531 100755
--- a/tests/scripts/list_internal_identifiers.py
+++ b/tests/scripts/list_internal_identifiers.py
@@ -29,9 +29,10 @@ Must be run from Mbed TLS root.
 """
 
 import argparse
+import logging
 import traceback
 import sys
-from check_names import NameCheck
+from check_names import CodeParser
 
 def main():
     parser = argparse.ArgumentParser(
@@ -44,7 +45,7 @@ def main():
     parser.parse_args()
 
     try:
-        name_check = NameCheck()
+        name_check = CodeParser(logging.getLogger())
         result = name_check.parse_identifiers([
             "include/mbedtls/*_internal.h",
             "library/*.h"