forked from bartvdbraak/blender
code checker: add 'make check_struct_comments'
This utility uses Python's Clang module to validate that the comments in struct initializers match the struct member names. `cmake_static_check_clang.py` has been written so that other checkers can be added in the future.
This commit is contained in:
parent
17563e9a91
commit
2c9aa2c8e5
@ -59,6 +59,7 @@ Static Source Code Checking
|
||||
|
||||
* check_cppcheck: Run blender source through cppcheck (C & C++).
|
||||
* check_clang_array: Run blender source through clang array checking script (C & C++).
|
||||
* check_struct_comments: Check struct member comments are correct (C & C++).
|
||||
* check_deprecated: Check if there is any deprecated code to remove.
|
||||
* check_descriptions: Check for duplicate/invalid descriptions.
|
||||
* check_licenses: Check license headers follow the SPDX license specification,
|
||||
@ -468,6 +469,13 @@ check_cppcheck: .FORCE
|
||||
"$(BLENDER_DIR)/check_cppcheck.txt"
|
||||
@echo "written: check_cppcheck.txt"
|
||||
|
||||
check_struct_comments: .FORCE
|
||||
@$(CMAKE_CONFIG)
|
||||
@cd "$(BUILD_DIR)" ; \
|
||||
$(PYTHON) \
|
||||
"$(BLENDER_DIR)/build_files/cmake/cmake_static_check_clang.py" \
|
||||
--checks=struct_comments --match=".*"
|
||||
|
||||
check_clang_array: .FORCE
|
||||
@$(CMAKE_CONFIG)
|
||||
@cd "$(BUILD_DIR)" ; \
|
||||
|
544
build_files/cmake/cmake_static_check_clang.py
Normal file
544
build_files/cmake/cmake_static_check_clang.py
Normal file
@ -0,0 +1,544 @@
|
||||
#!/usr/bin/env python3
|
||||
# SPDX-FileCopyrightText: 2023 Blender Foundation
|
||||
#
|
||||
# SPDX-License-Identifier: GPL-2.0-or-later
|
||||
|
||||
"""
|
||||
A command line utility to check Blender's source code with CLANG's Python module.
|
||||
|
||||
To call this directly:
|
||||
|
||||
export CLANG_LIB_DIR=/usr/lib64
|
||||
cd {BUILD_DIR}
|
||||
python ../blender/build_files/cmake/cmake_static_check_clang.py --match=".*" --checks=struct_comments
|
||||
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
|
||||
from typing import (
|
||||
Any,
|
||||
Dict,
|
||||
List,
|
||||
Type,
|
||||
Sequence,
|
||||
Tuple,
|
||||
)
|
||||
|
||||
|
||||
import project_source_info
|
||||
|
||||
# pylint: disable-next=import-outside-toplevel
|
||||
import clang # type: ignore
|
||||
# pylint: disable-next=import-outside-toplevel
|
||||
import clang.cindex # type: ignore
|
||||
from clang.cindex import (
|
||||
CursorKind,
|
||||
)
|
||||
|
||||
# Only for readability.
ClangNode = Any
ClangTranslationUnit = Any
ClangSourceLocation = Any


# Enable verbose printing when the `VERBOSE` environment variable is set (to any value).
USE_VERBOSE = os.environ.get("VERBOSE", None) is not None

# Turn off for debugging.
USE_MULTIPROCESS = True

# Optional: directory containing the `clang` Python bindings (added to `sys.path` below).
CLANG_BIND_DIR = os.environ.get("CLANG_BIND_DIR")
# Optional: directory containing the CLANG shared library.
CLANG_LIB_DIR = os.environ.get("CLANG_LIB_DIR")

# Warn (don't fail) when unset, the system-wide installation may still work.
if CLANG_BIND_DIR is None:
    print("$CLANG_BIND_DIR python binding dir not set")
if CLANG_LIB_DIR is None:
    print("$CLANG_LIB_DIR clang lib dir not set")

if CLANG_LIB_DIR:
    clang.cindex.Config.set_library_path(CLANG_LIB_DIR)
if CLANG_BIND_DIR:
    sys.path.append(CLANG_BIND_DIR)


# Prefixes passed to `project_source_info.build_info` as its `ignore_prefix_list`.
CHECKER_IGNORE_PREFIX = [
    "extern",
]

# Project-relative source paths (converted to native separators) never checked.
# Every entry must match an existing file, see `source_info_filter`.
CHECKER_EXCLUDE_SOURCE_FILES = set(os.path.join(*f.split("/")) for f in (
    # Skip parsing these large (mostly data files).
    "source/blender/editors/space_text/text_format_pov.cc",
    "source/blender/editors/space_text/text_format_pov_ini.cc",
))
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# Utility Functions
|
||||
|
||||
def clang_source_location_as_str(source_location: ClangSourceLocation) -> str:
    """Format a CLANG source location as ``file:line:column:`` for report output."""
    loc_file = str(source_location.file)
    loc_line = source_location.line
    loc_column = source_location.column
    return "{:s}:{:d}:{:d}:".format(loc_file, loc_line, loc_column)
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# Checkers
|
||||
|
||||
class ClangChecker:
    """
    Base class for checkers.

    Sub-classes must override ``check_source``,
    and may override ``setup`` / ``teardown`` for per-run shared state.

    Notes:

    - The function ``check_source`` takes file_data as bytes instead of a string
      because the offsets provided by CLANG are byte offsets.
      While the offsets could be converted into UNICODE offsets,
      there doesn't seem to be an efficient & convenient way to do that.
    """
    __slots__ = ()

    def __new__(cls, *args: Tuple[Any], **kwargs: Dict[str, Any]) -> Any:
        # Checkers are static namespaces, never instances.
        raise RuntimeError("%s should not be instantiated" % cls)

    @staticmethod
    def check_source(
            _filepath: str,
            _file_data: bytes,
            _tu: ClangTranslationUnit,
            _shared_check_data: Any,
    ) -> List[str]:
        """Return a list of report lines for ``_filepath`` (must be overridden)."""
        # NOTE: an unreachable `return []` after this raise has been removed,
        # and the message typo ("it's") corrected.
        raise RuntimeError("This function must be overridden by its subclass!")

    @staticmethod
    def setup() -> Any:
        """Create & return shared data passed into every ``check_source`` call."""
        return None

    @staticmethod
    def teardown(_shared_check_data: Any) -> None:
        """Release any resources created by ``setup``."""
        pass
|
||||
|
||||
|
||||
class clang_checkers:
    # Fake module: a namespace holding one `ClangChecker` subclass per check ID,
    # looked up by name (see `check_class_from_id` & `check_function_get_all`).

    class struct_comments(ClangChecker):
        """
        Ensure comments in struct declarations match the members of the struct, e.g:

        SomeStruct var = {
            /*name*/ "Text",
            /*children*/ nullptr,
            /*flag*/ 0,
        };

        Will generate a warning if any of the names in the prefix comments don't match the struct member names.
        """

        # Map: struct-type name -> {comment text -> {member names}} of known
        # mismatches that must not be reported.
        _struct_comments_ignore = {
            # `PyTypeObject` uses compile time members that vary (see: #PyVarObject_HEAD_INIT macro)
            # While some clever comment syntax could be supported to signify multiple/optional members
            # this is such a specific case that it's simpler to skip this warning.
            "PyTypeObject": {"ob_base": {"ob_size"}},
        }

        @staticmethod
        def _struct_check_comments_recursive(
                # Static (unchanged for each recursion).
                filepath: str,
                file_data: bytes,
                # Different for each recursion.
                node: ClangNode,
                node_parent: ClangNode,
                level: int,
                # Used to build data.
                struct_decl_map: Dict[str, ClangNode],
                struct_type_map: Dict[str, str],
                output: List[str],
        ) -> None:
            # Depth-first AST walk:
            # - Record every struct declaration into `struct_decl_map`
            #   (declarations from headers are needed, so all nodes are visited).
            # - For every initializer list (`INIT_LIST_EXPR`) located in `filepath`,
            #   compare the `/*name*/` prefix comments against the declared member
            #   names & append "OK:"/"NONE:"/"FAIL:" report lines to `output`.
            # `node_parent` is None for the translation-unit root (see `check_source`).

            # Needed to read back the node.
            if USE_VERBOSE:
                print("TRY:", node.kind, node.spelling, len(list(node.get_tokens())), level, node.location)

            # if node.kind == CursorKind.VAR_DECL and node.spelling == "Vector_NumMethods":
            #     import IPython
            #     IPython.embed()

            if node.kind == CursorKind.STRUCT_DECL:
                struct_type = node.spelling.strip()
                if not struct_type:
                    # The parent may be a `typedef [..] TypeID` where `[..]` is `struct { a; b; c; }`.
                    # Inspect the parent.
                    if node_parent is not None and (node_parent.kind == CursorKind.TYPEDEF_DECL):
                        tokens = list(node_parent.get_tokens())
                        if tokens[0].spelling == "typedef":
                            struct_type = tokens[-1].spelling

                struct_decl_map[struct_type] = node

            # Ignore declarations for anything defined outside this file.
            if str(node.location.file) == filepath:
                if node.kind == CursorKind.INIT_LIST_EXPR:
                    if USE_VERBOSE:
                        print(node.spelling, node.location)
                    # Split to avoid `const struct` .. and similar.
                    # NOTE: there may be an array size suffix, e.g. `[4]`.
                    # This could be supported.
                    struct_type = node.type.spelling.split()[-1]
                    struct = struct_decl_map.get(struct_type)
                    if struct is None:
                        if USE_VERBOSE:
                            print("NOT FOUND:", struct_type)
                        # Fall back to an aliased type name (if known).
                        struct_type = struct_type_map.get(struct_type)
                        if struct_type is not None:
                            struct = struct_decl_map.get(struct_type)

                    if USE_VERBOSE:
                        print("INSPECTING STRUCT:", struct_type)
                    if struct is not None:
                        # Declared field names, in declaration order.
                        member_names = [
                            node_child.spelling for node_child in struct.get_children()
                            if node_child.kind == CursorKind.FIELD_DECL
                        ]
                        # if struct_type == "PyMappingMethods":
                        #     import IPython
                        #     IPython.embed()

                        children = list(node.get_children())
                        comment_names = []

                        # Set to true when there is a comment directly before a value,
                        # this is needed because:
                        # - Comments on the previous line are rarely intended to be identifiers of the struct member.
                        # - Comments which _are_ intended to be identifiers can be wrapped onto new-lines
                        #   so they should not be ignored.
                        #
                        # While it's possible every member is wrapped onto a new-line,
                        # this is highly unlikely.
                        comment_names_prefix_any = False

                        for node_child in children:
                            # Extract the content before the child
                            # (typically a C-style comment containing the struct member).
                            end = min(node_child.location.offset, len(file_data))

                            # It's possible this ID has a preceding "name::space::etc"
                            # which should be skipped.
                            while end > 0 and ((ch := bytes((file_data[end - 1],))).isalpha() or ch == b":"):
                                end -= 1

                            # Step back over white-space, noting any newline crossed.
                            has_newline = False
                            while end > 0:
                                ch = bytes((file_data[end - 1],))
                                if ch in {b"\t", b" "}:
                                    end -= 1
                                elif ch == b"\n":
                                    end -= 1
                                    has_newline = True
                                else:
                                    break

                            # Take the text from the beginning of that line up to `end`.
                            beg = end - 1
                            while beg != 0 and bytes((file_data[beg],)) != b"\n":
                                beg -= 1
                            text = file_data[beg:end]
                            if text.lstrip().startswith(b"/*"):
                                if not has_newline:
                                    comment_names_prefix_any = True
                            else:
                                # Not a C-style comment: record an empty entry
                                # so indices stay aligned with `children`.
                                text = b""
                            comment_names.append(text.decode('utf-8'))

                        if USE_VERBOSE:
                            print(member_names)
                            print(comment_names)

                        total = min(len(member_names), len(comment_names))

                        if total != 0 and comment_names_prefix_any:
                            result = [""] * total
                            count_found = 0
                            count_invalid = 0
                            for i in range(total):
                                comment = comment_names[i]
                                if "/*" in comment and "*/" in comment:
                                    # Strip `/* .. */` down to the bare name.
                                    comment = comment.strip().strip("/").strip("*")
                                    if comment == member_names[i]:
                                        count_found += 1
                                    else:
                                        # Check the white-list of known mismatches before warning.
                                        suppress_warning = False
                                        if (
                                                skip_members_table :=
                                                clang_checkers.struct_comments._struct_comments_ignore.get(
                                                    node_parent.type.spelling,
                                                )
                                        ) is not None:
                                            if (skip_members := skip_members_table.get(comment)) is not None:
                                                if member_names[i] in skip_members:
                                                    suppress_warning = True

                                        if not suppress_warning:
                                            result[i] = "Incorrect! found \"{:s}\" expected \"{:s}\"".format(
                                                comment, member_names[i])
                                            count_invalid += 1
                                else:
                                    result[i] = "No comment for \"{:s}\"".format(member_names[i])
                            if count_found == 0 and count_invalid == 0:
                                # No comments used, skip this as not all declaration use this comment style.
                                output.append(
                                    "NONE: {:s} {:s}".format(
                                        clang_source_location_as_str(node.location),
                                        node.type.spelling,
                                    )
                                )
                            elif count_found != total:
                                # At least one member failed: report each problem line.
                                for i in range(total):
                                    if result[i]:
                                        output.append(
                                            "FAIL: {:s} {:s}".format(
                                                clang_source_location_as_str(children[i].location),
                                                result[i],
                                            )
                                        )
                            else:
                                output.append(
                                    "OK: {:s} {:s}".format(
                                        clang_source_location_as_str(node.location),
                                        node.type.spelling,
                                    )
                                )

            # Always recurse (even outside `filepath`) so header declarations are recorded.
            for node_child in node.get_children():
                clang_checkers.struct_comments._struct_check_comments_recursive(
                    filepath, file_data,
                    node_child, node, level + 1,
                    struct_decl_map, struct_type_map, output,
                )

        @staticmethod
        def check_source(
                filepath: str,
                file_data: bytes,
                tu: ClangTranslationUnit,
                _shared_check_data: Any) -> List[str]:
            """Walk the translation unit & return all struct-comment report lines."""
            output: List[str] = []

            struct_decl_map: Dict[str, Any] = {}
            struct_type_map: Dict[str, str] = {}
            clang_checkers.struct_comments._struct_check_comments_recursive(
                filepath, file_data,
                tu.cursor, None, 0,
                struct_decl_map, struct_type_map, output,
            )

            return output
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# Checker Class Access
|
||||
|
||||
def check_function_get_all() -> List[str]:
    """Return the sorted names of every checker class in `clang_checkers`."""
    return sorted(
        name for name in dir(clang_checkers)
        if isinstance((value := getattr(clang_checkers, name)), type)
        if issubclass(value, ClangChecker)
    )
|
||||
|
||||
|
||||
def check_class_from_id(name: str) -> Type[ClangChecker]:
    """Look up the checker class named ``name`` in `clang_checkers`."""
    checker_cls = getattr(clang_checkers, name)
    assert issubclass(checker_cls, ClangChecker)
    # MYPY 0.812 doesn't recognize the assert above.
    return checker_cls  # type: ignore
|
||||
|
||||
|
||||
def check_docstring_from_id(name: str) -> str:
    """Return the named checker's doc-string, de-dented, ending with one newline."""
    from textwrap import dedent
    doc = getattr(clang_checkers, name).__doc__
    return dedent(doc or '').strip('\n') + '\n'
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# Generic Clang Checker
|
||||
|
||||
def check_source_file(
        filepath: str,
        args: Sequence[str],
        check_ids: Sequence[str],
        shared_check_data_foreach_check: Sequence[Any],
) -> str:
    """
    Parse ``filepath`` with CLANG (using compiler ``args``) and run every
    checker in ``check_ids`` over the translation unit.

    Return all report lines joined with newlines, or an empty string.
    """
    clang_index = clang.cindex.Index.create()
    try:
        translation_unit = clang_index.parse(filepath, args)
    except clang.cindex.TranslationUnitLoadError as ex:
        return "PARSE_ERROR: {:s} {!r}".format(filepath, ex)

    # Read as bytes: checkers operate on CLANG's byte offsets.
    with open(filepath, "rb") as fh:
        file_data = fh.read()

    # We don't really care what we are looking at,
    # each checker scans the entire file itself.
    report_lines: List[str] = []
    for check_id, shared_data in zip(check_ids, shared_check_data_foreach_check):
        checker_cls = check_class_from_id(check_id)
        report_lines.extend(checker_cls.check_source(filepath, file_data, translation_unit, shared_data))

    if not report_lines:
        return ""
    return "\n".join(report_lines)
|
||||
|
||||
|
||||
def check_source_file_for_imap(args: Tuple[str, Sequence[str], Sequence[str], Sequence[Any]]) -> str:
    """Unpack one argument tuple for `multiprocessing.Pool.imap` (which has no ``istarmap``)."""
    filepath, clang_args, check_ids, shared_check_data_foreach_check = args
    return check_source_file(filepath, clang_args, check_ids, shared_check_data_foreach_check)
|
||||
|
||||
|
||||
def source_info_filter(
        source_info: List[Tuple[str, List[str], List[str]]],
        regex_list: Sequence[re.Pattern[str]],
) -> List[Tuple[str, List[str], List[str]]]:
    """
    Filter ``source_info`` (``(filepath, include_dirs, defines)`` tuples) down to
    checkable files: drop `CHECKER_EXCLUDE_SOURCE_FILES` entries, keep only
    `intern/ghost` & `source/` paths, and require a match with ``regex_list``.

    Exits the process with an error when an exclude entry never matched a file.
    """
    source_dir = project_source_info.SOURCE_DIR
    if not source_dir.endswith(os.sep):
        source_dir += os.sep
    source_info_result = []
    for item in source_info:
        filepath_source = item[0]
        if filepath_source.startswith(source_dir):
            filepath_source_relative = filepath_source[len(source_dir):]
            if filepath_source_relative in CHECKER_EXCLUDE_SOURCE_FILES:
                # Consume the entry so stale exclusions can be detected below.
                CHECKER_EXCLUDE_SOURCE_FILES.remove(filepath_source_relative)
                continue
            # Limit checking to `intern/ghost` & `source/` trees.
            if filepath_source_relative.startswith("intern" + os.sep + "ghost"):
                pass
            elif filepath_source_relative.startswith("source" + os.sep):
                pass
            else:
                continue

        # NOTE(review): `filepath_source_relative` is only assigned when the path
        # is under `source_dir`; for any path outside it this reuses the previous
        # iteration's value (or raises `NameError` on the first item).
        # Presumably all paths are under `source_dir` in practice -- confirm.
        has_match = False
        for regex in regex_list:
            if regex.match(filepath_source_relative) is not None:
                has_match = True
        if not has_match:
            continue

        source_info_result.append(item)

    # Any exclude entries left over did not match an existing source file.
    if CHECKER_EXCLUDE_SOURCE_FILES:
        sys.stderr.write(
            "Error: exclude file(s) are missing: {!r}\n".format((list(sorted(CHECKER_EXCLUDE_SOURCE_FILES))))
        )
        sys.exit(1)

    return source_info_result
|
||||
|
||||
|
||||
def run_checks_on_project(
        check_ids: Sequence[str],
        regex_list: Sequence[re.Pattern[str]],
) -> None:
    """
    Run every checker in ``check_ids`` over all project sources whose
    project-relative path matches any pattern in ``regex_list``,
    printing report lines to the standard output.
    """
    import multiprocessing

    source_info = project_source_info.build_info(ignore_prefix_list=CHECKER_IGNORE_PREFIX)
    source_defines = project_source_info.build_defines_as_args()

    # Apply exclusion.
    source_info = source_info_filter(source_info, regex_list)

    # Per-checker shared state, torn down at the end of this function.
    shared_check_data_foreach_check = [
        check_class_from_id(check).setup() for check in check_ids
    ]

    # NOTE: a left-over debug slice (`index = 0` / `source_info[index:]`)
    # has been removed, it always iterated the whole list.
    all_args = []
    for filepath_source, inc_dirs, defs in source_info:
        args = (
            [("-I" + i) for i in inc_dirs] +
            [("-D" + d) for d in defs] +
            source_defines
        )

        all_args.append((filepath_source, args, check_ids, shared_check_data_foreach_check))

    if USE_MULTIPROCESS:
        job_total = multiprocessing.cpu_count() + 1
        with multiprocessing.Pool(processes=job_total) as pool:
            # No `istarmap`, use an intermediate function.
            for result in pool.imap(check_source_file_for_imap, all_args):
                if result:
                    print(result)
    else:
        # Single process (set `USE_MULTIPROCESS = False` when debugging).
        # NOTE: renamed the unpacked variables so they no longer shadow
        # `shared_check_data_foreach_check` used for teardown below.
        for (filepath_source, args, _check_ids, _shared_check_data) in all_args:
            result = check_source_file(filepath_source, args, check_ids, _shared_check_data)
            if result:
                print(result)

    for (check, shared_check_data) in zip(check_ids, shared_check_data_foreach_check):
        check_class_from_id(check).teardown(shared_check_data)
|
||||
|
||||
|
||||
def create_parser(checkers_all: Sequence[str]) -> argparse.ArgumentParser:
    """Build the command line parser, embedding each checker's doc-string into ``--help``."""
    from textwrap import indent

    # Create doc-string for checks.
    checks_all_docs = []
    for checker in checkers_all:
        # `%` -> `%%` is needed for `--help` not to interpret these as formatting arguments.
        checker_doc = indent(check_docstring_from_id(checker).replace("%", "%%"), '    ')
        checks_all_docs.append("  %s\n%s" % (checker, checker_doc))

    parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.RawTextHelpFormatter,
    )
    parser.add_argument(
        "--match",
        nargs='+',
        required=True,
        metavar="REGEX",
        help="Match file paths against this expression",
    )
    parser.add_argument(
        "--checks",
        dest="checks",
        help=(
            "Specify the check presets to run.\n\n" +
            "\n".join(checks_all_docs) + "\n"
            "Multiple checkers may be passed at once (comma separated, no spaces)."),
        required=True,
    )

    return parser
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# Main Function
|
||||
|
||||
def main() -> int:
    """Command line entry point, returns an exit code."""
    checkers_all = check_function_get_all()
    parser = create_parser(checkers_all)
    args = parser.parse_args()

    # Compile every `--match` expression up-front so errors are reported early.
    regex_list = []
    for expr in args.match:
        try:
            pattern = re.compile(expr)
        except Exception as ex:
            print("Error in expression: \"{:s}\"\n {!r}".format(expr, ex))
            return 1
        regex_list.append(pattern)

    run_checks_on_project(args.checks.split(','), regex_list)
    return 0
|
||||
|
||||
|
||||
# Exit with `main`'s return value when run as a script.
if __name__ == "__main__":
    sys.exit(main())
|
Loading…
Reference in New Issue
Block a user