#!/usr/bin/env python3
# SPDX-FileCopyrightText: 2022-2023 Blender Authors
#
# SPDX-License-Identifier: GPL-2.0-or-later

"""
Check license headers follow the SPDX spec https://spdx.org/licenses/

This can be activated by calling "make check_licenses" from Blender's root directory.
"""

import os
import argparse
import datetime
import re

from dataclasses import dataclass

from typing import (
    Callable,
    Dict,
    Generator,
    List,
    Tuple,
)

# -----------------------------------------------------------------------------
# Constants

# Add one, maybe someone runs this on new-years in another timezone or so.
YEAR_MAX = datetime.date.today().year + 1
# Let's not worry about software written before this time.
YEAR_MIN = 1950
YEAR_RANGE = range(YEAR_MIN, YEAR_MAX + 1)

# Faster but makes exceptions and errors more difficult to troubleshoot.
USE_MULTIPROCESS = False

# Only scan this many leading characters of each file for the SPDX header.
EXPECT_SPDX_IN_FIRST_CHARS = 1024

# Show unique headers after modifying them.
# Useful when reviewing changes as there may be many duplicates.
REPORT_UNIQUE_HEADER_MAPPING = False
# Maps an anonymized header (years masked out) to the file paths using it.
mapping: Dict[str, List[str]] = {}

SOURCE_DIR = os.path.normpath(
    os.path.abspath(
        os.path.normpath(os.path.join(os.path.dirname(__file__), "..", ".."))
    )
)

SPDX_IDENTIFIER_FILE = os.path.join(
    SOURCE_DIR, "doc", "license", "SPDX-license-identifiers.txt"
)
SPDX_IDENTIFIER_UNKNOWN = "*Unknown License*"

# Every acceptable identifier is the first word of a line that references the SPDX site.
with open(SPDX_IDENTIFIER_FILE, "r", encoding="utf-8") as fh:
    ACCEPTABLE_LICENSES = set(
        line.split()[0] for line in sorted(fh) if "https://spdx.org/licenses/" in line
    )
del fh


# -----------------------------------------------------------------------------
# Global Variables

# Count how many licenses are used.
SPDX_IDENTIFIER_STATS: Dict[str, int] = {SPDX_IDENTIFIER_UNKNOWN: 0}


# -----------------------------------------------------------------------------
# File Type Checks

# Use `/* .. */` style comments.
def filename_is_c_compat(filename: str) -> bool:
    """Return true for file extensions using C style block comments."""
    return filename.endswith(
        (
            # C.
            ".c",
            ".h",
            # C++
            ".cc",
            ".cxx",
            ".cpp",
            ".hh",
            ".hxx",
            ".hpp",
            ".inl",
            # Objective-C/C++
            ".m",
            ".mm",
            # OpenGL Shading Language.
            ".glsl",
            # OPENCL.
            ".cl",
            # CUDA.
            ".cu",
            # Metal.
            ".metal",
            # Metal Shading Language.
            ".msl",
            # Open Shading Language.
            ".osl",
            # Cycles uses this extension.
            ".tables",
        )
    )


def filename_is_cmake(filename: str) -> bool:
    """Return true for CMake files (which use '#' style comments)."""
    return filename.endswith(("CMakeLists.txt", ".cmake"))


# Use '#' style comments.
def filename_is_script_compat(filename: str) -> bool:
    """Return true for script files using '#' style comments."""
    return filename.endswith((".py", ".sh", "GNUmakefile"))


# -----------------------------------------------------------------------------
# Cursor Motion

def txt_next_line_while_fn(text: str, index: int, fn: Callable[[str], bool]) -> int:
    """
    Return the next line where ``fn`` fails.
    """
    while index < len(text):
        index_prev = index
        index = text.find("\n", index)
        if index == -1:
            index = len(text)
        if not fn(text[index_prev:index]):
            index = index_prev
            break
        # Step over the newline.
        index = index + 1
    return index


def txt_next_eol(text: str, pos: int, limit: int, step_over: bool) -> int:
    """
    Extend ``pos`` to just before the next EOL, otherwise EOF.

    As this is intended for use as a range,
    ``text[pos]`` will either be ``\n`` or equal to out of range
    (equal to ``len(text)``).
    """
    if pos + 1 >= len(text):
        return pos  # Already at the bounds.
    if text[pos] == "\n":
        return pos + (1 if step_over else 0)
    pos_next = text.find("\n", pos, limit)
    if pos_next == -1:
        return limit
    return pos_next + (1 if step_over else 0)


def txt_prev_bol(text: str, pos: int, limit: int) -> int:
    """Return the position just after the previous newline (beginning of line), bounded by ``limit``."""
    if pos == 0:
        return pos  # Already at the bounds.
    if text[pos - 1] == "\n":
        return pos
    pos_next = text.rfind("\n", limit, pos)
    if pos_next == -1:
        return limit
    # We don't want to include the newline.
    return pos_next + 1


def txt_anonymous_years(text: str) -> str:
    """
    Replace year with text, since we don't want to consider
    them different when looking at unique headers.
    """

    # Replace year ranges with `2005-2009`: `####`.
    def key_replace_range(match: re.Match[str]) -> str:
        values = match.groups()
        if int(values[0]) in YEAR_RANGE and int(values[1]) in YEAR_RANGE:
            return '#' * len(values[0])
        return match.group()

    text = re.sub(r'([0-9]+)-([0-9]+)', key_replace_range, text)

    # Replace year ranges with `2005`: `####`.
    def key_replace(match: re.Match[str]) -> str:
        values = match.groups()
        if int(values[0]) in YEAR_RANGE:
            return '#' * len(values[0])
        return match.group()

    text = re.sub(r'([0-9]+)', key_replace, text)
    return text


def txt_find_next_indented_block(text: str, find: str, pos: int, limit: int) -> Tuple[int, int]:
    """
    Support for finding an indented block of text.
    Return the identifier index and the end of the block.

    Where searching for ``SPDX-FileCopyrightText: ``

    .. code-block::

       # SPDX-FileCopyrightText: 2020 Name
       ^ begin                            ^ end.

    With multiple lines supported:

    .. code-block::

       # SPDX-FileCopyrightText: 2020 Name
       #                         2021 Another Name
       ^ begin (one line up)                      ^ end.
    """
    pos_found = text.find(find, pos, limit)
    if pos_found == -1:
        return (-1, -1)
    pos_next = txt_next_eol(text, pos_found, limit - 1, False) + 1
    if pos_next != limit:
        # Column of the identifier on its line, used to detect continuation lines.
        pos_found_indent = pos_found - txt_prev_bol(text, pos_found, 0)
        while True:
            # Step over leading comment chars.
            pos_next_test = pos_next + pos_found_indent
            pos_next_step = pos_next_test + len(find)
            # The next lines text is indented.
            text_indent = text[pos_next_test:pos_next_step]
            if (len(text_indent) == pos_next_step - pos_next_test) and (not text[pos_next_test:pos_next_step].strip()):
                pos_next = txt_next_eol(text, pos_next_step, limit - 1, step_over=False) + 1
            else:
                break
    return (pos_found, pos_next)


# -----------------------------------------------------------------------------
# License Checker

def check_contents(filepath: str, text: str) -> None:
    """
    Check for license text, e.g:
    ``SPDX-License-Identifier: GPL-2.0-or-later``

    Intentionally be strict here... no extra spaces, no trailing space at the end of line etc.
    As there is no reason to be sloppy in this case.
    """
    text_header = text[:EXPECT_SPDX_IN_FIRST_CHARS]

    # Use the license to limit the copyright search,
    # so code-generation that includes copyright headers don't cause false alarms.
    license_id = " SPDX-License-Identifier: "
    license_id_beg = text_header.find(license_id)
    if license_id_beg == -1:
        # Allow completely empty files (sometimes `__init__.py`).
        if not text.rstrip():
            return  # Empty file already accounted for.
        print("Missing {:s}{:s}".format(license_id, filepath))
        return

    # Check copyright text, reading multiple (potentially multi-line indented) blocks.
    copyright_id = " SPDX-FileCopyrightText: "
    copyright_id_step = 0
    copyright_id_beg = -1
    copyright_id_end = -1
    while ((copyright_id_item := txt_find_next_indented_block(
            text_header,
            copyright_id,
            copyright_id_step,
            license_id_beg,
    )) != (-1, -1)):
        if copyright_id_end == -1:
            # Set once.
            copyright_id_beg = copyright_id_item[0]
        else:
            # Consecutive copyright blocks must not be separated by blank lines.
            lines = text_header[copyright_id_end:copyright_id_item[0]].count("\n")
            if lines != 0:
                print(
                    "Expected no blank lines, found {:d} between \"{:s}\": {:s}".format(
                        lines,
                        copyright_id,
                        filepath,
                    ))
        copyright_id_end = copyright_id_item[1]
        copyright_id_step = copyright_id_end
    del copyright_id_item, copyright_id_step

    if copyright_id_beg == -1:
        print("Missing {:s}{:s}".format(copyright_id, filepath))
        # Maintain statistics.
        SPDX_IDENTIFIER_STATS[SPDX_IDENTIFIER_UNKNOWN] += 1
        return

    # Check for blank lines:
    blank_lines = text[:copyright_id_beg].count("\n")
    if filename_is_script_compat(filepath):
        # A shebang line is allowed to precede the header in scripts.
        if blank_lines > 0 and text.startswith("#!/"):
            blank_lines -= 1
    if blank_lines > 0:
        print("SPDX \"{:s}\" not on first line: {:s}".format(copyright_id, filepath))

    # Leading char.
    leading_char = text_header[txt_prev_bol(text_header, license_id_beg, 0):license_id_beg].strip()
    text_blank_line = text_header[copyright_id_end:license_id_beg]
    # Exactly one (comment-only) blank line must separate copyright & license.
    if (text_blank_line.count("\n") != 1) or (text_blank_line.replace(leading_char, "").strip() != ""):
        print("Expected blank line between \"{:s}\" & \"{:s}\": {:s}".format(copyright_id, license_id, filepath))
    del text_blank_line, leading_char

    license_id_end = license_id_beg + len(license_id)
    line_end = txt_next_eol(text, license_id_end, len(text), step_over=False)
    license_text = text[license_id_end:line_end]
    # For C/C++ comments.
    license_text = license_text.rstrip("*/")
    # NOTE: renamed from `license_id` which shadowed the identifier string above.
    for license_key in license_text.split():
        # Skip SPDX expression operators.
        if license_key in {"AND", "OR"}:
            continue
        if license_key not in ACCEPTABLE_LICENSES:
            print(
                "Unexpected:",
                "{:s}:{:d}".format(filepath, text[:license_id_beg].count("\n") + 1),
                "contains license",
                repr(license_text),
                "not in",
                SPDX_IDENTIFIER_FILE,
            )
        SPDX_IDENTIFIER_STATS[license_key] = SPDX_IDENTIFIER_STATS.get(license_key, 0) + 1

    if REPORT_UNIQUE_HEADER_MAPPING:
        # Extract the entire comment block surrounding the license for uniqueness reporting.
        if filename_is_c_compat(filepath):
            comment_beg = text.rfind("/*", 0, license_id_beg)
            if comment_beg == -1:
                print("Comment Block:", filepath, "failed to find comment start")
                return
            comment_end = text.find("*/", license_id_end, len(text))
            if comment_end == -1:
                print("Comment Block:", filepath, "failed to find comment end")
                return
            comment_end += 2
            comment_block = text[comment_beg + 2: comment_end - 2]
            comment_block = "\n".join(
                [line.removeprefix(" *") for line in comment_block.split("\n")]
            )
        elif filename_is_script_compat(filepath) or filename_is_cmake(filepath):
            comment_beg = txt_prev_bol(text, license_id_beg, 0)
            comment_end = txt_next_eol(text, license_id_beg, len(text), step_over=False)
            comment_beg = txt_next_line_while_fn(
                text,
                comment_beg,
                lambda line: line.startswith("#") and not line.startswith("#!/"),
            )
            comment_end = txt_next_line_while_fn(
                text,
                comment_end,
                lambda line: line.startswith("#"),
            )
            comment_block = text[comment_beg:comment_end].rstrip()
            comment_block = "\n".join(
                [line.removeprefix("# ") for line in comment_block.split("\n")]
            )
        else:
            raise Exception("Unknown file type: {:s}".format(filepath))

        # Anonymize years so identical headers with different dates group together.
        mapping.setdefault(txt_anonymous_years(comment_block), []).append(filepath)


def report_statistics() -> None:
    """
    Report some final statistics of license usage.
    """
    print("")
    files_total = sum(SPDX_IDENTIFIER_STATS.values())
    files_unknown = SPDX_IDENTIFIER_STATS[SPDX_IDENTIFIER_UNKNOWN]
    files_percent = (1.0 - (files_unknown / files_total)) * 100.0
    title = "License Statistics in {:,d} Files, {:.2f}% Complete".format(files_total, files_percent)
    print("#" * len(title))
    print(title)
    print("#" * len(title))
    print("")
    max_length = max(len(k) for k in SPDX_IDENTIFIER_STATS.keys())
    print(" License:" + (" " * (max_length - 7)) + "Files:")
    print("")
    items = [(k, "{:,d}".format(v)) for k, v in sorted(SPDX_IDENTIFIER_STATS.items())]
    v_max = max([len(v) for _, v in items])
    for k, v in items:
        if v == "0":
            continue
        print("-", k + " " * (max_length - len(k)), (" " * (v_max - len(v))) + v)
    print("")


# -----------------------------------------------------------------------------
# Main Function & Source Listing

operation = check_contents


def source_files(
        path: str,
        paths_exclude: Tuple[str, ...],
        filename_test: Callable[[str], bool],
) -> Generator[str, None, None]:
    """
    Yield files under ``path`` accepted by ``filename_test``, skipping ``paths_exclude``.

    :raises Exception: when an exclude path does not exist, or was never encountered.
    """
    # Split paths into directories & files.
    dirs_exclude_list = []
    files_exclude_list = []
    for f in paths_exclude:
        if not os.path.exists(f):
            raise Exception("File {!r} doesn't exist!".format(f))
        if os.path.isdir(f):
            dirs_exclude_list.append(f)
        else:
            files_exclude_list.append(f)
    del paths_exclude

    dirs_exclude_set = set(p.rstrip("/") for p in dirs_exclude_list)
    dirs_exclude = tuple(p.rstrip("/") + "/" for p in dirs_exclude_list)
    files_exclude_set = set(p.rstrip("/") for p in files_exclude_list)
    del dirs_exclude_list, files_exclude_list

    for dirpath, dirnames, filenames in os.walk(path):
        # Skip hidden directories (e.g. `.git`).
        dirnames[:] = [d for d in dirnames if not d.startswith(".")]
        if dirpath in dirs_exclude_set or dirpath.startswith(dirs_exclude):
            continue
        for filename in filenames:
            if filename.startswith("."):
                continue
            filepath = os.path.join(dirpath, filename)
            if filepath in files_exclude_set:
                # Remove so stale excludes can be detected afterwards.
                files_exclude_set.remove(filepath)
                continue
            if filename_test(filename):
                yield filepath

    if files_exclude_set:
        raise Exception("Excluded paths not found: {!r}".format(repr(tuple(sorted(files_exclude_set)))))


def operation_wrap(filepath: str) -> None:
    """Read ``filepath`` and run the configured ``operation`` on its contents."""
    with open(filepath, "r", encoding="utf-8") as f:
        try:
            text = f.read()
        except Exception as ex:
            # Best effort: report unreadable (e.g. non UTF-8) files and continue.
            print("Failed to read", filepath, "with", repr(ex))
            return
    operation(filepath, text)


def argparse_create() -> argparse.ArgumentParser:
    """Create the command line argument parser."""
    # When --help or no args are given, print this help.
    description = __doc__
    parser = argparse.ArgumentParser(description=description)
    parser.add_argument(
        "--show-headers",
        dest="show_headers",
        # NOTE: previously `type=bool`, which enabled the flag for *any* non-empty
        # value (even "False") since `bool("False")` is true.
        # `store_true` is the correct idiom for a boolean flag.
        action="store_true",
        default=False,
        required=False,
        help="Show unique headers (useful for spotting irregularities).",
    )
    return parser


def main() -> None:
    """Check all configured source passes & report statistics."""
    global REPORT_UNIQUE_HEADER_MAPPING
    args = argparse_create().parse_args()
    REPORT_UNIQUE_HEADER_MAPPING = args.show_headers

    # Ensure paths are relative to the root, no matter where this script runs from.
    os.chdir(SOURCE_DIR)

    @dataclass
    class Pass:
        # Accept a filename for this pass.
        filename_test: Callable[[str], bool]
        # Root directories to scan.
        source_paths_include: Tuple[str, ...]
        # Directories & files to skip.
        source_paths_exclude: Tuple[str, ...]

    passes = (
        Pass(
            filename_test=filename_is_c_compat,
            source_paths_include=(".",),
            source_paths_exclude=(
                # Directories:
                "./extern",
                "./scripts/addons_contrib",
                "./scripts/templates_osl",
                "./tools",
                # Exclude library sources (GIT-LFS).
                "./lib",

                # Needs manual handling as it mixes two licenses.
                "./intern/atomic",
                # Practically an "extern" within an "intern" module, leave as-is.
                "./intern/itasc/kdl",
                # TODO: Files in these directories should be handled but the files have valid licenses.
                "./intern/libmv",

                # Files:

                # This file is generated by a configure script (no point in manually setting the license).
                "./build_files/build_environment/patches/config_gmpxx.h",
                # A modified `Apache-2.0` license.
                "./intern/opensubdiv/internal/evaluator/shaders/glsl_compute_kernel.glsl",
            ),
        ),
        Pass(
            filename_test=filename_is_cmake,
            source_paths_include=(".",),
            source_paths_exclude=(
                # Directories:

                # This is an exception, it has its own CMake files we do not maintain.
                "./extern/audaspace",
                "./extern/quadriflow/3rd/lemon-1.3.1",
                # Exclude library sources (GIT-LFS).
                "./lib",
            ),
        ),
        Pass(
            filename_test=filename_is_script_compat,
            source_paths_include=(".",),
            source_paths_exclude=(
                # Directories:

                # This is an exception, it has its own CMake files we do not maintain.
                "./extern",
                # Exclude library sources (GIT-LFS).
                "./lib",

                # Just data.
                "./doc/python_api/examples",
                "./scripts/addons/presets",
                "./scripts/addons_contrib",
                "./scripts/presets",
                "./scripts/templates_py",
            ),
        ),
    )

    for pass_data in passes:
        if USE_MULTIPROCESS:
            filepath_args = [
                filepath
                for dirpath in pass_data.source_paths_include
                for filepath in source_files(
                    dirpath,
                    pass_data.source_paths_exclude,
                    pass_data.filename_test,
                )
            ]
            import multiprocessing
            job_total = multiprocessing.cpu_count()
            # NOTE: use a context manager so workers are always cleaned up
            # (the previous code never closed/joined the pool).
            with multiprocessing.Pool(processes=job_total) as pool:
                pool.map(operation_wrap, filepath_args)
        else:
            for filepath in [
                filepath
                for dirpath in pass_data.source_paths_include
                for filepath in source_files(
                    dirpath,
                    pass_data.source_paths_exclude,
                    pass_data.filename_test,
                )
            ]:
                operation_wrap(filepath)

    if REPORT_UNIQUE_HEADER_MAPPING:
        print("#####################")
        print("Unique Header Listing")
        print("#####################")
        print("")
        for k, v in sorted(mapping.items()):
            print("=" * 79)
            print(k)
            print("-" * 79)
            v.sort()
            for filepath in v:
                print("-", filepath)
            print("")

    report_statistics()


if __name__ == "__main__":
    main()