[libc++] Rewrite the transitive header checking machinery

Since we don't generate a full dependency graph of headers, we can greatly simplify the script that parses the result of --trace-includes. At the same time, we also unify the mechanism for detecting whether a header is a public/C compat/internal/etc header with the existing mechanism in header_information.py. As a drive-by this fixes the headers_in_modulemap.sh.py test which had been disabled by mistake because it used its own way of determining the list of libc++ headers. By consistently using header_information.py to get that information, problems like this shouldn't happen anymore.
llvm · Oct 16, 2024 · 162bef9 · 162bef9
1 parent a4367d2
commit 162bef9
Show file tree

Hide file tree

Showing 8 changed files with 298 additions and 297 deletions.
diff --git a/libcxx/test/libcxx/header_inclusions.gen.py b/libcxx/test/libcxx/header_inclusions.gen.py
@@ -16,7 +16,7 @@
 from libcxx.header_information import lit_header_restrictions, public_headers, mandatory_inclusions
 
 for header in public_headers:
-  header_guard = lambda h: f"_LIBCPP_{h.upper().replace('.', '_').replace('/', '_')}"
+  header_guard = lambda h: f"_LIBCPP_{str(h).upper().replace('.', '_').replace('/', '_')}"
 
   # <cassert> has no header guards
   if header == 'cassert':

diff --git a/libcxx/test/libcxx/headers_in_modulemap.sh.py b/libcxx/test/libcxx/headers_in_modulemap.sh.py
@@ -1,25 +1,16 @@
-# RUN: %{python} %s %{libcxx-dir}/utils %{include-dir}
+# RUN: %{python} %s %{libcxx-dir}/utils
 
 import sys
-
 sys.path.append(sys.argv[1])
-
 import pathlib
-import sys
-from libcxx.header_information import is_modulemap_header, is_header
+from libcxx.header_information import all_headers, libcxx_include
 
-headers = list(pathlib.Path(sys.argv[2]).rglob("*"))
-modulemap = open(f"{sys.argv[2]}/module.modulemap").read()
+with open(libcxx_include / "module.modulemap") as f:
+    modulemap = f.read()
 
 isHeaderMissing = False
-
-for header in headers:
-    if not is_header(header):
-        continue
-
-    header = header.relative_to(pathlib.Path(sys.argv[2])).as_posix()
-
-    if not is_modulemap_header(header):
+for header in all_headers:
+    if not header.is_in_modulemap():
         continue
 
     if not str(header) in modulemap:

diff --git a/libcxx/test/libcxx/transitive_includes.gen.py b/libcxx/test/libcxx/transitive_includes.gen.py
@@ -42,10 +42,10 @@
 
     all_traces = []
     for header in sorted(public_headers):
-        if header.endswith(".h"):  # Skip C compatibility or detail headers
+        if header.is_C_compatibility() or header.is_internal():
             continue
 
-        normalized_header = re.sub("/", "_", header)
+        normalized_header = re.sub("/", "_", str(header))
         print(
             f"""\
 // RUN: echo "#include <{header}>" | %{{cxx}} -xc++ - %{{flags}} %{{compile_flags}} --trace-includes -fshow-skipped-includes --preprocess > /dev/null 2> %t/trace-includes.{normalized_header}.txt
@@ -55,17 +55,17 @@
 
     print(
         f"""\
-// RUN: %{{python}} %{{libcxx-dir}}/test/libcxx/transitive_includes_to_csv.py {' '.join(all_traces)} > %{{libcxx-dir}}/test/libcxx/transitive_includes/%{{cxx_std}}.csv
+// RUN: %{{python}} %{{libcxx-dir}}/test/libcxx/transitive_includes/to_csv.py {' '.join(all_traces)} > %{{libcxx-dir}}/test/libcxx/transitive_includes/%{{cxx_std}}.csv
 """
     )
 
 else:
     for header in public_headers:
-        if header.endswith(".h"):  # Skip C compatibility or detail headers
+        if header.is_C_compatibility() or header.is_internal():
             continue
 
         # Escape slashes for the awk command below
-        escaped_header = header.replace("/", "\\/")
+        escaped_header = str(header).replace("/", "\\/")
 
         print(
             f"""\
@@ -92,7 +92,7 @@
 
 // RUN: mkdir %t
 // RUN: %{{cxx}} %s %{{flags}} %{{compile_flags}} --trace-includes -fshow-skipped-includes --preprocess > /dev/null 2> %t/trace-includes.txt
-// RUN: %{{python}} %{{libcxx-dir}}/test/libcxx/transitive_includes_to_csv.py %t/trace-includes.txt > %t/actual_transitive_includes.csv
+// RUN: %{{python}} %{{libcxx-dir}}/test/libcxx/transitive_includes/to_csv.py %t/trace-includes.txt > %t/actual_transitive_includes.csv
 // RUN: cat %{{libcxx-dir}}/test/libcxx/transitive_includes/%{{cxx_std}}.csv | awk '/^{escaped_header} / {{ print }}' > %t/expected_transitive_includes.csv
 // RUN: diff -w %t/expected_transitive_includes.csv %t/actual_transitive_includes.csv
 #include <{header}>

diff --git a/libcxx/test/libcxx/transitive_includes/to_csv.py b/libcxx/test/libcxx/transitive_includes/to_csv.py
@@ -0,0 +1,120 @@
+#!/usr/bin/env python
+# ===----------------------------------------------------------------------===##
+#
+# Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+# See https://llvm.org/LICENSE.txt for license information.
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+#
+# ===----------------------------------------------------------------------===##
+
+from typing import List, Tuple, Optional
+import argparse
+import io
+import itertools
+import os
+import pathlib
+import re
+import sys
+
+libcxx_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))))
+sys.path.append(os.path.join(libcxx_root, "utils"))
+from libcxx.header_information import Header
+
+def parse_line(line: str) -> Tuple[int, str]:
+    """
+    Parse a single line of --trace-includes output.
+
+    A valid line starts with one or more periods, followed by a single space
+    and the name of the file being included. The number of periods is the
+    nesting level of that inclusion.
+
+    Returns the inclusion level and the raw file name being included.
+
+    Raises ValueError if the line does not have the expected shape.
+    """
+    match = re.match(r"(\.+) (.+)", line)
+    if not match:
+        # ValueError is the builtin for "right type, unusable content"; it is
+        # always in scope, so the diagnostic below is what the user sees.
+        raise ValueError(f"Line {line} contains invalid data.")
+
+    # The number of periods in front of the header name is the nesting level of
+    # that header.
+    dots, filename = match.groups()
+    return (len(dots), filename)
+
+def make_cxx_v1_relative(header: str) -> Optional[str]:
+    """
+    Strip everything up to and including the `c++/v<N>/` directory from a path.
+
+    Containing such a directory is the heuristic used to recognize libc++
+    headers: for those paths the remainder after that directory is returned,
+    and for any other path the result is None.
+    """
+    # Clang on Windows may print either a forward slash or an escaped backslash
+    # (i.e. two consecutive backslashes) as the path separator. Both forms are
+    # accepted; raw strings keep the backslash escaping confined to the RE level.
+    pathsep = r"(?:/|\\\\)"
+    pattern = "".join([r"^.*c\+\+", pathsep, r"v[0-9]+", pathsep, r"(.+)$"])
+    found = re.match(pattern, header)
+    return found.group(1) if found else None
+
+def parse_file(file: io.TextIOBase) -> List[Tuple[Header, Header]]:
+    """
+    Turn the --trace-includes output contained in a file into a list of
+    (top-level includer, included header) pairs.
+
+    Only headers recognized as libc++ headers are considered; every other
+    line of the trace is ignored.
+    """
+    pairs = []
+    top_level = None
+    for raw_line in file.readlines():
+        depth, path = parse_line(raw_line)
+        libcxx_path = make_cxx_v1_relative(path)
+
+        # Headers that do not live in libc++ are of no interest.
+        if libcxx_path is None:
+            continue
+
+        if depth == 1:
+            # A first-level entry is the header doing the including. There is
+            # usually exactly one, unless the compiler was given `-include`.
+            top_level = Header(libcxx_path)
+        else:
+            # Any deeper entry is attributed to the current top-level includer.
+            assert top_level is not None
+            pairs.append((top_level, Header(libcxx_path)))
+    return pairs
+
+def print_csv(includes: List[Tuple[Header, Header]]) -> None:
+    """
+    Write the transitive includes to standard output as space-delimited CSV,
+    one `<includer> <included>` pair per line.
+
+    Included headers are de-duplicated and sorted per includer, and only the
+    public ones that are not C compatibility headers are printed.
+    """
+    def includer_of(pair):
+        return pair[0]
+
+    # groupby only merges adjacent entries, hence the sort on the same key.
+    ordered = sorted(includes, key=includer_of)
+    for includer, group in itertools.groupby(ordered, key=includer_of):
+        unique_includees = {pair[1] for pair in group}
+        for h in sorted(unique_includees):
+            if not h.is_public() or h.is_C_compatibility():
+                continue
+            print(f"{includer} {h}")
+
+def main(argv):
+    """
+    Command-line entry point.
+
+    Parses `argv` (the arguments without the program name), reads every
+    --trace-includes file named in it and prints the combined transitive
+    includes as CSV.
+    """
+    parser = argparse.ArgumentParser(
+        description="""
+        Given a list of headers produced by --trace-includes, produce a list of libc++ headers in that output.
+
+        Note that -fshow-skipped-includes must also be passed to the compiler in order to get sufficient
+        information for this script to run.
+
+        The output of this script is provided in space-delimited CSV format where each line contains:
+
+            <header performing inclusion> <header being included>
+        """)
+    parser.add_argument(
+        "inputs",
+        type=argparse.FileType("r"),
+        nargs='+',
+        default=None,
+        help="One or more files containing the result of --trace-includes",
+    )
+    options = parser.parse_args(argv)
+
+    # Accumulate the pairs of every trace file, in the order the files were given.
+    all_pairs = []
+    for trace in options.inputs:
+        all_pairs.extend(parse_file(trace))
+    print_csv(all_pairs)
+
+if __name__ == "__main__":
+    main(sys.argv[1:])
diff --git a/libcxx/test/libcxx/transitive_includes_to_csv.py b/libcxx/test/libcxx/transitive_includes_to_csv.py
diff --git a/libcxx/utils/generate_iwyu_mapping.py b/libcxx/utils/generate_iwyu_mapping.py
@@ -71,7 +71,7 @@ def main(argv: typing.List[str]):
 
     mappings = []  # Pairs of (header, public_header)
     for header in libcxx.header_information.all_headers:
-        public_headers = IWYU_mapping(header)
+        public_headers = IWYU_mapping(str(header))
         if public_headers is not None:
             mappings.extend((header, public) for public in public_headers)
 

diff --git a/libcxx/utils/generate_libcxx_cppm_in.py b/libcxx/utils/generate_libcxx_cppm_in.py
@@ -9,19 +9,11 @@
 import os.path
 import sys
 
-from libcxx.header_information import module_c_headers
-from libcxx.header_information import module_headers
-from libcxx.header_information import header_restrictions
-from libcxx.header_information import headers_not_available
+from libcxx.header_information import module_c_headers, module_headers, header_restrictions, headers_not_available, libcxx_root
 
 
 def write_file(module):
-    libcxx_module_directory = os.path.join(
-        os.path.dirname(os.path.dirname(os.path.realpath(__file__))), "modules"
-    )
-    with open(
-        os.path.join(libcxx_module_directory, f"{module}.cppm.in"), "w"
-    ) as module_cpp_in:
+    with open(libcxx_root / "modules" / f"{module}.cppm.in", "w") as module_cpp_in:
         module_cpp_in.write(
             """\
 // -*- C++ -*-
@@ -45,7 +37,7 @@ def write_file(module):
 // and the headers of Table 25: C++ headers for C library facilities [tab:headers.cpp.c]
 """
         )
-        for header in module_headers if module == "std" else module_c_headers:
+        for header in sorted(module_headers if module == "std" else module_c_headers):
             if header in header_restrictions:
                 module_cpp_in.write(
                     f"""\