parser: workaround cursor.get_tokens() issue with macro expansions
There's a bug in clang that causes cursor.get_tokens() to sometimes fail
when there are macro expansions in the cursor's extent. Even a plain
'bool x' variable or struct member fails to tokenize, because in
stdbool.h bool is a macro rather than a typedef.

I haven't been able to come up with a reproducer that hits this through
our current usage of cursor.get_tokens(). However, the failure is
obvious when tokenizing said 'bool x' directly. Rather than wait until
we hit subtle bugs, defensively work around the issue.

The issue seems to be something going awry in the cursor's extent.
Simply recreating the extent works. The resulting new extent has the
same __repr__, but the two differ under the hood: the new one works, the
old one doesn't. Try the regular cursor.get_tokens() first, and if that
produces no tokens, fall back to recreating the extent and getting the
tokens from the translation unit.

This is likely caused by clang issue [1], or at least something closely
related. See also [2].

[1] llvm/llvm-project#43451
[2] https://stackoverflow.com/questions/16786767/obtain-original-unexpanded-macro-text-using-libclang
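
For illustration only, a rough standalone probe along the lines below
(not part of the commit; the 'probe.c' file name, contents and parse
options are assumptions, though hawkmoth needs a detailed preprocessing
record anyway to see macro definitions) exercises both the failing
cursor.get_tokens() and the recreated-extent fallback:

    # Hypothetical reproducer sketch. On affected clang versions the first
    # print() comes back empty, while the recreated extent yields tokens.
    from clang.cindex import CursorKind, Index, SourceLocation, SourceRange, TranslationUnit

    index = Index.create()
    tu = index.parse('probe.c',
                     unsaved_files=[('probe.c', '#include <stdbool.h>\nbool x;\n')],
                     options=TranslationUnit.PARSE_DETAILED_PROCESSING_RECORD)

    cursor = next(c for c in tu.cursor.get_children() if c.kind == CursorKind.VAR_DECL)

    # The cursor's own extent fails to tokenize...
    print([t.spelling for t in cursor.get_tokens()])

    # ...but an extent recreated from the same coordinates tokenizes fine.
    start, end = cursor.extent.start, cursor.extent.end
    extent = SourceRange.from_locations(
        SourceLocation.from_position(tu, start.file, start.line, start.column),
        SourceLocation.from_position(tu, end.file, end.line, end.column))
    print([t.spelling for t in tu.get_tokens(extent=extent)])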
jnikula committed Oct 5, 2023
1 parent 0726141 commit 83de8c1
Showing 1 changed file with 34 additions and 8 deletions.
42 changes: 34 additions & 8 deletions src/hawkmoth/parser.py
@@ -38,6 +38,7 @@
 from clang.cindex import StorageClass, AccessSpecifier, ExceptionSpecificationKind
 from clang.cindex import Index, TranslationUnit, TranslationUnitLoadError
 from clang.cindex import Diagnostic
+from clang.cindex import SourceLocation, SourceRange
 
 from hawkmoth import docstring
 
@@ -185,6 +186,31 @@ def _comment_extract(tu):
 
     return top_level_comments, comments
 
+# Workaround for clang sometimes failing to tokenize cursor extents with macro
+# expansions in them. A simple 'bool x' variable or struct member cursor will
+# fail to tokenize, because bool is a macro.
+def _cursor_get_tokens(cursor):
+    # Try get_tokens() first
+    tokens = [t for t in cursor.get_tokens()]
+    if tokens:
+        yield from tokens
+        return
+
+    # Fallback to recreating the extent and getting the tokens from the
+    # translation unit. Notably the repr for both extent and cursor.extent will
+    # be the same, but under the hood there's something wrong.
+    tu = cursor.translation_unit
+
+    start = cursor.extent.start
+    start = SourceLocation.from_position(tu, start.file, start.line, start.column)
+
+    end = cursor.extent.end
+    end = SourceLocation.from_position(tu, end.file, end.line, end.column)
+
+    extent = SourceRange.from_locations(start, end)
+
+    yield from tu.get_tokens(extent=extent)
+
 def _get_meta(comment, cursor=None):
     meta = {'line': comment.extent.start.line}
     if cursor:
@@ -200,7 +226,7 @@ def _get_macro_args(cursor):
     if cursor.kind != CursorKind.MACRO_DEFINITION:
         return None
 
-    tokens = cursor.get_tokens()
+    tokens = _cursor_get_tokens(cursor)
 
     # Use the first two tokens to make sure this starts with 'IDENTIFIER('
     one = next(tokens)
@@ -246,7 +272,7 @@ def _get_function_quals(cursor):
     Returns:
         List of (prefix) function qualifiers.
     """
-    tokens = [t.spelling for t in cursor.get_tokens()]
+    tokens = [t.spelling for t in _cursor_get_tokens(cursor)]
     quals = []
 
     if 'static' in tokens:
@@ -262,7 +288,7 @@ def _get_method_quals(cursor):
     Returns:
         List of prefix method qualifiers and list of suffix method qualifiers.
     """
-    tokens = [t.spelling for t in cursor.get_tokens()]
+    tokens = [t.spelling for t in _cursor_get_tokens(cursor)]
     pre_quals = []
     pos_quals = []
 
@@ -338,7 +364,7 @@ def _get_template_line(cursor):
     # We can do it by looking at the tokens directly. This is slightly
     # complicated due to variadic template type parameters.
     def typetype(cursor):
-        tokens = list(cursor.get_tokens())
+        tokens = list(_cursor_get_tokens(cursor))
         if tokens[-2].spelling == '...':
             return f'{tokens[-3].spelling}...'
         else:
@@ -373,7 +399,7 @@ def _specifiers_fixup(cursor, basetype):
     Returns:
         List of C++ specifiers for the cursor.
     """
-    tokens = [t.spelling for t in cursor.get_tokens()]
+    tokens = [t.spelling for t in _cursor_get_tokens(cursor)]
     type_elem = []
 
     if 'mutable' in tokens:
@@ -397,7 +423,7 @@ def _get_scopedenum_type(cursor):
         ``None`` otherwise.
     """
     if cursor.kind == CursorKind.ENUM_DECL and cursor.is_scoped_enum():
-        if list(cursor.get_tokens())[3].spelling == ':':
+        if list(_cursor_get_tokens(cursor))[3].spelling == ':':
             return f': {cursor.enum_type.spelling}'
     return None
 
@@ -660,7 +686,7 @@ def _recursive_parse(domain, comments, errors, cursor, nest):
 
     elif cursor.kind == CursorKind.ENUM_CONSTANT_DECL:
         # Show enumerator value if it's explicitly set in source
-        if '=' in [t.spelling for t in cursor.get_tokens()]:
+        if '=' in [t.spelling for t in _cursor_get_tokens(cursor)]:
             value = cursor.enum_value
         else:
             value = None
@@ -721,7 +747,7 @@ def _parse_undocumented_block(domain, comments, errors, cursor, nest):
     # For some reason, the Python bindings don't return the cursor kind
     # LINKAGE_SPEC as one would expect, so we need to do it the hard way.
     if cursor.kind == CursorKind.UNEXPOSED_DECL:
-        tokens = cursor.get_tokens()
+        tokens = _cursor_get_tokens(cursor)
         ntoken = next(tokens, None)
         if ntoken and ntoken.spelling == 'extern':
             ntoken = next(tokens, None)