parser: workaround cursor.get_tokens() issue with macro expansions
There's a bug in clang that causes cursor.get_tokens() to sometimes fail
when there are macro expansions in the cursor's extent. Even a plain
'bool x' variable or struct member fails to tokenize, because in
stdbool.h bool is a macro rather than a typedef.

I haven't been able to come up with a reproducer that hits this through
our current usage of cursor.get_tokens(). However, the failure is
obvious when tokenizing said 'bool x' directly. Rather than wait until
we hit subtle bugs, defensively work around the issue.

The issue seems to be something going awry in the cursor's extent.
Simply recreating the extent works. The resulting new extent has the
same __repr__, but the two differ under the hood: the new one works, the
old one doesn't. Try the regular cursor.get_tokens() first, and if that
produces no tokens, fall back to recreating the extent and getting the
tokens from the translation unit.

This is likely caused by clang issue [1], or at least something closely
related. See also [2].

[1] llvm/llvm-project#43451
[2] https://stackoverflow.com/questions/16786767/obtain-original-unexpanded-macro-text-using-libclang
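
For illustration only, a rough standalone probe along the lines below
(not part of the commit; the 'probe.c' file name, contents and parse
options are assumptions, though hawkmoth needs a detailed preprocessing
record anyway to see macro definitions) exercises both the failing
cursor.get_tokens() and the recreated-extent fallback:

    # Hypothetical reproducer sketch. On affected clang versions the first
    # print() comes back empty, while the recreated extent yields tokens.
    from clang.cindex import CursorKind, Index, SourceLocation, SourceRange, TranslationUnit

    index = Index.create()
    tu = index.parse('probe.c',
                     unsaved_files=[('probe.c', '#include <stdbool.h>\nbool x;\n')],
                     options=TranslationUnit.PARSE_DETAILED_PROCESSING_RECORD)

    cursor = next(c for c in tu.cursor.get_children() if c.kind == CursorKind.VAR_DECL)

    # The cursor's own extent fails to tokenize...
    print([t.spelling for t in cursor.get_tokens()])

    # ...but an extent recreated from the same coordinates tokenizes fine.
    start, end = cursor.extent.start, cursor.extent.end
    extent = SourceRange.from_locations(
        SourceLocation.from_position(tu, start.file, start.line, start.column),
        SourceLocation.from_position(tu, end.file, end.line, end.column))
    print([t.spelling for t in tu.get_tokens(extent=extent)])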
jnikula committed Oct 5, 2023
1 parent 0726141 commit 83de8c1
Showing 1 changed file with 34 additions and 8 deletions.
42 changes: 34 additions & 8 deletions src/hawkmoth/parser.py
@@ -38,6 +38,7 @@
 from clang.cindex import StorageClass, AccessSpecifier, ExceptionSpecificationKind
 from clang.cindex import Index, TranslationUnit, TranslationUnitLoadError
 from clang.cindex import Diagnostic
+from clang.cindex import SourceLocation, SourceRange
 
 from hawkmoth import docstring
 
@@ -185,6 +186,31 @@ def _comment_extract(tu):
 
     return top_level_comments, comments
 
+# Workaround for clang sometimes failing to tokenize cursor extents with macro
+# expansions in them. A simple 'bool x' variable or struct member cursor will
+# fail to tokenize, because bool is a macro.
+def _cursor_get_tokens(cursor):
+    # Try get_tokens() first
+    tokens = [t for t in cursor.get_tokens()]
+    if tokens:
+        yield from tokens
+        return
+
+    # Fallback to recreating the extent and getting the tokens from the
+    # translation unit. Notably the repr for both extent and cursor.extent will
+    # be the same, but under the hood there's something wrong.
+    tu = cursor.translation_unit
+
+    start = cursor.extent.start
+    start = SourceLocation.from_position(tu, start.file, start.line, start.column)
+
+    end = cursor.extent.end
+    end = SourceLocation.from_position(tu, end.file, end.line, end.column)
+
+    extent = SourceRange.from_locations(start, end)
+
+    yield from tu.get_tokens(extent=extent)
+
 def _get_meta(comment, cursor=None):
     meta = {'line': comment.extent.start.line}
     if cursor:
@@ -200,7 +226,7 @@ def _get_macro_args(cursor):
     if cursor.kind != CursorKind.MACRO_DEFINITION:
         return None
 
-    tokens = cursor.get_tokens()
+    tokens = _cursor_get_tokens(cursor)
 
     # Use the first two tokens to make sure this starts with 'IDENTIFIER('
     one = next(tokens)
@@ -246,7 +272,7 @@ def _get_function_quals(cursor):
     Returns:
         List of (prefix) function qualifiers.
     """
-    tokens = [t.spelling for t in cursor.get_tokens()]
+    tokens = [t.spelling for t in _cursor_get_tokens(cursor)]
     quals = []
 
     if 'static' in tokens:
@@ -262,7 +288,7 @@ def _get_method_quals(cursor):
     Returns:
         List of prefix method qualifiers and list of suffix method qualifiers.
     """
-    tokens = [t.spelling for t in cursor.get_tokens()]
+    tokens = [t.spelling for t in _cursor_get_tokens(cursor)]
     pre_quals = []
     pos_quals = []
 
@@ -338,7 +364,7 @@ def _get_template_line(cursor):
     # We can do it by looking at the tokens directly. This is slightly
     # complicated due to variadic template type parameters.
     def typetype(cursor):
-        tokens = list(cursor.get_tokens())
+        tokens = list(_cursor_get_tokens(cursor))
         if tokens[-2].spelling == '...':
             return f'{tokens[-3].spelling}...'
         else:
@@ -373,7 +399,7 @@ def _specifiers_fixup(cursor, basetype):
     Returns:
         List of C++ specifiers for the cursor.
     """
-    tokens = [t.spelling for t in cursor.get_tokens()]
+    tokens = [t.spelling for t in _cursor_get_tokens(cursor)]
     type_elem = []
 
     if 'mutable' in tokens:
@@ -397,7 +423,7 @@ def _get_scopedenum_type(cursor):
         ``None`` otherwise.
     """
     if cursor.kind == CursorKind.ENUM_DECL and cursor.is_scoped_enum():
-        if list(cursor.get_tokens())[3].spelling == ':':
+        if list(_cursor_get_tokens(cursor))[3].spelling == ':':
             return f': {cursor.enum_type.spelling}'
     return None
 
@@ -660,7 +686,7 @@ def _recursive_parse(domain, comments, errors, cursor, nest):
 
     elif cursor.kind == CursorKind.ENUM_CONSTANT_DECL:
         # Show enumerator value if it's explicitly set in source
-        if '=' in [t.spelling for t in cursor.get_tokens()]:
+        if '=' in [t.spelling for t in _cursor_get_tokens(cursor)]:
             value = cursor.enum_value
         else:
             value = None
@@ -721,7 +747,7 @@ def _parse_undocumented_block(domain, comments, errors, cursor, nest):
     # For some reason, the Python bindings don't return the cursor kind
     # LINKAGE_SPEC as one would expect, so we need to do it the hard way.
     if cursor.kind == CursorKind.UNEXPOSED_DECL:
-        tokens = cursor.get_tokens()
+        tokens = _cursor_get_tokens(cursor)
         ntoken = next(tokens, None)
         if ntoken and ntoken.spelling == 'extern':
             ntoken = next(tokens, None)