From c0bb443b2eb532be805f811b7d21e35e7add96c2 Mon Sep 17 00:00:00 2001 From: Dustin Spicuzza Date: Sun, 23 Jul 2023 17:03:04 -0400 Subject: [PATCH] Restructure preprocessor support - Remove partial support for #define and other PP directives - Allow pragma to span multiple lines - Pragma now emits a list of tokens instead of a single string --- cxxheaderparser/lexer.py | 55 ++++++++++++++++++++++++++++------ cxxheaderparser/parser.py | 32 +++++++++++++------- cxxheaderparser/simple.py | 16 ++-------- cxxheaderparser/visitor.py | 10 ++----- tests/test_misc.py | 61 +++++++++++++++++++++++++++----------- 5 files changed, 116 insertions(+), 58 deletions(-) diff --git a/cxxheaderparser/lexer.py b/cxxheaderparser/lexer.py index 2b91892..68f2513 100644 --- a/cxxheaderparser/lexer.py +++ b/cxxheaderparser/lexer.py @@ -51,6 +51,7 @@ class LexToken(Protocol): #: private lexer: lex.Lexer + lexmatch: "re.Match" PhonyEnding: LexToken = lex.LexToken() # type: ignore @@ -175,7 +176,10 @@ class PlyLexer: # Comments "COMMENT_SINGLELINE", "COMMENT_MULTILINE", - "PRECOMP_MACRO", + "LINE_MACRO", + "PRAGMA_MACRO", + "INCLUDE_MACRO", + "PP_MACRO", # misc "DIVIDE", "NEWLINE", @@ -434,16 +438,32 @@ def t_NAME(self, t: LexToken) -> LexToken: t.type = t.value return t - @TOKEN(r"\#.*") - def t_PRECOMP_MACRO(self, t: LexToken) -> typing.Optional[LexToken]: - m = _line_re.match(t.value) - if m: - self.filename = m.group(2) + @TOKEN(r'\#[\t ]*line (\d+) "(.*)"') + def t_LINE_MACRO(self, t: LexToken) -> None: + m = t.lexmatch + self.filename = m.group(2) + self.line_offset = 1 + self.lex.lineno - int(m.group(1)) - self.line_offset = 1 + self.lex.lineno - int(m.group(1)) - return None + @TOKEN(r"\#[\t ]*pragma") + def t_PRAGMA_MACRO(self, t: LexToken) -> LexToken: + return t + + @TOKEN(r"\#[\t ]*include (.*)") + def t_INCLUDE_MACRO(self, t: LexToken) -> LexToken: + return t + + @TOKEN(r"\#(.*)") + def t_PP_MACRO(self, t: LexToken): + if "define" in t.value: + msgtype = "#define" else: - return t + msgtype = "preprocessor" + self._error( + "cxxheaderparser does not support " + + msgtype + + " directives, please use a C++ preprocessor first", + t, + ) t_DIVIDE = r"/(?!/)" t_ELLIPSIS = r"\.\.\." @@ -541,6 +561,12 @@ def get_doxygen_after(self) -> typing.Optional[str]: "WHITESPACE", } + _discard_types_except_newline = { + "COMMENT_SINGLELINE", + "COMMENT_MULTILINE", + "WHITESPACE", + } + def token(self) -> LexToken: tokbuf = self.tokbuf while True: @@ -563,6 +589,17 @@ def token_eof_ok(self) -> typing.Optional[LexToken]: if not self._fill_tokbuf(tokbuf): return None + def token_newline_eof_ok(self) -> typing.Optional[LexToken]: + tokbuf = self.tokbuf + while True: + while tokbuf: + tok = tokbuf.popleft() + if tok.type not in self._discard_types_except_newline: + return tok + + if not self._fill_tokbuf(tokbuf): + return None + def token_if(self, *types: str) -> typing.Optional[LexToken]: tok = self.token_eof_ok() if tok is None: diff --git a/cxxheaderparser/parser.py b/cxxheaderparser/parser.py index f6ecdbc..532d679 100644 --- a/cxxheaderparser/parser.py +++ b/cxxheaderparser/parser.py @@ -304,7 +304,8 @@ def parse(self) -> None: "{": self._on_empty_block_start, "}": self._on_block_end, "DBL_LBRACKET": self._consume_attribute_specifier_seq, - "PRECOMP_MACRO": self._process_preprocessor_token, + "INCLUDE_MACRO": self._process_include_macro, + "PRAGMA_MACRO": self._process_pragma_macro, ";": lambda _1, _2: None, } @@ -361,20 +362,29 @@ def parse(self) -> None: _preprocessor_compress_re = re.compile(r"^#[\t ]+") _preprocessor_split_re = re.compile(r"[\t ]+") - def _process_preprocessor_token( - self, tok: LexToken, doxygen: typing.Optional[str] - ) -> None: + def _process_include_macro(self, tok: LexToken, doxygen: typing.Optional[str]): value = self._preprocessor_compress_re.sub("#", tok.value) svalue = self._preprocessor_split_re.split(value, 1) if len(svalue) == 2: self.state.location = tok.location - macro = svalue[0].lower().replace(" ", "") - if macro.startswith("#include"): - self.visitor.on_include(self.state, svalue[1]) - elif macro.startswith("#define"): - self.visitor.on_define(self.state, svalue[1]) - elif macro.startswith("#pragma"): - self.visitor.on_pragma(self.state, svalue[1]) + self.visitor.on_include(self.state, svalue[1]) + else: + raise CxxParseError("incomplete #include directive", tok) + + def _process_pragma_macro(self, _: LexToken, doxygen: typing.Optional[str]): + # consume all tokens until the end of the line + # -- but if we find a paren, get the group + tokens: LexTokenList = [] + while True: + tok = self.lex.token_newline_eof_ok() + if not tok or tok.type == "NEWLINE": + break + if tok.type in self._balanced_token_map: + tokens.extend(self._consume_balanced_tokens(tok)) + else: + tokens.append(tok) + + self.visitor.on_pragma(self.state, self._create_value(tokens)) # # Various diff --git a/cxxheaderparser/simple.py b/cxxheaderparser/simple.py index 65cf04a..6d1d1c1 100644 --- a/cxxheaderparser/simple.py +++ b/cxxheaderparser/simple.py @@ -45,6 +45,7 @@ UsingAlias, UsingDecl, Variable, + Value, ) from .parserstate import ( @@ -123,14 +124,9 @@ class NamespaceScope: Block = typing.Union[ClassScope, NamespaceScope] -@dataclass -class Define: - content: str - - @dataclass class Pragma: - content: str + content: Value @dataclass @@ -171,9 +167,6 @@ class N::C { #: Global namespace namespace: NamespaceScope = field(default_factory=lambda: NamespaceScope()) - #: Any ``#define`` preprocessor directives encountered - defines: typing.List[Define] = field(default_factory=list) - #: Any ``#pragma`` directives encountered pragmas: typing.List[Pragma] = field(default_factory=list) @@ -208,10 +201,7 @@ def __init__(self) -> None: self.data = ParsedData(self.namespace) - def on_define(self, state: State, content: str) -> None: - self.data.defines.append(Define(content)) - - def on_pragma(self, state: State, content: str) -> None: + def on_pragma(self, state: State, content: Value) -> None: self.data.pragmas.append(Pragma(content)) def on_include(self, state: State, filename: str) -> None: diff --git a/cxxheaderparser/visitor.py b/cxxheaderparser/visitor.py index 6f5708c..73b5ad0 100644 --- a/cxxheaderparser/visitor.py +++ b/cxxheaderparser/visitor.py @@ -20,6 +20,7 @@ UsingAlias, UsingDecl, Variable, + Value, ) from .parserstate import ( @@ -36,14 +37,7 @@ class CxxVisitor(Protocol): Defines the interface used by the parser to emit events """ - def on_define(self, state: State, content: str) -> None: - """ - .. warning:: cxxheaderparser intentionally does not have a C preprocessor - implementation. If you are parsing code with macros in it, - use a conforming preprocessor like ``pcpp`` - """ - - def on_pragma(self, state: State, content: str) -> None: + def on_pragma(self, state: State, content: Value) -> None: """ Called once for each ``#pragma`` directive encountered """ diff --git a/tests/test_misc.py b/tests/test_misc.py index 355854a..d54f546 100644 --- a/tests/test_misc.py +++ b/tests/test_misc.py @@ -20,7 +20,6 @@ Pragma, parse_string, ParsedData, - Define, ) # @@ -28,42 +27,70 @@ # -def test_define() -> None: +def test_includes() -> None: content = """ - #define simple - #define complex(thing) stuff(thing) - # define spaced + #include + #include "local.h" + # include "space.h" """ data = parse_string(content, cleandoc=True) assert data == ParsedData( - defines=[ - Define(content="simple"), - Define(content="complex(thing) stuff(thing)"), - Define(content="spaced"), - ], + includes=[Include(""), Include('"local.h"'), Include('"space.h"')] ) -def test_includes() -> None: +def test_pragma() -> None: content = """ - #include - #include "local.h" + + #pragma once + """ data = parse_string(content, cleandoc=True) - assert data == ParsedData(includes=[Include(""), Include('"local.h"')]) + assert data == ParsedData( + pragmas=[Pragma(content=Value(tokens=[Token(value="once")]))] + ) -def test_pragma() -> None: +def test_pragma_more() -> None: content = """ - #pragma once + #pragma (some content here) + #pragma (even \ + more \ + content here) """ data = parse_string(content, cleandoc=True) - assert data == ParsedData(pragmas=[Pragma(content="once")]) + assert data == ParsedData( + pragmas=[ + Pragma( + content=Value( + tokens=[ + Token(value="("), + Token(value="some"), + Token(value="content"), + Token(value="here"), + Token(value=")"), + ] + ) + ), + Pragma( + content=Value( + tokens=[ + Token(value="("), + Token(value="even"), + Token(value="more"), + Token(value="content"), + Token(value="here"), + Token(value=")"), + ] + ) + ), + ] + ) #