From 9a5b3c2ca8396f546356115c6602e2f6030d340c Mon Sep 17 00:00:00 2001
From: Francesco Bertolaccini <francesco.bertolaccini@trailofbits.com>
Date: Tue, 30 Jul 2024 15:39:35 +0200
Subject: [PATCH] sarif: add generator script

---
 sarif/scripts/sarif-headergen/.gitignore     | 162 +++++++++
 sarif/scripts/sarif-headergen/README.md      |   7 +
 sarif/scripts/sarif-headergen/pyproject.toml |  19 +
 .../src/sarif_headergen/__cli__.py           | 331 ++++++++++++++++++
 .../src/sarif_headergen/__init__.py          |   5 +
 .../src/sarif_headergen/output.py            | 185 ++++++++++
 .../src/sarif_headergen/schema.py            |  36 ++
 7 files changed, 745 insertions(+)
 create mode 100644 sarif/scripts/sarif-headergen/.gitignore
 create mode 100644 sarif/scripts/sarif-headergen/README.md
 create mode 100644 sarif/scripts/sarif-headergen/pyproject.toml
 create mode 100644 sarif/scripts/sarif-headergen/src/sarif_headergen/__cli__.py
 create mode 100644 sarif/scripts/sarif-headergen/src/sarif_headergen/__init__.py
 create mode 100644 sarif/scripts/sarif-headergen/src/sarif_headergen/output.py
 create mode 100644 sarif/scripts/sarif-headergen/src/sarif_headergen/schema.py

diff --git a/sarif/scripts/sarif-headergen/.gitignore b/sarif/scripts/sarif-headergen/.gitignore
new file mode 100644
index 0000000..3a8816c
--- /dev/null
+++ b/sarif/scripts/sarif-headergen/.gitignore
@@ -0,0 +1,162 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+# Usually these files are written by a python script from a template
+# before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+# For a library or package, you might want to ignore these files since the code is
+# intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+# However, in case of collaboration, if having platform-specific dependencies or dependencies
+# having no cross-platform support, pipenv may install dependencies that don't work, or not
+# install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+# Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+# This is especially recommended for binary packages to ensure reproducibility, and is more
+# commonly ignored for libraries.
+# https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+# Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+# pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+# in version control.
+# https://pdm-project.org/#use-with-ide
+.pdm.toml
+.pdm-python
+.pdm-build/
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+# JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+# be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+# and can be added to the global gitignore or merged into this file. For a more nuclear
+# option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
diff --git a/sarif/scripts/sarif-headergen/README.md b/sarif/scripts/sarif-headergen/README.md
new file mode 100644
index 0000000..e4eb624
--- /dev/null
+++ b/sarif/scripts/sarif-headergen/README.md
@@ -0,0 +1,7 @@
+# sarif-headergen
+
+Generates nlohmann::json (de)serialization files from the JSON schema for SARIF files.
+
+## Usage
+
+    $ python3 -m src.sarif_headergen.__cli__ sarif.json out.hpp out.cpp
diff --git a/sarif/scripts/sarif-headergen/pyproject.toml b/sarif/scripts/sarif-headergen/pyproject.toml
new file mode 100644
index 0000000..74bf50d
--- /dev/null
+++ b/sarif/scripts/sarif-headergen/pyproject.toml
@@ -0,0 +1,19 @@
+[project]
+name = "sarif-headergen"
+version = "0.1.0"
+description = "Generates nlohmann::json (de)serialization files from the JSON schema for SARIF files"
+authors = [
+    {name = "Francesco Bertolaccini", email = "francesco.bertolaccini@trailofbits.com"},
+]
+dependencies = []
+requires-python = ">=3.12"
+readme = "README.md"
+license = {text = "MIT"}
+
+[build-system]
+requires = ["pdm-backend"]
+build-backend = "pdm.backend"
+
+
+[tool.pdm]
+distribution = true
diff --git a/sarif/scripts/sarif-headergen/src/sarif_headergen/__cli__.py b/sarif/scripts/sarif-headergen/src/sarif_headergen/__cli__.py
new file mode 100644
index 0000000..8394f5e
--- /dev/null
+++ b/sarif/scripts/sarif-headergen/src/sarif_headergen/__cli__.py
@@ -0,0 +1,331 @@
+"""Generate nlohmann::json (de)serialization code from the SARIF JSON schema.
+
+Reads a JSON schema and writes a C++ header with the struct and enum
+declarations derived from it, plus a C++ source file with the matching
+``to_json`` and ``from_json`` definitions.
+"""
+
+import argparse
+import json
+import string
+from collections.abc import Callable, Sequence
+from functools import partial
+from typing import TextIO
+
+from .output import get_structs, Enum, Struct, ForwardRef, Array, FieldType, Map, Field
+from .schema import Schema
+
+# C++ spelling of the primitive field types.
+_PRIMITIVE_TYPE_NAMES = {
+    "boolean": "bool",
+    "number": "double",
+    "integer": "int64_t",
+    "string": "std::string",
+    "json": "json",
+}
+
+# Characters replaced by underscores when building C++ identifiers.
+_IDENTIFIER_TABLE = str.maketrans({"-": "_", ".": "_", " ": "_"})
+
+# Maps every ASCII uppercase letter to "_" followed by its lowercase form.
+_SNAKE_CASE_TABLE = str.maketrans(
+    {letter: f"_{letter.lower()}" for letter in string.ascii_uppercase}
+)
+
+# Escapes applied to string defaults before they are quoted as C++ literals.
+_CPP_STRING_ESCAPES = str.maketrans({
+    '"': '\\"',
+    "\r": "\\r",
+    "\n": "\\n",
+    "\\": "\\\\",
+})
+
+# NOTE(review): the targets of the #include directives were missing from the
+# reviewed copy of this script. The standard headers below are the ones that
+# declare the types get_type_name() emits; confirm against upstream.
+_HEADER_PROLOGUE = """// Copyright 2024-present, Trail of Bits, Inc.
+
+#pragma once
+
+//
+// These definitions were generated from a JSON Schema description of SARIF
+// found at https://github.com/microsoft/sarif-python-om/blob/7a84e8c2b2b9d8b9a8d25b1d376f039a0bf92a7c/sarif-schema-2.1.0.json
+//
+// The naming convention used is to convert all of the definition names from camelCase to snake_case.
+// "Nested" definitions are prefixed by the name of the parent definition, for example the `headers`
+// property of the `webRequest` definition is called web_request_headers, in order to
+// distinguish it from the `headers` property of the `webResponse` definition.
+// Property names are kept as-is.
+//
+// Enum value names are converted from camelCase to PascalCase and prefixed with `k`.
+//
+
+#include <cstdint>
+#include <forward_list>
+#include <optional>
+#include <string>
+#include <unordered_map>
+
+#include <nlohmann/json.hpp>
+
+namespace gap::sarif
+{
+ using json = nlohmann::json;"""
+
+# NOTE(review): this include target was missing as well; the path below is
+# presumed to be where the generated header is installed -- confirm.
+_SOURCE_PROLOGUE = """// Copyright (c) 2024-present, Trail of Bits, Inc.
+
+#include <gap/sarif/definitions.hpp>
+
+namespace gap::sarif {"""
+
+# Signature of the line writers passed around below: print() bound to a file.
+type Emit = Callable[..., None]
+
+
+def get_type_name(type: FieldType, field_name: str | None = None) -> str:
+    """Return the C++ type expression used for ``type``.
+
+    Structs, enums and forward references map to their snake_case name,
+    arrays to ``std::forward_list``, maps to ``std::unordered_map`` keyed by
+    ``std::string``, and primitives to the names in ``_PRIMITIVE_TYPE_NAMES``.
+
+    ``field_name`` is the name of the member being declared, if any. When the
+    resulting type name is identical to it, the ``::gap::sarif::``-qualified
+    name is returned instead (presumably so that the member does not hide the
+    type it is declared with).
+
+    Raises:
+        ValueError: if ``type`` is not a known field type.
+    """
+
+    def _get_type_name(node: FieldType) -> str:
+        if isinstance(node, (Enum, Struct, ForwardRef)):
+            return to_snake_case(node.name)
+        if isinstance(node, Array):
+            return f"std::forward_list< {get_type_name(node.subtype, field_name)} >"
+        if isinstance(node, Map):
+            return f"std::unordered_map< std::string, {get_type_name(node.subtype, field_name)} >"
+        try:
+            return _PRIMITIVE_TYPE_NAMES[node]
+        except (KeyError, TypeError):
+            # Fail loudly rather than letting "None" leak into the generated C++.
+            raise ValueError(f"Unsupported field type: {node!r}") from None
+
+    name = _get_type_name(type)
+    if field_name is not None and name == field_name:
+        return f"::gap::sarif::{name}"
+    return name
+
+
+def sanitize(name: str) -> str:
+    """Replace ``-``, ``.`` and spaces in ``name`` with underscores."""
+    return name.translate(_IDENTIFIER_TABLE)
+
+
+def to_snake_case(name: str) -> str:
+    """Convert camelCase ``name`` to snake_case.
+
+    Each ASCII uppercase letter is replaced by an underscore and its
+    lowercase form, so a leading capital yields a leading underscore.
+    """
+    return name.translate(_SNAKE_CASE_TABLE)
+
+
+def to_pascal_case(name: str) -> str:
+    """Upper-case the first character of ``name``; an empty name stays empty."""
+    return name[:1].upper() + name[1:]
+
+
+def get_default_value(field: Field) -> str:
+    """Return the C++ initializer for the schema default of ``field``.
+
+    List defaults become brace-enclosed initializer lists whose elements are
+    formatted according to the array's element type.
+    """
+    element_type = field.type.subtype if isinstance(field.type, Array) else field.type
+
+    def stringify(val) -> str:
+        if element_type == "string":
+            return f'"{val.translate(_CPP_STRING_ESCAPES)}"'
+        if isinstance(element_type, Enum):
+            return f"{get_type_name(element_type, field.name)}::k{sanitize(to_pascal_case(val))}"
+        if isinstance(val, bool):
+            return str(val).lower()
+        return str(val)
+
+    if isinstance(field.default, list):
+        if not field.default:
+            return "{}"
+        return f"{{ {', '.join(map(stringify, field.default))} }}"
+    return stringify(field.default)
+
+
+def _emit_comment(emit: Emit, text: str | None) -> None:
+    """Write ``text`` as a ``//`` comment block; write nothing if it is missing."""
+    if not text:
+        return
+    emit(" //")
+    for line in text.splitlines():
+        emit(f" // {line}")
+    emit(" //")
+
+
+def _write_struct_declaration(elem: Struct, emit: Emit) -> None:
+    """Write the struct for ``elem`` and its to_json/from_json prototypes."""
+    type_name = get_type_name(elem)
+    emit()
+    _emit_comment(emit, elem.description)
+    # No newline after "{": the blank line written before each member ends it.
+    emit(f" struct {type_name} {{", end="")
+    for field in elem.fields:
+        emit()
+        _emit_comment(emit, field.description)
+        field_type = get_type_name(field.type, field.name)
+        if field.required:
+            emit(f" {field_type} {field.name};")
+        elif field.default is not None:
+            emit(f" {field_type} {field.name} = {get_default_value(field)};")
+        elif isinstance(field.type, Array):
+            emit(f" {field_type} {field.name} = {{}};")
+        else:
+            emit(f" std::optional< {field_type} > {field.name} = std::nullopt;")
+    if elem.additional_props is not None:
+        emit()
+        if elem.additional_props == "json":
+            emit(" json additional_properties;")
+        else:
+            emit(f" std::unordered_map< std::string, {get_type_name(elem.additional_props)} > additional_properties;")
+
+    emit(" };")
+    emit()
+    emit(f" void to_json(json &, const {type_name} &);")
+    emit(f" void from_json(const json &, {type_name} &);")
+
+
+def _write_enum_declaration(elem: Enum, emit: Emit) -> None:
+    """Write the enum class for ``elem`` and its JSON string mapping."""
+    type_name = get_type_name(elem)
+    enumerators = [(f"k{sanitize(to_pascal_case(value))}", value) for value in elem.values]
+    emit()
+    emit(f" enum class {type_name} {{")
+    for enumerator, _ in enumerators:
+        emit(f" {enumerator},")
+    emit(" };")
+    emit()
+    emit(f" NLOHMANN_JSON_SERIALIZE_ENUM({type_name}, {{")
+    for enumerator, value in enumerators:
+        emit(f" {{ {type_name}::{enumerator}, \"{value}\" }},")
+    emit(" })")
+
+
+def _write_header(output: list[Struct | Enum | ForwardRef], out: TextIO) -> None:
+    """Write the C++ header declaring every element of ``output`` to ``out``."""
+    emit = partial(print, file=out)
+    emit(_HEADER_PROLOGUE)
+    for elem in output:
+        if isinstance(elem, ForwardRef):
+            emit(f" struct {get_type_name(elem)};")
+        elif isinstance(elem, Struct):
+            _write_struct_declaration(elem, emit)
+        elif isinstance(elem, Enum):
+            _write_enum_declaration(elem, emit)
+    emit("} // namespace gap::sarif")
+
+
+def _write_from_json(elem: Struct, emit: Emit) -> None:
+    """Write the ``from_json`` definition for ``elem``."""
+    emit(f" void from_json(const json &j, {get_type_name(elem)} &o) {{")
+    emit(" for( auto &[key, val] : j.items() ) {")
+    for field in elem.fields:
+        emit(f" if ( key == \"{field.name}\" ) {{")
+        if field.required or field.default is not None:
+            emit(f" val.get_to(o.{field.name});")
+        else:
+            # Read into a temporary of the underlying type; the member itself
+            # may be declared as a std::optional in the header.
+            emit(f" {get_type_name(field.type)} field;")
+            emit(" val.get_to(field);")
+            emit(f" o.{field.name} = field;")
+        emit(" continue;")
+        emit(" }")
+    if elem.additional_props is not None:
+        # Reached only for keys that matched none of the declared fields.
+        emit(" val.get_to(o.additional_properties[key]);")
+    emit(" }")
+    emit(" }")
+
+
+def _write_to_json(elem: Struct, emit: Emit) -> None:
+    """Write the ``to_json`` definition for ``elem``.
+
+    Required fields are always serialized; the others only when they differ
+    from their default, are non-empty, or hold a value.
+    """
+    emit(f" void to_json(json &j, const {get_type_name(elem)} &o) {{")
+    for field in elem.fields:
+        assignment = f" j[\"{field.name}\"] = o.{field.name};"
+        if field.required:
+            emit(assignment)
+            continue
+        if field.default is not None:
+            if not isinstance(field.default, list):
+                emit(f" if ( o.{field.name} != {get_default_value(field)} ) {{")
+            elif field.default:
+                emit(f" if ( o.{field.name} != decltype(o.{field.name}){get_default_value(field)} ) {{")
+            else:
+                emit(f" if ( !o.{field.name}.empty() ) {{")
+            emit(assignment)
+        elif isinstance(field.type, Array):
+            emit(f" if ( !o.{field.name}.empty() ) {{")
+            emit(assignment)
+        else:
+            emit(f" if ( o.{field.name}.has_value() ) {{")
+            emit(f" j[\"{field.name}\"] = *o.{field.name};")
+        emit(" }")
+    if elem.additional_props is not None:
+        if elem.additional_props == "json":
+            emit(" for ( auto &[key, val] : o.additional_properties.items() ) {")
+        else:
+            emit(" for ( auto &[key, val] : o.additional_properties ) {")
+        emit(" j[key] = val;")
+        emit(" }")
+    emit(" }")
+
+
+def _write_source(output: list[Struct | Enum | ForwardRef], out: TextIO) -> None:
+    """Write the C++ source defining (de)serialization for every struct."""
+    emit = partial(print, file=out)
+    emit(_SOURCE_PROLOGUE)
+    for elem in output:
+        if isinstance(elem, Struct):
+            _write_from_json(elem, emit)
+            _write_to_json(elem, emit)
+    emit("} // namespace gap::sarif")
+
+
+def main(argv: Sequence[str] | None = None) -> None:
+    """Parse the command line and generate the header and source files.
+
+    ``argv`` defaults to ``sys.argv[1:]`` when omitted.
+    """
+    parser = argparse.ArgumentParser()
+    parser.add_argument("input_schema")
+    parser.add_argument("output_header")
+    parser.add_argument("output_source")
+    args = parser.parse_args(argv)
+
+    # JSON text is UTF-8, so do not depend on the locale's default encoding.
+    with open(args.input_schema, encoding="utf-8") as schema_file:
+        schema: Schema = json.load(schema_file)
+    output = get_structs(schema)
+
+    with open(args.output_header, mode="w", encoding="utf-8") as output_header:
+        _write_header(output, output_header)
+    with open(args.output_source, mode="w", encoding="utf-8") as output_source:
+        _write_source(output, output_source)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/sarif/scripts/sarif-headergen/src/sarif_headergen/__init__.py b/sarif/scripts/sarif-headergen/src/sarif_headergen/__init__.py
new file mode 100644
index 0000000..923b5d0
--- /dev/null
+++ b/sarif/scripts/sarif-headergen/src/sarif_headergen/__init__.py
@@ -0,0 +1,5 @@
+"""Generator of nlohmann::json (de)serialization code for the SARIF schema."""
+
+from .schema import Schema
+
+__all__ = ["Schema"]
diff --git a/sarif/scripts/sarif-headergen/src/sarif_headergen/output.py b/sarif/scripts/sarif-headergen/src/sarif_headergen/output.py
new file mode 100644
index 0000000..b3dbcc5
--- /dev/null
+++ b/sarif/scripts/sarif-headergen/src/sarif_headergen/output.py
@@ -0,0 +1,185 @@
+"""Intermediate representation of the declarations derived from a schema."""
+
+from dataclasses import dataclass
+from functools import cache
+from typing import Literal
+
+from .schema import Schema
+
+# Primitive schema types are represented by their name and "json" stands for
+# an unconstrained value; everything else is one of the dataclasses below.
+# A `type` statement is evaluated lazily, so the classes can be named here
+# before they are defined.
+type FieldType = (
+    Literal["string", "number", "boolean", "integer", "json"]
+    | Array
+    | Struct
+    | Enum
+    | ForwardRef
+    | Map
+)
+
+# Prefix shared by every `$ref` the generator can resolve.
+_DEFINITIONS_PREFIX = "#/definitions/"
+
+
+@dataclass(frozen=True)
+class Array:
+    """A JSON array whose items have type ``subtype``."""
+
+    subtype: FieldType
+
+
+@dataclass(frozen=True)
+class Field:
+    """One property of a :class:`Struct`."""
+
+    name: str
+    type: FieldType
+    description: str | None
+    required: bool
+    default: str | int | float | bool | list | None
+
+
+@dataclass(frozen=True)
+class Enum:
+    """A named set of allowed string values."""
+
+    name: str
+    values: list[str]
+
+
+@dataclass(frozen=True)
+class Struct:
+    """A JSON object with a fixed set of named properties."""
+
+    name: str
+    fields: list[Field]
+    description: str | None
+    # Type of the properties not listed in `fields`; None when the schema has
+    # no `additionalProperties` key or sets it to false.
+    additional_props: FieldType | None
+
+
+@dataclass(frozen=True)
+class ForwardRef:
+    """Placeholder for a definition referenced before it was translated."""
+
+    name: str
+
+
+@dataclass(frozen=True)
+class Map:
+    """A JSON object with arbitrary keys and values of type ``subtype``."""
+
+    subtype: FieldType
+
+
+def get_structs(schema: Schema) -> list[Struct | Enum | ForwardRef]:
+    """Translate ``schema`` into the list of declarations to generate.
+
+    The schema root becomes a struct named ``root``, then every entry of
+    ``definitions`` is translated. A declaration is appended to the result
+    once it is complete, so a struct follows the types its fields refer to.
+    Array items that reference a definition which has not been translated yet
+    are represented by a :class:`ForwardRef`, which is appended as well.
+
+    Raises:
+        ValueError: if a schema node has an unsupported ``type``, or has none
+            of the ``$ref``, ``type`` and ``enum`` keys.
+    """
+    assert "definitions" in schema
+
+    res: list[Struct | Enum | ForwardRef] = []
+    # Everything translated so far, keyed by name.
+    defs: dict[str, FieldType] = {}
+
+    definitions = schema["definitions"]
+
+    def get_fwd_ref(struct: str | None, name: str, field: Schema) -> FieldType:
+        """Translate ``field`` without descending into the target of a ``$ref``."""
+        if "$ref" not in field:
+            return get_field(struct, name, field)
+        ref_name = field["$ref"].removeprefix(_DEFINITIONS_PREFIX)
+        if ref_name not in defs:
+            fwd = ForwardRef(ref_name)
+            res.append(fwd)
+            defs[ref_name] = fwd
+        return defs[ref_name]
+
+    def get_field(struct: str | None, name: str, field: Schema) -> FieldType:
+        """Translate the schema node ``field``, found under ``name`` in ``struct``."""
+        if "$ref" in field:
+            return get_ref(None, field["$ref"])
+
+        if "type" in field:
+            kind = field["type"]
+            if kind in ("boolean", "number", "string", "integer"):
+                return kind
+            if kind == "array":
+                return Array(subtype=get_fwd_ref(None, name, field["items"]))
+            if kind == "object":
+                additional = field.get("additionalProperties", False)
+                # A nested object without declared properties is a plain
+                # dictionary rather than a struct of its own.
+                if struct is not None and "properties" not in field and additional is not False:
+                    if additional is True:
+                        return "json"
+                    return Map(subtype=get_field(struct, name, additional))
+                return make_struct(f"{struct}_{name}" if struct is not None else name, field)
+            raise ValueError(f"Invalid type {kind!r} for {name!r}")
+
+        if "enum" in field:
+            # Enums are shared by name: the first one translated wins.
+            if name not in defs:
+                enum = Enum(name, field["enum"])
+                res.append(enum)
+                defs[name] = enum
+            return defs[name]
+
+        raise ValueError(f"Unknown field type for {name!r}")
+
+    @cache
+    def get_ref(struct: str | None, ref: str) -> FieldType:
+        """Translate the definition ``ref`` points to, at most once per argument pair."""
+        assert ref.startswith(_DEFINITIONS_PREFIX)
+        name = ref[len(_DEFINITIONS_PREFIX):]
+        return get_field(struct, name, definitions[name])
+
+    def make_struct(name: str, definition: Schema) -> Struct:
+        """Translate the object schema ``definition`` and record it as ``name``."""
+        assert definition["type"] == "object"
+        additional_props: FieldType | None = None
+        additional = definition.get("additionalProperties", False)
+        if additional is True:
+            additional_props = "json"
+        elif additional is not False:
+            additional_props = get_field(name, "additional_properties", additional)
+        required_names = definition.get("required", ())
+        fields = [
+            Field(
+                name=field_name,
+                type=get_field(name, field_name, field_type),
+                description=field_type.get("description"),
+                required=field_name in required_names,
+                default=field_type.get("default"),
+            )
+            for field_name, field_type in definition.get("properties", {}).items()
+            if field_name != "$schema"
+        ]
+        struct = Struct(
+            name,
+            fields=fields,
+            description=definition.get("description"),
+            additional_props=additional_props,
+        )
+        res.append(struct)
+        defs[name] = struct
+        return struct
+
+    make_struct("root", schema)
+
+    for name in definitions:
+        get_ref(None, f"{_DEFINITIONS_PREFIX}{name}")
+
+    return res
diff --git a/sarif/scripts/sarif-headergen/src/sarif_headergen/schema.py b/sarif/scripts/sarif-headergen/src/sarif_headergen/schema.py
new file mode 100644
index 0000000..c3d02dc
--- /dev/null
+++ b/sarif/scripts/sarif-headergen/src/sarif_headergen/schema.py
@@ -0,0 +1,36 @@
+"""Typed description of the JSON Schema nodes handled by the generator."""
+
+from enum import Enum
+from typing import NotRequired, TypedDict, Union
+
+
+class Type(str, Enum):
+    """Values accepted for the ``type`` key of a schema node."""
+
+    null = "null"
+    boolean = "boolean"
+    object = "object"
+    array = "array"
+    number = "number"
+    string = "string"
+    integer = "integer"
+
+
+# The functional TypedDict form is needed because "$ref" is not a valid
+# Python identifier.
+Schema = TypedDict(
+    "Schema",
+    {
+        "title": NotRequired[str],
+        "description": NotRequired[str],
+        "type": NotRequired[Type],
+        "$ref": NotRequired[str],
+        "items": NotRequired["Schema"],
+        "properties": NotRequired[dict[str, "Schema"]],
+        "additionalProperties": NotRequired[Union["Schema", bool]],
+        "definitions": NotRequired[dict[str, "Schema"]],
+        "enum": NotRequired[list[str]],
+        "required": NotRequired[list[str]],
+        "default": NotRequired[str | int | float | bool | list],
+    },
+)