From 9a5b3c2ca8396f546356115c6602e2f6030d340c Mon Sep 17 00:00:00 2001
From: Francesco Bertolaccini <francesco.bertolaccini@trailofbits.com>
Date: Tue, 30 Jul 2024 15:39:35 +0200
Subject: [PATCH] sarif: add generator script

---
 sarif/scripts/sarif-headergen/.gitignore      | 162 +++++++++++++
 sarif/scripts/sarif-headergen/README.md       |   7 +
 sarif/scripts/sarif-headergen/pyproject.toml  |  19 ++
 .../src/sarif_headergen/__cli__.py            | 228 ++++++++++++++++++
 .../src/sarif_headergen/__init__.py           |   3 +
 .../src/sarif_headergen/output.py             | 138 +++++++++++
 .../src/sarif_headergen/schema.py             |  30 +++
 7 files changed, 587 insertions(+)
 create mode 100644 sarif/scripts/sarif-headergen/.gitignore
 create mode 100644 sarif/scripts/sarif-headergen/README.md
 create mode 100644 sarif/scripts/sarif-headergen/pyproject.toml
 create mode 100644 sarif/scripts/sarif-headergen/src/sarif_headergen/__cli__.py
 create mode 100644 sarif/scripts/sarif-headergen/src/sarif_headergen/__init__.py
 create mode 100644 sarif/scripts/sarif-headergen/src/sarif_headergen/output.py
 create mode 100644 sarif/scripts/sarif-headergen/src/sarif_headergen/schema.py

diff --git a/sarif/scripts/sarif-headergen/.gitignore b/sarif/scripts/sarif-headergen/.gitignore
new file mode 100644
index 0000000..3a8816c
--- /dev/null
+++ b/sarif/scripts/sarif-headergen/.gitignore
@@ -0,0 +1,162 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+cover/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+.pybuilder/
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+#   For a library or package, you might want to ignore these files since the code is
+#   intended to run in multiple environments; otherwise, check them in:
+# .python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# poetry
+#   Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
+#   This is especially recommended for binary packages to ensure reproducibility, and is more
+#   commonly ignored for libraries.
+#   https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
+#poetry.lock
+
+# pdm
+#   Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
+#pdm.lock
+#   pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
+#   in version control.
+#   https://pdm-project.org/#use-with-ide
+.pdm.toml
+.pdm-python
+.pdm-build/
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# pytype static type analyzer
+.pytype/
+
+# Cython debug symbols
+cython_debug/
+
+# PyCharm
+#  JetBrains specific template is maintained in a separate JetBrains.gitignore that can
+#  be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
+#  and can be added to the global gitignore or merged into this file.  For a more nuclear
+#  option (not recommended) you can uncomment the following to ignore the entire idea folder.
+#.idea/
diff --git a/sarif/scripts/sarif-headergen/README.md b/sarif/scripts/sarif-headergen/README.md
new file mode 100644
index 0000000..e4eb624
--- /dev/null
+++ b/sarif/scripts/sarif-headergen/README.md
@@ -0,0 +1,7 @@
+# sarif-headergen
+
+Generates nlohmann::json (de)serialization files from the JSON schema for SARIF files.
+
+## Usage
+
+    $ python3 -m src.sarif_headergen.__cli__ sarif.json out.hpp out.cpp
diff --git a/sarif/scripts/sarif-headergen/pyproject.toml b/sarif/scripts/sarif-headergen/pyproject.toml
new file mode 100644
index 0000000..74bf50d
--- /dev/null
+++ b/sarif/scripts/sarif-headergen/pyproject.toml
@@ -0,0 +1,19 @@
+[project]
+name = "sarif-headergen"
+version = "0.1.0"
+description = "Generates nlohmann::json (de)serialization files from the JSON schema for SARIF files"
+authors = [
+    {name = "Francesco Bertolaccini", email = "francesco.bertolaccini@trailofbits.com"},
+]
+dependencies = []
+requires-python = ">=3.12"
+readme = "README.md"
+license = {text = "MIT"}
+
+[build-system]
+requires = ["pdm-backend"]
+build-backend = "pdm.backend"
+
+
+[tool.pdm]
+distribution = true
diff --git a/sarif/scripts/sarif-headergen/src/sarif_headergen/__cli__.py b/sarif/scripts/sarif-headergen/src/sarif_headergen/__cli__.py
new file mode 100644
index 0000000..8394f5e
--- /dev/null
+++ b/sarif/scripts/sarif-headergen/src/sarif_headergen/__cli__.py
@@ -0,0 +1,228 @@
+import json
+from .output import get_structs, Enum, Struct, ForwardRef, Array, FieldType, Map, Field
+from .schema import Schema
+
+
+def get_type_name(type: FieldType, field_name: str | None = None):
+    """Return the C++ spelling of a schema field type.
+
+    Enums, structs and forward references use their snake_case name, arrays
+    become std::forward_list and maps std::unordered_map keyed by std::string.
+    A spelling equal to ``field_name`` is qualified with ``::gap::sarif::``
+    (presumably so a member named like its type does not hide that type).
+    Raises TypeError instead of returning None for an unrecognised type.
+    """
+    primitives = {
+        "boolean": "bool", "number": "double", "integer": "int64_t",
+        "string": "std::string", "json": "json",
+    }
+    if isinstance(type, (Enum, Struct, ForwardRef)):
+        name = to_snake_case(type.name)
+    elif isinstance(type, Array):
+        name = f"std::forward_list< {get_type_name(type.subtype, field_name)} >"
+    elif isinstance(type, Map):
+        name = f"std::unordered_map< std::string, {get_type_name(type.subtype, field_name)} >"
+    elif isinstance(type, str) and type in primitives:
+        name = primitives[type]
+    else:
+        raise TypeError(f"unsupported field type: {type!r}")
+    if field_name is not None and name == field_name:
+        return f"::gap::sarif::{name}"
+    return name
+
+def sanitize(name: str) -> str:
+    """Return *name* with every "-", "." and " " replaced by "_"."""
+    cleaned = name
+    for illegal in ("-", ".", " "):
+        cleaned = cleaned.replace(illegal, "_")
+    return cleaned
+
+def to_snake_case(name: str) -> str:
+    """Rewrite each ASCII capital as "_" plus its lower-case form ("webRequest" -> "web_request")."""
+    return "".join(
+        f"_{ch.lower()}" if "A" <= ch <= "Z" else ch for ch in name
+    )
+
+def to_pascal_case(name: str) -> str:  # upper-case only the first character
+    return name[:1].upper() + name[1:]  # slicing keeps "" valid; name[0] raised IndexError
+
+def get_default_value(field: Field) -> str:
+    """Render the schema default of *field* as a C++ initializer.
+
+    A list default yields a braced initializer list ("{}" when empty); any
+    other default yields a single rendered value. For array fields the
+    element type decides how each value is spelled.
+    """
+    element_type = field.type.subtype if isinstance(field.type, Array) else field.type
+
+    def render(value) -> str:
+        # Strings become quoted literals with quotes, CR, LF and backslashes escaped.
+        if element_type == "string":
+            escapes = {'"': '\\"', "\r": "\\r", "\n": "\\n", "\\": "\\\\"}
+            return '"' + value.translate(str.maketrans(escapes)) + '"'
+        # Enum values name the k-prefixed enumerator of the field's enum type.
+        if isinstance(element_type, Enum):
+            enumerator = sanitize(to_pascal_case(value))
+            return f"{get_type_name(element_type, field.name)}::k{enumerator}"
+        # Booleans are lower-cased; anything else is printed with str().
+        return str(value).lower() if isinstance(value, bool) else str(value)
+
+    default = field.default
+    if not isinstance(default, list):
+        return render(default)
+    if not default:
+        return "{}"
+    return "{ " + ", ".join(render(item) for item in default) + " }"
+
+if __name__ == "__main__":
+    import argparse
+    parser = argparse.ArgumentParser()
+    parser.add_argument("input_schema")
+    parser.add_argument("output_header")
+    parser.add_argument("output_source")
+
+    args = parser.parse_args()
+
+    # All three files are opened as UTF-8 explicitly, so neither parsing the
+    # schema nor writing the generated C++ depends on the locale's default
+    # encoding. Generation runs while the schema file is still open.
+    with open(args.input_schema, encoding="utf-8") as schema_file:
+        schema: Schema = json.load(schema_file)
+        output = get_structs(schema)
+        with open(args.output_header, mode="w", encoding="utf-8") as output_header:
+            print("""// Copyright 2024-present, Trail of Bits, Inc.
+
+#pragma once
+
+//
+// These definitions were generated from a JSON Schema description of SARIF
+// found at https://github.com/microsoft/sarif-python-om/blob/7a84e8c2b2b9d8b9a8d25b1d376f039a0bf92a7c/sarif-schema-2.1.0.json
+//
+// The naming convention used is to convert all of the definition names from camelCase to snake_case,
+// suffixed with either _struct or _enum depending on the kind.
+// "Nested" definitions are prefixed by the name of the parent definition, for example the `headers`
+// property of the `webRequest` definition is called web_request_headers_struct, in order to
+// distinguish it from the `headers` property of the `webResponse` definition.
+// Property names are kept as-is.
+//
+// Enum value names are converted from camelCase to PascalCase and prefixed with `k`.
+//
+
+#include <cstdint>
+#include <forward_list>
+#include <optional>
+#include <string>
+#include <unordered_map>
+
+#include <nlohmann/json.hpp>
+
+namespace gap::sarif
+{
+    using json = nlohmann::json;""", file=output_header)
+            for elem in output:  # header: one declaration per collected element
+                if isinstance(elem, ForwardRef):
+                    print(f"    struct {get_type_name(elem)};", file=output_header)
+                if isinstance(elem, Struct):
+                    print(file=output_header)
+                    if elem.description is not None:  # same guard as for field descriptions
+                        print("    //", file=output_header)
+                        print(f"    // {elem.description}", file=output_header)
+                        print("    //", file=output_header)
+                    print(f"    struct {get_type_name(elem)} {{", end="", file=output_header)
+                    for field in elem.fields:
+                        print(file=output_header)
+                        if field.description is not None:
+                            print("        //", file=output_header)
+                            print(f"        // {field.description}", file=output_header)
+                            print("        //", file=output_header)
+                        if field.required:
+                            print(f"        {get_type_name(field.type, field.name)} {field.name};", file=output_header)
+                        elif field.default is not None:
+                            print(f"        {get_type_name(field.type, field.name)} {field.name} = {get_default_value(field)};", file=output_header)
+                        elif isinstance(field.type, Array):
+                            print(f"        {get_type_name(field.type, field.name)} {field.name} = {{}};", file=output_header)
+                        else:
+                            print(f"        std::optional< {get_type_name(field.type, field.name)} > {field.name} = std::nullopt;", file=output_header)
+                    if elem.additional_props is not None:
+                        print(file=output_header)
+                        if elem.additional_props == "json":
+                            print("        json additional_properties;", file=output_header)
+                        else:
+                            print(f"        std::unordered_map< std::string, {get_type_name(elem.additional_props)} > additional_properties;", file=output_header)
+
+                    print("    };", file=output_header)
+                    print(file=output_header)
+                    print(f"    void to_json(json &, const {get_type_name(elem)} &);", file=output_header)
+                    print(f"    void from_json(const json &, {get_type_name(elem)} &);", file=output_header)
+                if isinstance(elem, Enum):
+                    print(file=output_header)
+                    print(f"    enum class {get_type_name(elem)} {{", file=output_header)
+                    for value in elem.values:
+                        value_name = sanitize(to_pascal_case(value))
+                        print(f"        k{value_name},", file=output_header)
+                    print("    };", file=output_header)
+                    print(file=output_header)
+                    print(f"    NLOHMANN_JSON_SERIALIZE_ENUM({get_type_name(elem)}, {{", file=output_header)
+                    for value in elem.values:
+                        value_name = sanitize(to_pascal_case(value))
+                        print(f"        {{ {get_type_name(elem)}::k{value_name}, \"{value}\" }},", file=output_header)
+                    print("    })", file=output_header)
+            print("} // namespace gap::sarif", file=output_header)
+
+        with open(args.output_source, mode="w", encoding="utf-8") as output_source:
+            print("""// Copyright (c) 2024-present, Trail of Bits, Inc.
+
+#include <gap/sarif/sarif.hpp>
+
+namespace gap::sarif {""", file=output_source)
+            for elem in output:  # source: from_json/to_json bodies, structs only
+                if not isinstance(elem, Struct):
+                    continue
+                print(f"    void from_json(const json &j, {get_type_name(elem)} &o) {{", file=output_source)
+                print("        for( auto &[key, val] : j.items() ) {", file=output_source)
+                for field in elem.fields:
+                    print(f"            if ( key == \"{field.name}\" ) {{", file=output_source)
+                    if field.required or field.default is not None:
+                        print(f"                val.get_to(o.{field.name});", file=output_source)
+                    else:
+                        print(f"                {get_type_name(field.type)} field;", file=output_source)
+                        print("                val.get_to(field);", file=output_source)
+                        print(f"                o.{field.name} = field;", file=output_source)
+                    print("                continue;", file=output_source)
+                    print("            }", file=output_source)
+                if elem.additional_props is not None:
+                    print("            val.get_to(o.additional_properties[key]);", file=output_source)
+                print("        }", file=output_source)
+                print("    }", file=output_source)
+
+                print(f"    void to_json(json &j, const {get_type_name(elem)} &o) {{", file=output_source)
+                for field in elem.fields:
+                    if field.required:
+                        print(f"        j[\"{field.name}\"] = o.{field.name};", file=output_source)
+                    elif field.default is not None:
+                        if isinstance(field.default, list):
+                            if len(field.default) == 0:
+                                print(f"        if ( !o.{field.name}.empty() ) {{", file=output_source)
+                            else:
+                                print(f"        if ( o.{field.name} != decltype(o.{field.name}){get_default_value(field)} ) {{", file=output_source)
+                        else:
+                            print(f"        if ( o.{field.name} != {get_default_value(field)} ) {{", file=output_source)
+                        print(f"            j[\"{field.name}\"] = o.{field.name};", file=output_source)
+                        print("        }", file=output_source)
+                    elif isinstance(field.type, Array):
+                        print(f"        if ( !o.{field.name}.empty() ) {{", file=output_source)
+                        print(f"            j[\"{field.name}\"] = o.{field.name};", file=output_source)
+                        print("        }", file=output_source)
+                    else:
+                        print(f"        if ( o.{field.name}.has_value() ) {{", file=output_source)
+                        print(f"            j[\"{field.name}\"] = *o.{field.name};", file=output_source)
+                        print("        }", file=output_source)
+                if elem.additional_props is not None:
+                    if elem.additional_props == "json":
+                        print("        for ( auto &[key, val] : o.additional_properties.items() ) {", file=output_source)
+                    else:
+                        print("        for ( auto &[key, val] : o.additional_properties ) {", file=output_source)
+                    print("            j[key] = val;", file=output_source)
+                    print("        }", file=output_source)
+                print("    }", file=output_source)
+            print("} // namespace gap::sarif", file=output_source)
diff --git a/sarif/scripts/sarif-headergen/src/sarif_headergen/__init__.py b/sarif/scripts/sarif-headergen/src/sarif_headergen/__init__.py
new file mode 100644
index 0000000..923b5d0
--- /dev/null
+++ b/sarif/scripts/sarif-headergen/src/sarif_headergen/__init__.py
@@ -0,0 +1,3 @@
+from .schema import Schema
+# __all__ entries must be strings; a class object here makes a star-import raise TypeError.
+__all__ = ["Schema"]
diff --git a/sarif/scripts/sarif-headergen/src/sarif_headergen/output.py b/sarif/scripts/sarif-headergen/src/sarif_headergen/output.py
new file mode 100644
index 0000000..b3dbcc5
--- /dev/null
+++ b/sarif/scripts/sarif-headergen/src/sarif_headergen/output.py
@@ -0,0 +1,138 @@
+from typing import Literal
+from functools import cache
+from dataclasses import dataclass
+
+from .schema import Schema
+
+type FieldType = (  # either a primitive name (plus "json" for free-form values) or a composite descriptor
+    Literal["string", "number", "boolean", "integer", "json"] | "Array" | "Struct" | "Enum" | "ForwardRef" | "Map"
+)
+
+@dataclass(frozen=True)
+class Array:  # a schema of type "array"
+    subtype: FieldType  # type of the array's items
+
+
+@dataclass(frozen=True)
+class Field:  # one property of an object schema
+    name: str  # property name, exactly as spelled in the schema
+    type: FieldType
+    description: str | None  # the property's "description", or None when it has none
+    required: bool  # True when the enclosing object's "required" list names this property
+    default: str | int | float | bool | list | None  # the property's "default", or None when absent
+
+
+@dataclass(frozen=True)
+class Enum:  # enumeration built from a schema's "enum" keyword
+    name: str
+    values: list[str]  # the "enum" list, unchanged
+
+
+@dataclass(frozen=True)
+class Struct:  # an object schema together with its properties
+    name: str
+    fields: list[Field]
+    description: str | None
+    additional_props: FieldType | None  # "json" for additionalProperties: true, a type for a sub-schema, else None
+
+@dataclass(frozen=True)
+class ForwardRef:  # placeholder for a definition referenced before it has been built
+    name: str  # definition name: the "$ref" without its "#/definitions/" prefix
+
+@dataclass(frozen=True)
+class Map:  # object with no declared properties whose additionalProperties is a sub-schema
+    subtype: FieldType  # type of every value in the map
+
+
+def get_structs(schema: Schema) -> list[Struct | Enum | ForwardRef]:
+    """Flatten *schema* into the forward refs, enums and structs to emit, in creation order."""
+    assert "definitions" in schema
+
+    res: list[Struct | Enum | ForwardRef] = []  # declarations in the order they were created
+    defs: dict[str, FieldType] = {}  # most recent declaration registered under each name
+    definitions = schema["definitions"]
+
+    def get_fwd_ref(struct: str | None, name: str, field: Schema) -> FieldType:
+        # Resolve an array's item type; a $ref not registered yet becomes a ForwardRef.
+        if "$ref" in field:
+            ref_name = field["$ref"][len("#/definitions/") :]
+            if ref_name in defs:
+                return defs[ref_name]
+            fwd = ForwardRef(ref_name)
+            res.append(fwd)
+            defs[ref_name] = fwd
+            return fwd
+        return get_field(struct, name, field)
+
+    def get_field(struct: str | None, name: str, field: Schema) -> FieldType:
+        if "$ref" in field:
+            return get_ref(None, field["$ref"])
+
+        if "type" in field:
+            if field["type"] in ("boolean", "number", "string", "integer"):
+                return field["type"]
+            if field["type"] == "array":
+                items = field["items"]
+                return Array(subtype=get_fwd_ref(None, name, items))
+            if field["type"] == "object":
+                if struct is not None and "properties" not in field and "additionalProperties" in field and field["additionalProperties"] is not False:
+                    if field["additionalProperties"] is not True:
+                        return Map(subtype=get_field(struct, name, field["additionalProperties"]))
+                    else:
+                        return "json"
+                return make_struct(f"{struct}_{name}" if struct is not None else name, field)
+            raise ValueError(f"Invalid type {field['type']!r} for {name!r}")
+        # NOTE(review): enums are keyed by field name only, so same-named enum fields share one type.
+        if "enum" in field:
+            if name in defs:
+                return defs[name]
+            enum = Enum(name, field["enum"])
+            res.append(enum)
+            defs[name] = enum
+            return enum
+
+        raise ValueError(f"Unknown field type for {name!r}: {field!r}")
+
+    @cache
+    def get_ref(struct: str | None, ref: str) -> FieldType:
+        assert ref.startswith("#/definitions/")
+        name = ref[len("#/definitions/") :]
+        return get_field(struct, name, definitions[name])
+
+    def make_struct(name: str, obj: Schema) -> Struct:
+        assert obj["type"] == "object"
+        additional_props: FieldType | None = None
+        if "additionalProperties" in obj:
+            if obj["additionalProperties"] is True:
+                additional_props = "json"
+            elif obj["additionalProperties"] is not False:
+                additional_props = get_field(name, "additional_properties", obj["additionalProperties"])
+        fields: list[Field] = []
+        if "properties" in obj:
+            fields = [
+                Field(
+                    name=field_name,
+                    type=get_field(name, field_name, field_type),
+                    description=field_type.get("description", None),
+                    required="required" in obj and field_name in obj["required"],
+                    default=field_type.get("default", None),
+                )
+                for field_name, field_type in obj["properties"].items()
+                if field_name != "$schema"
+            ]
+        struct = Struct(
+            name,
+            fields=fields,
+            description=obj.get("description", None),
+            additional_props=additional_props,
+        )
+        res.append(struct)
+        defs[name] = struct
+        return struct
+
+    make_struct("root", schema)
+    # Visit every definition; get_ref is memoized, so one already resolved is not rebuilt.
+    for name in definitions:
+        get_ref(None, f"#/definitions/{name}")
+
+    return res
diff --git a/sarif/scripts/sarif-headergen/src/sarif_headergen/schema.py b/sarif/scripts/sarif-headergen/src/sarif_headergen/schema.py
new file mode 100644
index 0000000..c3d02dc
--- /dev/null
+++ b/sarif/scripts/sarif-headergen/src/sarif_headergen/schema.py
@@ -0,0 +1,30 @@
+from typing import TypedDict, NotRequired, Union
+from enum import Enum
+
+
+class Type(str, Enum):  # type names usable as the value of a schema's "type" key
+    null = "null"
+    boolean = "boolean"
+    object = "object"
+    array = "array"
+    number = "number"
+    string = "string"
+    integer = "integer"
+
+
+Schema = TypedDict(  # functional form, since "$ref" is not a valid Python identifier
+    "Schema",
+    {
+        "title": NotRequired[str],
+        "description": NotRequired[str],
+        "type": NotRequired[Type],
+        "$ref": NotRequired[str],  # reference to another schema
+        "items": NotRequired["Schema"],  # schema of the elements of an array
+        "properties": NotRequired[dict[str, "Schema"]],  # property name -> schema
+        "additionalProperties": NotRequired[Union["Schema", bool]],  # a boolean or a schema
+        "definitions": NotRequired[dict[str, "Schema"]],  # definition name -> schema
+        "enum": NotRequired[list[str]],
+        "required": NotRequired[list[str]],  # names of the required properties
+        "default": NotRequired[str | int | float | bool | list],
+    },
+)