Keep the resource and property names in a compressed binary file
Of the compression options tested, zstd at level 15 had one of the fastest decompression speeds together with the best compression ratio.
henriquegemignani committed Jul 21, 2023
1 parent a8ebbe4 commit d5d6f27
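A rough way to re-check that trade-off locally; this is only a sketch, assuming the `zstd` PyPI package and the repository's bundled dread_resource_names.json, with arbitrarily chosen levels to compare:

```python
# Sketch: compare compressed size and decompression time for a few zstd levels.
import time
from pathlib import Path

import zstd

raw = Path("src/mercury_engine_data_structures/dread_resource_names.json").read_bytes()

for level in (3, 15, 22):
    blob = zstd.compress(raw, level)
    start = time.perf_counter()
    for _ in range(100):
        zstd.decompress(blob)
    per_call_ms = (time.perf_counter() - start) * 1000 / 100
    print(f"level {level:>2}: {len(blob):>8} bytes, {per_call_ms:.2f} ms/decompress")
```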
Showing 8 changed files with 151 additions and 6 deletions.
3 changes: 2 additions & 1 deletion .gitignore
@@ -131,9 +131,10 @@ dmypy.json
# PyCharm
/.idea

# Version
# Generated files
src/mercury_engine_data_structures/version.py
src/mercury_engine_data_structures/formats/dread_types.py
src/mercury_engine_data_structures/*.bin

# Deny certain files at root
/*.txt
1 change: 1 addition & 0 deletions MANIFEST.in
@@ -0,0 +1 @@
include
4 changes: 3 additions & 1 deletion pyproject.toml
@@ -4,6 +4,7 @@ requires = [
"setuptools_scm[toml]>=3.4.3",
"wheel>=0.37.0",
"construct>=2.10.0",
"zstd",
]
build-backend = "setuptools.build_meta"

@@ -23,7 +24,8 @@ requires-python = ">=3.8"
dynamic = ["version"]

dependencies = [
"construct>=2.10.0"
"construct>=2.10.0",
"zstd",
]


9 changes: 9 additions & 0 deletions setup.cfg
@@ -0,0 +1,9 @@
[options.package_data]
mercury_engine_data_structures =
    *.bin

[options.exclude_package_data]
mercury_engine_data_structures =
    dread_property_names.json
    dread_resource_names.json
    sr_resource_names.json
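A quick way to confirm the packaging rules do what they claim; a sketch under the assumption that a wheel has already been built into dist/:

```python
# Sketch: inspect a built wheel to confirm the .bin tables are packaged and the
# excluded .json name tables are not.
import glob
import zipfile

wheel_path = glob.glob("dist/mercury_engine_data_structures-*.whl")[0]
names = zipfile.ZipFile(wheel_path).namelist()

assert any(n.endswith("dread_resource_names.bin") for n in names)
assert not any(n.endswith("dread_resource_names.json") for n in names)
```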
50 changes: 50 additions & 0 deletions src/mercury_engine_data_structures/_dread_data_construct.py
@@ -0,0 +1,50 @@
import struct
import typing

import construct


class CompressedZSTD(construct.Tunnel):
    def __init__(self, subcon, level: int = 3):
        super().__init__(subcon)
        import zstd
        self.lib = zstd
        self.level = level

    def _decode(self, data, context, path):
        return self.lib.decompress(data)

    def _encode(self, data, context, path):
        return self.lib.compress(data, self.level)


class HashesDict(construct.Construct):
    def __init__(self):
        super().__init__()
        self._build_construct = construct.PrefixedArray(
            construct.Int32un,
            construct.Sequence(
                construct.PascalString(construct.Int16un, "ascii"),  # key
                construct.Int64un,  # hash
            )
        )

    def _parse(self, stream, context, path) -> typing.Dict[str, int]:
        key_size_struct = struct.Struct("=H")
        value_size_struct = struct.Struct("=Q")

        count = construct.Int32un._parse(stream, None, "")

        result = {}
        for _ in range(count):
            key = stream.read(key_size_struct.unpack(stream.read(2))[0]).decode()
            value = value_size_struct.unpack(stream.read(8))[0]
            result[key] = value

        return result

    def _build(self, obj: typing.Dict[str, int], stream, context, path):
        return self._build_construct._build(list(obj.items()), stream, context, path)


KnownHashes = CompressedZSTD(HashesDict(), 15)
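A minimal round-trip sketch for the new KnownHashes construct; the resource path and hash below are placeholders, not real game data:

```python
# Sketch: build a compressed name -> hash table from a dict and parse it back.
from mercury_engine_data_structures._dread_data_construct import KnownHashes

names = {"actors/props/example/example.bmsad": 0x1234567890ABCDEF}  # placeholder entry

blob = KnownHashes.build(names)            # zstd-compressed count + (key, hash) pairs
assert dict(KnownHashes.parse(blob)) == names
```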
18 changes: 15 additions & 3 deletions src/mercury_engine_data_structures/dread_data.py
@@ -4,17 +4,25 @@
from pathlib import Path
from typing import Dict, Optional

from mercury_engine_data_structures._dread_data_construct import KnownHashes

_root = Path(__file__).parent


@functools.lru_cache
def get_raw_types() -> Dict[str, typing.Any]:
    path = Path(__file__).parent.joinpath("dread_types.json")
    path = _root.joinpath("dread_types.json")
    with path.open() as f:
        return json.load(f)


@functools.lru_cache
def all_name_to_asset_id() -> Dict[str, int]:
    path = Path(__file__).parent.joinpath("dread_resource_names.json")
    bin_path = _root.joinpath("dread_resource_names.bin")
    if bin_path.exists():
        return dict(KnownHashes.parse_file(bin_path))

    path = _root.joinpath("dread_resource_names.json")
    with path.open() as names_file:
        return json.load(names_file)

@@ -33,7 +41,11 @@ def name_for_asset_id(asset_id: int) -> Optional[str]:

@functools.lru_cache
def all_name_to_property_id() -> Dict[str, int]:
    path = Path(__file__).parent.joinpath("dread_property_names.json")
    bin_path = _root.joinpath("dread_property_names.bin")
    if bin_path.exists():
        return dict(KnownHashes.parse_file(bin_path))

    path = _root.joinpath("dread_property_names.json")
    with path.open() as names_file:
        return json.load(names_file)

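With this change the lookups prefer the generated .bin table and quietly fall back to the JSON file, so calling code is unaffected. A usage sketch (the resource name is hypothetical):

```python
# Sketch: the cached lookups behave the same whether the .bin or .json source is used.
from mercury_engine_data_structures import dread_data

names = dread_data.all_name_to_asset_id()          # loaded once, cached by lru_cache
asset_id = names["system/example/example.bmsad"]   # hypothetical resource name
assert dread_data.name_for_asset_id(asset_id) == "system/example/example.bmsad"
```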
12 changes: 12 additions & 0 deletions src/mercury_engine_data_structures/samus_returns_data.py
@@ -4,6 +4,10 @@
from pathlib import Path
from typing import Dict, Optional

from mercury_engine_data_structures._dread_data_construct import KnownHashes

_root = Path(__file__).parent


@functools.lru_cache
def get_raw_types() -> Dict[str, typing.Any]:
@@ -12,6 +16,10 @@ def get_raw_types() -> Dict[str, typing.Any]:

@functools.lru_cache
def all_name_to_asset_id() -> Dict[str, int]:
    bin_path = _root.joinpath("sr_resource_names.bin")
    if bin_path.exists():
        return dict(KnownHashes.parse_file(bin_path))

    path = Path(__file__).parent.joinpath("sr_resource_names.json")

    with path.open() as names_file:
@@ -32,6 +40,10 @@ def name_for_asset_id(asset_id: int) -> Optional[str]:

@functools.lru_cache
def all_name_to_property_id() -> Dict[str, int]:
    bin_path = _root.joinpath("sr_property_names.bin")
    if bin_path.exists():
        return dict(KnownHashes.parse_file(bin_path))

    path = Path(__file__).parent.joinpath("sr_property_names.json")
    with path.open() as names_file:
        return json.load(names_file)
60 changes: 59 additions & 1 deletion tools/create_class_definitions.py
@@ -1,6 +1,8 @@
import collections
import copy
import io
import json
import struct
from pathlib import Path

import construct
@@ -16,6 +18,11 @@
type_lib = construct.Container(dread_types=dread_types)
exec(compile(type_lib_source, type_lib_path, "exec"), type_lib)

dread_data_construct_path = meds_root.joinpath("_dread_data_construct.py")
dread_data_construct = construct.Container()
exec(compile(dread_data_construct_path.read_text(), dread_data_construct_path, "exec"), dread_data_construct)


primitive_to_construct = {
    type_lib.PrimitiveKind.VECTOR_2: "common_types.CVector2D",
    type_lib.PrimitiveKind.VECTOR_3: "common_types.CVector3D",
@@ -57,7 +64,8 @@ def children_for(self, type_name: str, recursive: bool = True):
                    yield from self.children_for(child)

    def _debug(self, msg: str):
        print(" " * len(self._types_being_exported) + f"* {msg}")
        pass
        # print(" " * len(self._types_being_exported) + f"* {msg}")

    def _export_enum_type(self, type_variable: str, type_name: str):
        data = self.all_types[type_name]
@@ -224,6 +232,47 @@ def export_code(self):
        return code


class CompressedZSTD(construct.Tunnel):
    def __init__(self, subcon, level: int = 3):
        super().__init__(subcon)
        import zstd
        self.lib = zstd
        self.level = level

    def _decode(self, data, context, path):
        return self.lib.decompress(data)

    def _encode(self, data, context, path):
        return self.lib.compress(data, self.level)


def raw_data():
    return construct.PrefixedArray(
        construct.Int32un,
        construct.Sequence(
            construct.PascalString(construct.Int16un, "ascii"),  # key
            construct.Int64un,  # hash
        )
    )


def parse(data: bytes):
    stream = io.BytesIO(data)

    key_size_struct = struct.Struct("=H")
    value_size_struct = struct.Struct("=Q")

    count = construct.Int32un._parse(stream, None, "")

    result = {}
    for _ in range(count):
        key = stream.read(key_size_struct.unpack(stream.read(2))[0]).decode()
        value = value_size_struct.unpack(stream.read(8))[0]
        result[key] = value

    return result


def main():
    output_path = meds_root.joinpath("formats", "dread_types.py")

@@ -235,6 +284,15 @@ def main():

    output_path.write_text(type_exporter.export_code())

    for file_name in ["dread_resource_names", "dread_property_names", "sr_resource_names"]:
        with meds_root.joinpath(f"{file_name}.json").open() as f:
            file_data: dict[str, int] = json.load(f)

        dread_data_construct.KnownHashes.build_file(
            file_data,
            meds_root.joinpath(f"{file_name}.bin")
        )


if __name__ == '__main__':
    main()
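A small follow-up check, not part of this commit, to confirm each generated .bin decodes back to exactly the JSON it was built from; the src/ path assumes the repository layout used above:

```python
# Sketch: verify the generated binary tables round-trip against their JSON sources.
import json
from pathlib import Path

from mercury_engine_data_structures._dread_data_construct import KnownHashes

meds_root = Path("src/mercury_engine_data_structures")  # assumed repository layout

for file_name in ["dread_resource_names", "dread_property_names", "sr_resource_names"]:
    expected = json.loads(meds_root.joinpath(f"{file_name}.json").read_text())
    actual = dict(KnownHashes.parse_file(meds_root.joinpath(f"{file_name}.bin")))
    assert actual == expected, file_name
```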
