From ca8c07281b1a52f385bcb6b32aeaa4e5350993bb Mon Sep 17 00:00:00 2001
From: Dan Allan <dallan@bnl.gov>
Date: Tue, 9 Apr 2024 13:32:15 -0400
Subject: [PATCH 1/5] Add explanatory docs on catalog database

---
 docs/source/explanations/catalog.md | 278 ++++++++++++++++++++++++++++
 docs/source/index.md                |   1 +
 2 files changed, 279 insertions(+)
 create mode 100644 docs/source/explanations/catalog.md

diff --git a/docs/source/explanations/catalog.md b/docs/source/explanations/catalog.md
new file mode 100644
index 000000000..d90a369bf
--- /dev/null
+++ b/docs/source/explanations/catalog.md
@@ -0,0 +1,278 @@
+# Catalog Database
+
+The Catalog database is a SQL database of information describing data: its
+name, metadata, structure, format, and location.
+
+## Nodes
+
+The `nodes` table is the _logical_ view of the data, the way that Tiled
+presents the data to clients. Each row represents one node in the logical
+"tree" of data represented by Tiled.
+
+- `metadata` --- user-controlled JSON object, with arbitrary metadata
+- `ancestors` and `key` --- together specify the unique path of the data
+- `structure_family` --- enum of structure types (`"container"`, `"array"`, `"table"`, ...)
+- `specs` --- user-controlled JSON list of specs, such as `[{"name": "XDI", "version": "1"}]`
+- `id` an internal integer primary key, not exposed by the API
+- `time_created` and `time_updated` --- for forensics, not exposed by the API
+
+## Data Source
+
+Each Data Source is associated with one Node. Together, `data_sources`, `structures`,
+and `assets` describe the format, structure, and location of the data.
+
+- `mimetype` --- MIME type string describing the format, such as `"text/csv"`
+  (This is used by Tiled to identify a suitable Adapter to read this data.)
+- `parameters` --- JSON object with additional parameters that will be passed
+  to the Adapter
+- `management` --- enum indicating whether the data is registered `"external"` data
+  or `"writable"` data managed by Tiled
+- `structre_family` --- enum of structure types (`"container"`, `"array"`, `"table"`, ...)
+- `structure_id` --- a foreign key to the `structures` table
+- `node_id` --- foreign key to `nodes`
+- `id` --- integer primary key
+- `time_created` and `time_updated` --- for forensics, not exposed by the API
+
+## Structure
+
+Each Data Source references exactly one Structure.
+
+- `structure` --- JSON object describing the structure
+- `id` --- MD5 hash of the
+  [RFC 8785](https://www.rfc-editor.org/rfc/rfc8785) canonical JSON of the structure
+
+
+## Asset
+
+- `data_uri` --- location of data, given as `file://localhost/PATH`
+  (It is planned to extend to schemes other than `file`, such as `s3`, in the
+  future.)
+- `is_directory` --- boolean
+- `hash_type` and `hash_content` --- not yet implemented (i.e. always NULL) but
+  intended for content verification
+- `size` --- not yet implemented (i.e. always NULL) but intended to support
+  fast queries for data size estimation
+- `id` --- integer primary key
+- `time_created` and `time_updated` --- for forensics, not exposed by the API
+
+## Data Source Asset Relation
+
+Assets and Data Sources have a many-to-many relation. The
+`data_source_asset_assocation` table is best described by the example below.
+
+- `data_source_id`, `asset_id` --- foreign keys
+- `parameter` --- the name of the parameter this Asset should be passed to
+- `num` --- the position of this item in a list
+
+If `parameter` is NULL, the Asset is a supporting file, not passed directly to
+the Adapter.
+
+If `num` is NULL, the Adapter will be passed a scalar value. If `num` is an
+integer, the Adapter will be passed a list sorted by `num`.
+
+Database triggers are used to ensure self-consistency.
+
+### Single HDF5 file
+
+This is a simple example: one Data Source and one associated Asset.
+
+```sql
+select id, mimetype, parameters from data_sources;
+```
+
+id | mimetype | parameters |
+-- | -- | --
+1 | "application/x-hdf5" | {"smwr": true} | NULL
+
+
+```sql
+select id, data_uri, is_directory from assets;
+```
+
+id | data_uri | is_directory
+-- | -- | --
+1 | "file://localhost/path/to/data.h5" | false
+
+The HDF5 Adapter takes one HDF5 file passed to the argument
+named `data_uri`, so the Asset is given parameter `"data_uri"`
+and num `NULL`.
+
+```sql
+select * from data_source_asset_association;
+```
+
+data_source_id | asset_id | parameter | num
+-- | -- | -- | --
+1 | 1 | "data_uri" | NULL
+
+### Single Zarr directory
+
+This is similar. A single Zarr dataset is backed by a directory, not a
+file. The internal structure of the directory is managed by Zarr, not by the
+user, so Tiled can simply track the whole directory as a unit, not each
+individual file.
+
+```sql
+select id, mimetype, parameters from data_sources;
+```
+
+id | mimetype | parameters |
+-- | -- | --
+1 | "application/x-zarr" | {} | NULL
+
+
+```sql
+select id, data_uri, is_directory from assets;
+```
+
+id | data_uri | is_directory
+-- | -- | --
+1 | "file://localhost/path/to/data.zarr" | true
+
+(Notice is_directory is `true`.)
+
+```sql
+select * from data_source_asset_assocation
+```
+
+data_source_id | asset_id | parameter | num
+-- | -- | -- | --
+1 | 1 | "data_uri" | NULL
+
+### Single TIFF Image
+
+This is another simple example, very much like the HDF5 example.
+
+```sql
+select id, mimetype, parameters from data_sources;
+```
+
+id | mimetype | parameters |
+-- | -- | --
+1 | "image/tiff" | {}
+
+
+```sql
+select id, data_uri, is_directory from assets;
+```
+
+id | data_uri | is_directory
+-- | -- | --
+1 | "file://localhost/path/to/image.tiff" | false
+
+```sql
+select * from data_source_asset_association;
+```
+
+data_source_id | asset_id | parameter | num
+-- | -- | -- | --
+1 | 1 | "data_uri" | NULL
+
+### TIFF sequence
+
+Now we have a sequence of separate TIFF files (`image00001.tiff`,
+`image00002.tiff`, ...) that we want to treat as a single Data Source.
+
+```sql
+select id, mimetype, parameters from data_sources;
+```
+
+id | mimetype | parameters |
+-- | -- | --
+1 | "multipart/related;type=image/tiff" | {} | NULL
+
+The MIME type `multipart/related;type=image/tiff` is registered to an Adapter
+that expects a _sequence_ of TIFF files.
+
+```sql
+select data_uri, is_diretory from assets
+```
+
+id | data_uri | is_directory
+-- | -- | --
+1 | "file://localhost/path/to/image00001.tiff" | false
+2 | "file://localhost/path/to/image00002.tiff" | false
+3 | "file://localhost/path/to/image00003.tiff" | false
+
+```sql
+select * from data_source_asset_association;
+```
+
+data_source_id | asset_id | parameter | num
+-- | -- | -- | --
+1 | 1 | "data_uris" | 0
+1 | 2 | "data_uris" | 1
+1 | 3 | "data_uris" | 2
+
+### Single CSV file
+
+The CSV Adapter is designed to accept multiple CSV files, each
+representing a batch (a.k.a. partition) of rows. Here there is just one.
+
+```sql
+select id, mimetype, parameters from data_sources;
+```
+
+id | mimetype | parameters |
+-- | -- | --
+1 | "text/csv" | {}
+
+
+```sql
+select id, data_uri, is_directory from assets;
+```
+
+id | data_uri | is_directory
+-- | -- | --
+1 | "file://localhost/path/to/table.csv" | false
+
+The CSV Adapter takes one or more CSV files passed as a list to the
+argument named `data_uris`, so the Asset is given parameter
+`data_uris` and num `0`.
+
+```sql
+select * from data_source_asset_association;
+```
+
+data_source_id | asset_id | parameter | num
+-- | -- | -- | --
+1 | 1 | "data_uris" | 0
+
+### HDF5 file with virtual datasets
+
+Here is an example where we set parameter to NULL.
+
+```sql
+select id, mimetype, parameters from data_sources;
+```
+
+id | mimetype | parameters |
+-- | -- | --
+1 | "application/x-hdf5" | {} | NULL
+
+
+```sql
+select id, data_uri, is_directory from assets;
+```
+
+id | data_uri | is_directory
+-- | -- | --
+1 | "file://localhost/path/to/master.h5" | false
+2 | "file://localhost/path/to/data00001.h5" | false
+3 | "file://localhost/path/to/data00002.h5" | false
+4 | "file://localhost/path/to/data00003.h5" | false
+
+The HDF5 Adapter takes only the master file, passed to the argument named
+`data_uri`. The other files are tracked as supporting Assets, with parameter
+`NULL` and num `NULL`, and are not passed directly to the Adapter.
+
+```sql
+select * from data_source_asset_association;
+```
+
+data_source_id | asset_id | parameter | num
+-- | -- | -- | --
+1 | 1 | "data_uri" | NULL
+1 | 2 | NULL | NULL
+1 | 3 | NULL | NULL
+1 | 4 | NULL | NULL
diff --git a/docs/source/index.md b/docs/source/index.md
index 1fe5b8433..2940cfd3f 100644
--- a/docs/source/index.md
+++ b/docs/source/index.md
@@ -44,6 +44,7 @@ explanations/compression
 explanations/specialized-formats
 explanations/caching
 explanations/access-control
+explanations/catalog
 explanations/faq
 explanations/lineage
 ```

From 5cd0adf50668c9a2f758e70fac68468007109e5c Mon Sep 17 00:00:00 2001
From: Dan Allan <dallan@bnl.gov>
Date: Tue, 9 Apr 2024 13:55:12 -0400
Subject: [PATCH 2/5] Cover the revisions table

---
 docs/source/explanations/catalog.md | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/docs/source/explanations/catalog.md b/docs/source/explanations/catalog.md
index d90a369bf..de64c3608 100644
--- a/docs/source/explanations/catalog.md
+++ b/docs/source/explanations/catalog.md
@@ -276,3 +276,16 @@ data_source_id | asset_id | parameter | num
 1 | 2 | NULL | NULL
 1 | 3 | NULL | NULL
 1 | 4 | NULL | NULL
+
+## Revisions
+
+The `revisions` table stores snapshots of Node `metadata` and `specs`. When an
+update is made, the row in the `nodes` table is updated and a _copy_ with the
+original content is inserted in the `revisions` table.
+
+- `node_id` --- foreign key to the node
+- `revision_number` --- integer counting revisions of this node from 1
+- `metadata` --- snapshot of node metadata
+- `specs` --- snapshot of node specs
+- `id` --- an internal integer primary key, not exposed by the API
+- `time_created` and `time_updated` --- for forensics, not exposed by the API

From 49becf815cfe3e5658e42327027f5f63c342671c Mon Sep 17 00:00:00 2001
From: Dan Allan <daniel.b.allan@gmail.com>
Date: Wed, 10 Apr 2024 09:30:42 -0400
Subject: [PATCH 3/5] Apply suggestions from code review

Co-authored-by: Eugene <ymatviych@bnl.gov>
---
 docs/source/explanations/catalog.md | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/docs/source/explanations/catalog.md b/docs/source/explanations/catalog.md
index de64c3608..fc255ddb5 100644
--- a/docs/source/explanations/catalog.md
+++ b/docs/source/explanations/catalog.md
@@ -27,7 +27,7 @@ and `assets`, describes the format,  structure, and location of the data.
   to the Adapter
 - `management` --- enum indicating whether the data is registered `"external"` data
   or `"writable"` data managed by Tiled
-- `structre_family` --- enum of structure types (`"container"`, `"array"`, `"table"`, ...)
+- `structure_family` --- enum of structure types (`"container"`, `"array"`, `"table"`, ...)
 - `structure_id` --- a foreign key to the `structures` table
 - `node_id` --- foreign key to `nodes`
 - `id` --- integer primary key
@@ -71,7 +71,7 @@ If `num` is NULL, the Adapter will be passed a scalar value. If `num` is an
 integer, the Adapter will be passed a list sorted by `num`.
 
 Database triggers are used to ensure self-consistency.
-
+`time_created` and `time_updated` contain timestamps related to the corresponding DB entry (Node, Data Source, Asset), and not the underlying data files.
 ### Single HDF5 file
 
 This is a simple example: one Data Source and one associated Asset.
@@ -82,7 +82,7 @@ select id, mimetype, parameters from data_sources;
 
 id | mimetype | parameters |
 -- | -- | --
-1 | "application/x-hdf5" | {"smwr": true} | NULL
+1 | "application/x-hdf5" | {"smwr": true}
 
 
 ```sql
@@ -118,7 +118,7 @@ select id, mimetype, parameters from data_sources;
 
 id | mimetype | parameters |
 -- | -- | --
-1 | "application/x-zarr" | {} | NULL
+1 | "application/x-zarr" | {}
 
 
 ```sql
@@ -129,7 +129,7 @@ id | data_uri | is_directory
 -- | -- | --
 1 | "file://localhost/path/to/data.zarr" | true
 
-(Notice is_directory is `true`.)
+(Notice `is_directory` is `true`.)
 
 ```sql
 select * from data_source_asset_assocation
@@ -179,10 +179,10 @@ select id, mimetype, parameters from data_sources;
 
 id | mimetype | parameters |
 -- | -- | --
-1 | "multipart/related;type=image/tiff" | {} | NULL
+1 | "multipart/related;type=image/tiff" | {}
 
 The MIME type `multipart/related;type=image/tiff` is registered to an Adapter
-that expects a _sequence_ of TIFF files.
+that expects a _sequence_ of TIFF files, e.g. `TiffSequenceAdapter`.
 
 ```sql
 select data_uri, is_diretory from assets
@@ -248,7 +248,7 @@ select id, mimetype, parameters from data_sources;
 
 id | mimetype | parameters |
 -- | -- | --
-1 | "application/x-hdf5" | {} | NULL
+1 | "application/x-hdf5" | {}
 
 
 ```sql

From 27bc69219122e761be20a82c0cc18efbb16f22f7 Mon Sep 17 00:00:00 2001
From: Dan Allan <dallan@bnl.gov>
Date: Wed, 10 Apr 2024 11:17:00 -0400
Subject: [PATCH 4/5] Expand on some aspects.

---
 docs/source/explanations/catalog.md | 22 +++++++++++++++++++---
 1 file changed, 19 insertions(+), 3 deletions(-)

diff --git a/docs/source/explanations/catalog.md b/docs/source/explanations/catalog.md
index fc255ddb5..0829ea022 100644
--- a/docs/source/explanations/catalog.md
+++ b/docs/source/explanations/catalog.md
@@ -16,6 +16,12 @@ presents the data to clients. Each row represents one node in the logical
 - `id` an internal integer primary key, not exposed by the API
 - `time_created` and `time_updated` --- for forensics, not exposed by the API
 
+The `time_created` and `time_updated` columns, which appear in this table and
+others below, contain timestamps related to the corresponding database row
+(Node, Data Source, Asset), not the underlying data files. They should not
+carry a scientific meaning; they are only used for book-keeping, forensics,
+and debugging.
+
 ## Data Source
 
 Each Data Source is associated with one Node. Together, `data_sources`, `structures`,
@@ -47,7 +53,12 @@ Each Data Source references exactly one Structure.
 - `data_uri` --- location of data, given as `file://localhost/PATH`
   (It is planned to extend to schemes other than `file`, such as `s3`, in the
   future.)
-- `is_directory` --- boolean
+- `is_directory` --- boolean: `true` when the Asset being tracked is a
+  directory. This is used for data formats in which the directory structure is
+  an internal detail managed by the I/O library, such as Zarr and TileDB.
+  Otherwise this is `false`, and Tiled tracks each file as an individual Asset,
+  such as each TIFF file in a TIFF sequence, or each HDF5 file in a virtual
+  HDF5 dataset.
 - `hash_type` and `hash_content` --- not yet implemented (i.e. always NULL) but
   intended for content verification
 - `size` --- not yet implemented (i.e. always NULL) but intended to support
@@ -61,7 +72,12 @@ Assets and Data Sources have a many-to-many relation. The
 `data_source_asset_assocation` table is best described by the example below.
 
 - `data_source_id`, `asset_id` --- foreign keys
-- `parameter` --- the name of the parameter this Asset should be passed to
+- `parameter` --- the name of the Tiled Adapter's parameter that this Asset
+  should be passed to, e.g. `"data_uri"` or `"data_uris"`. These can be any
+  string because some Adapters handle a heterogeneous group of Assets, like
+  a combination of an image file and a separate text metadata file, and
+  load them as a unit. The parameter is used to differentiate the various
+  Assets for the Adapter.
 - `num` --- the position of this item in a list
 
 If `parameter` is NULL, the Asset is a supporting file, not passed directly to
@@ -71,7 +87,7 @@ If `num` is NULL, the Adapter will be passed a scalar value. If `num` is an
 integer, the Adapter will be passed a list sorted by `num`.
 
 Database triggers are used to ensure self-consistency.
-`time_created` and `time_updated` contain timestamps related to the corresponding DB entry (Node, Data Source, Asset), and not the underlying data files.
+
 ### Single HDF5 file
 
 This is a simple example: one Data Source and one associated Asset.

From 4cc78e944f72041dccf52a9662890353de0b3df5 Mon Sep 17 00:00:00 2001
From: Dan Allan <dallan@bnl.gov>
Date: Wed, 10 Apr 2024 11:29:13 -0400
Subject: [PATCH 5/5] Add overview and diagram

---
 docs/source/explanations/architecture.md | 16 ++---------
 docs/source/explanations/catalog.md      | 35 ++++++++++++++++++++++--
 2 files changed, 34 insertions(+), 17 deletions(-)

diff --git a/docs/source/explanations/architecture.md b/docs/source/explanations/architecture.md
index 3eb233e9d..92b0210d4 100644
--- a/docs/source/explanations/architecture.md
+++ b/docs/source/explanations/architecture.md
@@ -118,23 +118,11 @@ Not all Tiled servers are configured to use the Catalog:
 But for most standard applications, including serving a directory of files or
 providing a writable data store, the Catalog is used.
 
+See {doc}`catalog` for an explanation of the database.
+
 [FastAPI]: https://fastapi.tiangolo.com/
 [httpx]: https://www.python-httpx.org/
 [Starlette]: https://www.starlette.io/
 [OpenAPI]: https://www.openapis.org/
 [Pydantic]: https://docs.pydantic.dev/
 [content negotiation]: https://developer.mozilla.org/en-US/docs/Web/HTTP/Content_negotiation
-
-(catalog-database)=
-### Catalog Database
-
-```{mermaid}
-erDiagram
-    nodes ||--o{  data_sources : has
-    data_sources ||--o{  data_source_asset_association : has
-    data_source_asset_association }|--|{  assets : has
-    data_sources }|--||   structure : has
-    nodes ||--o{  revisions : has
-    alembic_version
-
-```
diff --git a/docs/source/explanations/catalog.md b/docs/source/explanations/catalog.md
index 0829ea022..7b6f7be3e 100644
--- a/docs/source/explanations/catalog.md
+++ b/docs/source/explanations/catalog.md
@@ -3,6 +3,27 @@
 The Catalog database is a SQL database of information describing data: its
 name, metadata, structure, format, and location.
 
+## Overview
+
+```{mermaid}
+erDiagram
+    nodes ||--o{  data_sources : has
+    data_sources ||--o{  data_source_asset_association : has
+    data_source_asset_association }|--|{  assets : has
+    data_sources }|--||   structures : has
+    nodes ||--o{  revisions : has
+    alembic_version
+
+```
+
+- `nodes` - metadata and logical location of this dataset in Tiled's tree
+- `data_sources` - format and parameters for opening dataset
+- `structures` - description of dataset structure (e.g. shape, chunks, data type, column names, ...)
+- `assets` - location (URI) of data
+- `data_source_asset_association` - many-to-many relation between `data_sources` and `assets`
+- `revisions` - snapshots of revision history of metadata
+- `alembic_version` - version of database schema, to verify compatibility with version of Tiled
+
 ## Nodes
 
 The `nodes` table is the _logical_ view of the data, the way that Tiled
@@ -44,9 +65,7 @@ and `assets`, describes the format,  structure, and location of the data.
 Each Data Source references exactly one Structure.
 
 - `structure` --- JSON object describing the structure
-- `id` --- MD5 hash of the
-  [RFC 8785](https://www.rfc-editor.org/rfc/rfc8785) canonical JSON of the structure
-
+- `id` --- MD5 hash of the [RFC 8785][] canonical JSON of the structure
 
 ## Asset
 
@@ -305,3 +324,13 @@ original content is inserted in the `revisions` table.
 - `specs` --- snapshot of node specs
 - `id` --- an internal integer primary key, not exposed by the API
 - `time_created` and `time_updated` --- for forensics, not exposed by the API
+
+## Alembic Version
+
+The `alembic_version` table is managed by [Alembic][], a SQL migration tool, to
+stamp the current version of the database. The Tiled server checks this at
+startup to ensure that the version of Tiled being used is compatible with the
+version of the database.
+
+[RFC 8785]: https://www.rfc-editor.org/rfc/rfc8785
+[Alembic]: https://alembic.sqlalchemy.org/en/latest/