Support FullText search on SQLite (#723)
* Add FTS5 full-text index (see the sketch below).

* Activate full_text unit test on SQLite

* ⁉️ Add a rudimentary Alembic migration, experiment with adapter.py

* 🛤️ Track Alembic Version in core

* Getting closer, but missing the JOIN

* Working

* Refine hook to connect to related table.

* ✍️ Make names consistent

* Fix names

* Create virtual table in a more idiomatic way.

* 🥅 Catch all extant files in the FTS5 virtual table upon migration.

* Restore deleted comma after copying from other branch

* Remove unused imports left from rebase

* Add update & delete to fulltext tests

* Format with black

* 🫷 Circumvent the creation of an FTS5 table in Postgres

* Give functions unique names.

* Test FullText search on migrated data.

* Update CHANGELOG.

* Close connection.

---------

Co-authored-by: Kari Barry <[email protected]>
danielballan and Kezzsim authored May 25, 2024
1 parent 80fbc0e commit cdee371
Showing 10 changed files with 348 additions and 33 deletions.
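
Taken together, the commits implement SQLite's FTS5 extension in "external content" mode: the full-text index refers back to the existing nodes table by rowid instead of storing a second copy of the metadata. Below is a minimal, self-contained sketch of that pattern with a simplified schema; it is not Tiled's actual code. The table and column names (metadata_fts5, metadata, rowid) come from the adapter diff below, but the external-content options are assumptions, and the sketch assumes your Python's SQLite build ships with FTS5 enabled (most do).

import sqlite3

conn = sqlite3.connect(":memory:")
conn.executescript(
    """
    CREATE TABLE nodes (id INTEGER PRIMARY KEY, metadata JSON NOT NULL);
    -- External-content mode: index the 'metadata' column of 'nodes',
    -- addressing rows by 'id', rather than duplicating the stored text.
    CREATE VIRTUAL TABLE metadata_fts5 USING fts5(
        metadata, content='nodes', content_rowid='id'
    );
    """
)
conn.execute("""INSERT INTO nodes VALUES (1, '{"color": "blue"}')""")
# With external content, the index must be maintained alongside the base table.
conn.execute(
    "INSERT INTO metadata_fts5 (rowid, metadata) SELECT id, metadata FROM nodes"
)
# MATCH queries the full-text index; rowid JOINs the hits back to nodes.
hits = conn.execute(
    "SELECT nodes.id FROM nodes"
    " JOIN metadata_fts5 ON metadata_fts5.rowid = nodes.id"
    " WHERE metadata_fts5.metadata MATCH 'blue'"
).fetchall()
print(hits)  # [(1,)]
conn.close()
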
6 changes: 5 additions & 1 deletion CHANGELOG.md
@@ -3,7 +3,11 @@ Write the date in place of the "Unreleased" in the case a new version is release

# Changelog

## Unreleased
## v0.1.0b1

### Added

- Support for `FullText` search on SQLite-backed catalogs

### Fixed

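For orientation, here is how the newly supported query is issued from the Python client — a sketch assuming an already-connected client object, mirroring the tests further down:

from tiled.queries import FullText

# Search the metadata of every node in the catalog for the word "purple".
results = client.search(FullText("purple"))
print(list(results))  # e.g. ["full_text_test_case"]
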
1 change: 1 addition & 0 deletions pyproject.toml
@@ -143,6 +143,7 @@ dataframe = [
dev = [
"coverage",
"flake8",
"importlib_resources;python_version < \"3.9\"",
"ldap3",
"pre-commit",
"pytest <8", # TMP pin while plugins catch up
Empty file added tiled/_tests/sql/__init__.py
Empty file.
111 changes: 111 additions & 0 deletions tiled/_tests/sql/before_creating_fts5_virtual_table.sql
@@ -0,0 +1,111 @@
PRAGMA foreign_keys=OFF;
BEGIN TRANSACTION;
CREATE TABLE nodes (
id INTEGER NOT NULL,
"key" VARCHAR(1023) NOT NULL,
ancestors JSON,
structure_family VARCHAR(9) NOT NULL,
metadata JSON NOT NULL,
specs JSON NOT NULL,
time_created DATETIME DEFAULT (CURRENT_TIMESTAMP),
time_updated DATETIME DEFAULT (CURRENT_TIMESTAMP),
PRIMARY KEY (id),
CONSTRAINT key_ancestors_unique_constraint UNIQUE ("key", ancestors)
);
INSERT INTO nodes VALUES(1,'x','[]','array','{"color":"blue"}','[]','2024-05-25 10:18:38','2024-05-25 10:18:38');
CREATE TABLE structures (
id VARCHAR(32) NOT NULL,
structure JSON NOT NULL,
PRIMARY KEY (id),
UNIQUE (id)
);
INSERT INTO structures VALUES('8e5b0a1237f27c3d04d2cb94bc695ff8','{"data_type":{"endianness":"little","kind":"i","itemsize":8},"chunks":[[3]],"shape":[3],"dims":null,"resizable":false}');
CREATE TABLE assets (
id INTEGER NOT NULL,
data_uri VARCHAR(1023),
is_directory BOOLEAN NOT NULL,
hash_type VARCHAR(63),
hash_content VARCHAR(1023),
size INTEGER,
time_created DATETIME DEFAULT (CURRENT_TIMESTAMP),
time_updated DATETIME DEFAULT (CURRENT_TIMESTAMP),
PRIMARY KEY (id)
);
INSERT INTO assets VALUES(1,'file://localhost/home/dallan/Repos/bnl/tiled/data/x',1,NULL,NULL,NULL,'2024-05-25 10:18:38','2024-05-25 10:18:38');
CREATE TABLE data_sources (
id INTEGER NOT NULL,
node_id INTEGER NOT NULL,
structure_id VARCHAR(32),
mimetype VARCHAR(255) NOT NULL,
parameters JSON,
management VARCHAR(9) NOT NULL,
structure_family VARCHAR(9) NOT NULL,
time_created DATETIME DEFAULT (CURRENT_TIMESTAMP),
time_updated DATETIME DEFAULT (CURRENT_TIMESTAMP),
PRIMARY KEY (id),
FOREIGN KEY(node_id) REFERENCES nodes (id) ON DELETE CASCADE,
FOREIGN KEY(structure_id) REFERENCES structures (id) ON DELETE CASCADE
);
INSERT INTO data_sources VALUES(1,1,'8e5b0a1237f27c3d04d2cb94bc695ff8','application/x-zarr','{}','writable','array','2024-05-25 10:18:38','2024-05-25 10:18:38');
CREATE TABLE revisions (
id INTEGER NOT NULL,
node_id INTEGER NOT NULL,
revision_number INTEGER NOT NULL,
metadata JSON NOT NULL,
specs JSON NOT NULL,
time_created DATETIME DEFAULT (CURRENT_TIMESTAMP),
time_updated DATETIME DEFAULT (CURRENT_TIMESTAMP),
PRIMARY KEY (id),
CONSTRAINT node_id_revision_number_unique_constraint UNIQUE (node_id, revision_number),
FOREIGN KEY(node_id) REFERENCES nodes (id) ON DELETE CASCADE
);
CREATE TABLE data_source_asset_association (
data_source_id INTEGER NOT NULL,
asset_id INTEGER NOT NULL,
parameter VARCHAR(255),
num INTEGER,
PRIMARY KEY (data_source_id, asset_id),
CONSTRAINT parameter_num_unique_constraint UNIQUE (data_source_id, parameter, num),
FOREIGN KEY(data_source_id) REFERENCES data_sources (id) ON DELETE CASCADE,
FOREIGN KEY(asset_id) REFERENCES assets (id) ON DELETE CASCADE
);
INSERT INTO data_source_asset_association VALUES(1,1,'data_uri',NULL);
CREATE TABLE alembic_version (
version_num VARCHAR(32) NOT NULL,
CONSTRAINT alembic_version_pkc PRIMARY KEY (version_num)
);
INSERT INTO alembic_version VALUES('e756b9381c14');
CREATE INDEX ix_nodes_id ON nodes (id);
CREATE INDEX top_level_metadata ON nodes (ancestors, time_created, id, metadata);
CREATE UNIQUE INDEX ix_assets_data_uri ON assets (data_uri);
CREATE INDEX ix_assets_id ON assets (id);
CREATE INDEX ix_data_sources_id ON data_sources (id);
CREATE INDEX ix_revisions_id ON revisions (id);
CREATE TRIGGER cannot_insert_num_null_if_num_exists
BEFORE INSERT ON data_source_asset_association
WHEN NEW.num IS NULL
BEGIN
SELECT RAISE(ABORT, 'Can only insert num=NULL if no other row exists for the same parameter')
WHERE EXISTS
(
SELECT 1
FROM data_source_asset_association
WHERE parameter = NEW.parameter
AND data_source_id = NEW.data_source_id
);
END;
CREATE TRIGGER cannot_insert_num_int_if_num_null_exists
BEFORE INSERT ON data_source_asset_association
WHEN NEW.num IS NOT NULL
BEGIN
SELECT RAISE(ABORT, 'Can only insert INTEGER num if no NULL row exists for the same parameter')
WHERE EXISTS
(
SELECT 1
FROM data_source_asset_association
WHERE parameter = NEW.parameter
AND num IS NULL
AND data_source_id = NEW.data_source_id
);
END;
COMMIT;
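
The dump above ends at Alembic revision e756b9381c14. The commit adds a new revision, ed3a4223a600 (registered in tiled/catalog/core.py below), which creates the FTS5 virtual table and backfills it so that metadata predating FullText support gets indexed. The migration script itself is not shown in this diff; the following is a hedged sketch of what such an upgrade step looks like, not the actual file:

from alembic import op

# Revision identifiers, taken from tiled/catalog/core.py in this commit.
revision = "ed3a4223a600"
down_revision = "e756b9381c14"


def upgrade():
    # PostgreSQL uses tsvector instead, so the FTS5 table is SQLite-only.
    if op.get_bind().dialect.name == "sqlite":
        op.execute(
            "CREATE VIRTUAL TABLE metadata_fts5 "
            "USING fts5(metadata, content='nodes', content_rowid='id')"
        )
        # "Catch all extant files": index rows that existed before the upgrade.
        op.execute(
            "INSERT INTO metadata_fts5 (rowid, metadata) "
            "SELECT id, metadata FROM nodes"
        )
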
65 changes: 52 additions & 13 deletions tiled/_tests/test_queries.py
@@ -29,7 +29,7 @@
)
from ..server.app import build_app
from .conftest import TILED_TEST_POSTGRESQL_URI
from .utils import fail_with_status_code, temp_postgres
from .utils import fail_with_status_code, sqlite_from_dump, temp_postgres

keys = list(string.ascii_lowercase)
mapping = {
@@ -50,6 +50,10 @@
numpy.ones(10), metadata={"color": "purple"}
)

mapping["full_text_test_case_urple"] = ArrayAdapter.from_array(
numpy.ones(10), metadata={"color": "urple"}
)

mapping["specs_foo_bar"] = ArrayAdapter.from_array(numpy.ones(10), specs=["foo", "bar"])
mapping["specs_foo_bar_baz"] = ArrayAdapter.from_array(
numpy.ones(10), specs=["foo", "bar", "baz"]
@@ -163,18 +167,53 @@ def test_contains(client):


def test_full_text(client):
if client.metadata["backend"] in {"sqlite"}:

def cm():
return fail_with_status_code(HTTP_400_BAD_REQUEST)

else:
cm = nullcontext
with cm():
assert list(client.search(FullText("z"))) == ["z", "does_contain_z"]
# plainto_tsquery fails to find certain words, weirdly, so it is a useful
# test that we are using tsquery
assert list(client.search(FullText("purple"))) == ["full_text_test_case"]
"Basic test of FullText query"
assert list(client.search(FullText("z"))) == ["z", "does_contain_z"]
# plainto_tsquery fails to find certain words, weirdly, so it is a useful
# test that we are using tsquery
assert list(client.search(FullText("purple"))) == ["full_text_test_case"]
assert list(client.search(FullText("urple"))) == ["full_text_test_case_urple"]


def test_full_text_after_migration():
# Load a SQL database created by an older version of Tiled, predating FullText
# support, and verify that the migration indexes the pre-existing metadata.
with sqlite_from_dump("before_creating_fts5_virtual_table.sql") as database_path:
subprocess.check_call(
[sys.executable]
+ f"-m tiled catalog upgrade-database sqlite+aiosqlite:///{database_path}".split()
)
catalog = from_uri(database_path)
app = build_app(catalog)
with Context.from_app(app) as context:
client = from_context(context)
assert list(client.search(FullText("blue"))) == ["x"]
assert list(client.search(FullText("red"))) == [] # does not exist


def test_full_text_update(client):
if client.metadata["backend"] == "map":
pytest.skip("Updating not supported")
# Update the fulltext index and check that it is current with the main data.
try:
client["full_text_test_case"].update_metadata({"color": "red"})
assert list(client.search(FullText("purple"))) == []
assert list(client.search(FullText("red"))) == ["full_text_test_case"]
finally:
# Reset case in the event tests are run out of order.
client["full_text_test_case"].update_metadata({"color": "purple"})


def test_full_text_delete(client):
if client.metadata["backend"] == "map":
pytest.skip("Updating not supported")
# Delete a record and check that the fulltext index stays current with the main data.
client.write_array(numpy.ones(10), metadata={"item": "toaster"}, key="test_delete")
# Assert that the data was written
assert list(client.search(FullText("toaster"))) == ["test_delete"]
client.delete("test_delete")
assert list(client.search(FullText("purple"))) == ["full_text_test_case"]
assert list(client.search(FullText("toaster"))) == []


def test_regex(client):
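The test_full_text_update and test_full_text_delete tests above exercise the bookkeeping that keeps the FTS index current with the nodes table; the commit wires this through a hook in the catalog adapter ("Refine hook to connect to related table"). For reference, the pure-SQLite alternative from the FTS5 documentation is a set of triggers on the content table — shown here only to illustrate what must stay in sync, not as Tiled's mechanism:

import sqlite3

conn = sqlite3.connect(":memory:")
conn.executescript(
    """
    CREATE TABLE nodes (id INTEGER PRIMARY KEY, metadata JSON NOT NULL);
    CREATE VIRTUAL TABLE metadata_fts5 USING fts5(
        metadata, content='nodes', content_rowid='id'
    );
    -- Standard FTS5 external-content sync triggers, per the SQLite docs.
    CREATE TRIGGER nodes_ai AFTER INSERT ON nodes BEGIN
        INSERT INTO metadata_fts5(rowid, metadata) VALUES (new.id, new.metadata);
    END;
    CREATE TRIGGER nodes_ad AFTER DELETE ON nodes BEGIN
        INSERT INTO metadata_fts5(metadata_fts5, rowid, metadata)
        VALUES ('delete', old.id, old.metadata);
    END;
    CREATE TRIGGER nodes_au AFTER UPDATE ON nodes BEGIN
        INSERT INTO metadata_fts5(metadata_fts5, rowid, metadata)
        VALUES ('delete', old.id, old.metadata);
        INSERT INTO metadata_fts5(rowid, metadata) VALUES (new.id, new.metadata);
    END;
    """
)
conn.close()
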
25 changes: 25 additions & 0 deletions tiled/_tests/utils.py
@@ -1,7 +1,11 @@
import contextlib
import getpass
import sqlite3
import sys
import tempfile
import uuid
from enum import IntEnum
from pathlib import Path

import httpx
import pytest
@@ -11,6 +15,11 @@
from ..client import context
from ..client.base import BaseClient

if sys.version_info < (3, 9):
import importlib_resources as resources
else:
from importlib import resources # Python >= 3.9 only


@contextlib.contextmanager
def fail_with_status_code(status_code):
@@ -64,3 +73,19 @@ class URL_LIMITS(IntEnum):
HUGE = 80_000
DEFAULT = BaseClient.URL_CHARACTER_LIMIT
TINY = 10


@contextlib.contextmanager
def sqlite_from_dump(filename):
"""Create a SQLite db in a temporary directory, loading a SQL script.
The SQL script is given as a filename, assumed to be in tiled/_tests/sql/.
"""
with tempfile.TemporaryDirectory() as directory:
database_path = Path(directory, "catalog.db")
conn = sqlite3.connect(database_path)
ref = resources.files("tiled._tests.sql") / filename
with resources.as_file(ref) as path:
conn.executescript(path.read_text())
conn.close()
yield database_path
41 changes: 24 additions & 17 deletions tiled/catalog/adapter.py
@@ -35,6 +35,7 @@
from sqlalchemy.orm import selectinload
from sqlalchemy.pool import AsyncAdaptedQueuePool
from sqlalchemy.sql.expression import cast
from sqlalchemy.sql.sqltypes import MatchType
from starlette.status import HTTP_404_NOT_FOUND, HTTP_415_UNSUPPORTED_MEDIA_TYPE

from tiled.queries import (
@@ -371,12 +372,24 @@ def structure(self):
return self.data_sources[0].structure
return None

def apply_conditions(self, statement):
# If this is a SQLite database and we are doing a full-text MATCH
# query, we need a JOIN with the FTS5 virtual table.
if (self.context.engine.dialect.name == "sqlite") and any(
isinstance(condition.type, MatchType) for condition in self.conditions
):
statement = statement.join(
orm.metadata_fts5, orm.metadata_fts5.c.rowid == orm.Node.id
)
for condition in self.conditions:
statement = statement.filter(condition)
return statement

async def async_len(self):
statement = select(func.count(orm.Node.key)).filter(
orm.Node.ancestors == self.segments
)
for condition in self.conditions:
statement = statement.filter(condition)
statement = self.apply_conditions(statement)
async with self.context.session() as db:
return (await db.execute(statement)).scalar_one()

@@ -398,17 +411,13 @@ async def lookup_adapter(
# Search queries and access controls apply only at the top level.
assert not first_level.conditions
return await first_level.lookup_adapter(segments[1:])
statement = (
select(orm.Node)
.filter(orm.Node.ancestors == self.segments + ancestors)
.options(
selectinload(orm.Node.data_sources).selectinload(
orm.DataSource.structure
)
)
statement = select(orm.Node)
statement = self.apply_conditions(statement)
statement = statement.filter(
orm.Node.ancestors == self.segments + ancestors
).options(
selectinload(orm.Node.data_sources).selectinload(orm.DataSource.structure)
)
for condition in self.conditions:
statement = statement.filter(condition)
async with self.context.session() as db:
node = (await db.execute(statement.filter(orm.Node.key == key))).scalar()
if node is None:
@@ -953,8 +962,7 @@ async def keys_range(self, offset, limit):
(offset + limit) if limit is not None else None, # noqa: E203
)
statement = select(orm.Node.key).filter(orm.Node.ancestors == self.segments)
for condition in self.conditions:
statement = statement.filter(condition)
statement = self.apply_conditions(statement)
async with self.context.session() as db:
return (
(
Expand All @@ -976,8 +984,7 @@ async def items_range(self, offset, limit):
(offset + limit) if limit is not None else None, # noqa: E203
)
statement = select(orm.Node).filter(orm.Node.ancestors == self.segments)
for condition in self.conditions:
statement = statement.filter(condition)
statement = self.apply_conditions(statement)
async with self.context.session() as db:
nodes = (
(
@@ -1187,7 +1194,7 @@ def contains(query, tree):
def full_text(query, tree):
dialect_name = tree.engine.url.get_dialect().name
if dialect_name == "sqlite":
raise UnsupportedQueryType("full_text")
condition = orm.metadata_fts5.c.metadata.match(query.text)
elif dialect_name == "postgresql":
tsvector = func.jsonb_to_tsvector(
cast("simple", REGCONFIG), orm.Node.metadata_, cast(["string"], JSONB)
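To see why apply_conditions adds a JOIN: on SQLite, SQLAlchemy's .match() compiles to a MATCH expression against the FTS5 virtual table, which the nodes query can only reach through rowid. A standalone sketch with simplified tables (not Tiled's ORM models):

import sqlalchemy as sa
from sqlalchemy.dialects import sqlite
from sqlalchemy.sql.sqltypes import MatchType

md = sa.MetaData()
nodes = sa.Table("nodes", md, sa.Column("id", sa.Integer, primary_key=True))
metadata_fts5 = sa.Table(
    "metadata_fts5",
    md,
    sa.Column("rowid", sa.Integer),
    sa.Column("metadata", sa.Text),
)

condition = metadata_fts5.c.metadata.match("blue")
assert isinstance(condition.type, MatchType)  # the check apply_conditions uses

statement = (
    sa.select(nodes.c.id)
    .join(metadata_fts5, metadata_fts5.c.rowid == nodes.c.id)
    .where(condition)
)
print(statement.compile(dialect=sqlite.dialect()))
# SELECT nodes.id FROM nodes JOIN metadata_fts5 ON metadata_fts5.rowid = nodes.id
# WHERE metadata_fts5.metadata MATCH ?
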
1 change: 1 addition & 0 deletions tiled/catalog/core.py
@@ -5,6 +5,7 @@

# This is list of all valid revisions (from current to oldest).
ALL_REVISIONS = [
"ed3a4223a600",
"e756b9381c14",
"2ca16566d692",
"1cd99c02d0c7",