Support FullText search on SQLite (#723)
* Add FTS5 full-text index (see the sketch below).

* Activate full_text unit test on SQLite

* ⁉️ Add a rudimentary Alembic migration, experiment with adapter.py

* 🛤️ Track Alembic Version in core

* Getting closer, but missing the JOIN

* Working

* Refine hook to connect to related table.

* ✍️ Make names consistent

* Fix names

* Create virtual table in a more idiomatic way.

* 🥅 Catch all extant files in the FTS5 virtual table upon migration.

* Restore deleted comma after copying from other branch

* Remove unused imports left from rebase

* Add update & delete to fulltext tests

* Format with black

* 🫷 Circumvent the creation of an FTS5 table in Postgres

* Give functions unique names.

* Test FullText search on migrated data.

* Update CHANGELOG.

* Close connection.

---------

Co-authored-by: Kari Barry <[email protected]>
danielballan and Kezzsim authored May 25, 2024
1 parent 80fbc0e commit cdee371
Showing 10 changed files with 348 additions and 33 deletions.
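
Taken together, the commits implement SQLite's FTS5 extension in "external content" mode: the full-text index refers back to the existing nodes table by rowid instead of storing a second copy of the metadata. Below is a minimal, self-contained sketch of that pattern with a simplified schema; it is not Tiled's actual code. The table and column names (metadata_fts5, metadata, rowid) come from the adapter diff below, but the external-content options are assumptions, and the sketch assumes your Python's SQLite build ships with FTS5 enabled (most do).

import sqlite3

conn = sqlite3.connect(":memory:")
conn.executescript(
    """
    CREATE TABLE nodes (id INTEGER PRIMARY KEY, metadata JSON NOT NULL);
    -- External-content mode: index the 'metadata' column of 'nodes',
    -- addressing rows by 'id', rather than duplicating the stored text.
    CREATE VIRTUAL TABLE metadata_fts5 USING fts5(
        metadata, content='nodes', content_rowid='id'
    );
    """
)
conn.execute("""INSERT INTO nodes VALUES (1, '{"color": "blue"}')""")
# With external content, the index must be maintained alongside the base table.
conn.execute(
    "INSERT INTO metadata_fts5 (rowid, metadata) SELECT id, metadata FROM nodes"
)
# MATCH queries the full-text index; rowid JOINs the hits back to nodes.
hits = conn.execute(
    "SELECT nodes.id FROM nodes"
    " JOIN metadata_fts5 ON metadata_fts5.rowid = nodes.id"
    " WHERE metadata_fts5.metadata MATCH 'blue'"
).fetchall()
print(hits)  # [(1,)]
conn.close()
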
6 changes: 5 additions & 1 deletion CHANGELOG.md
@@ -3,7 +3,11 @@ Write the date in place of the "Unreleased" in the case a new version is release

# Changelog

## Unreleased
## v0.1.0b1

### Added

- Support for `FullText` search on SQLite-backed catalogs

### Fixed

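For orientation, here is how the newly supported query is issued from the Python client — a sketch assuming an already-connected client object, mirroring the tests further down:

from tiled.queries import FullText

# Search the metadata of every node in the catalog for the word "purple".
results = client.search(FullText("purple"))
print(list(results))  # e.g. ["full_text_test_case"]
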
1 change: 1 addition & 0 deletions pyproject.toml
@@ -143,6 +143,7 @@ dataframe = [
dev = [
"coverage",
"flake8",
"importlib_resources;python_version < \"3.9\"",
"ldap3",
"pre-commit",
"pytest <8", # TMP pin while plugins catch up
Empty file added tiled/_tests/sql/__init__.py
Empty file.
111 changes: 111 additions & 0 deletions tiled/_tests/sql/before_creating_fts5_virtual_table.sql
@@ -0,0 +1,111 @@
PRAGMA foreign_keys=OFF;
BEGIN TRANSACTION;
CREATE TABLE nodes (
id INTEGER NOT NULL,
"key" VARCHAR(1023) NOT NULL,
ancestors JSON,
structure_family VARCHAR(9) NOT NULL,
metadata JSON NOT NULL,
specs JSON NOT NULL,
time_created DATETIME DEFAULT (CURRENT_TIMESTAMP),
time_updated DATETIME DEFAULT (CURRENT_TIMESTAMP),
PRIMARY KEY (id),
CONSTRAINT key_ancestors_unique_constraint UNIQUE ("key", ancestors)
);
INSERT INTO nodes VALUES(1,'x','[]','array','{"color":"blue"}','[]','2024-05-25 10:18:38','2024-05-25 10:18:38');
CREATE TABLE structures (
id VARCHAR(32) NOT NULL,
structure JSON NOT NULL,
PRIMARY KEY (id),
UNIQUE (id)
);
INSERT INTO structures VALUES('8e5b0a1237f27c3d04d2cb94bc695ff8','{"data_type":{"endianness":"little","kind":"i","itemsize":8},"chunks":[[3]],"shape":[3],"dims":null,"resizable":false}');
CREATE TABLE assets (
id INTEGER NOT NULL,
data_uri VARCHAR(1023),
is_directory BOOLEAN NOT NULL,
hash_type VARCHAR(63),
hash_content VARCHAR(1023),
size INTEGER,
time_created DATETIME DEFAULT (CURRENT_TIMESTAMP),
time_updated DATETIME DEFAULT (CURRENT_TIMESTAMP),
PRIMARY KEY (id)
);
INSERT INTO assets VALUES(1,'file://localhost/home/dallan/Repos/bnl/tiled/data/x',1,NULL,NULL,NULL,'2024-05-25 10:18:38','2024-05-25 10:18:38');
CREATE TABLE data_sources (
id INTEGER NOT NULL,
node_id INTEGER NOT NULL,
structure_id VARCHAR(32),
mimetype VARCHAR(255) NOT NULL,
parameters JSON,
management VARCHAR(9) NOT NULL,
structure_family VARCHAR(9) NOT NULL,
time_created DATETIME DEFAULT (CURRENT_TIMESTAMP),
time_updated DATETIME DEFAULT (CURRENT_TIMESTAMP),
PRIMARY KEY (id),
FOREIGN KEY(node_id) REFERENCES nodes (id) ON DELETE CASCADE,
FOREIGN KEY(structure_id) REFERENCES structures (id) ON DELETE CASCADE
);
INSERT INTO data_sources VALUES(1,1,'8e5b0a1237f27c3d04d2cb94bc695ff8','application/x-zarr','{}','writable','array','2024-05-25 10:18:38','2024-05-25 10:18:38');
CREATE TABLE revisions (
id INTEGER NOT NULL,
node_id INTEGER NOT NULL,
revision_number INTEGER NOT NULL,
metadata JSON NOT NULL,
specs JSON NOT NULL,
time_created DATETIME DEFAULT (CURRENT_TIMESTAMP),
time_updated DATETIME DEFAULT (CURRENT_TIMESTAMP),
PRIMARY KEY (id),
CONSTRAINT node_id_revision_number_unique_constraint UNIQUE (node_id, revision_number),
FOREIGN KEY(node_id) REFERENCES nodes (id) ON DELETE CASCADE
);
CREATE TABLE data_source_asset_association (
data_source_id INTEGER NOT NULL,
asset_id INTEGER NOT NULL,
parameter VARCHAR(255),
num INTEGER,
PRIMARY KEY (data_source_id, asset_id),
CONSTRAINT parameter_num_unique_constraint UNIQUE (data_source_id, parameter, num),
FOREIGN KEY(data_source_id) REFERENCES data_sources (id) ON DELETE CASCADE,
FOREIGN KEY(asset_id) REFERENCES assets (id) ON DELETE CASCADE
);
INSERT INTO data_source_asset_association VALUES(1,1,'data_uri',NULL);
CREATE TABLE alembic_version (
version_num VARCHAR(32) NOT NULL,
CONSTRAINT alembic_version_pkc PRIMARY KEY (version_num)
);
INSERT INTO alembic_version VALUES('e756b9381c14');
CREATE INDEX ix_nodes_id ON nodes (id);
CREATE INDEX top_level_metadata ON nodes (ancestors, time_created, id, metadata);
CREATE UNIQUE INDEX ix_assets_data_uri ON assets (data_uri);
CREATE INDEX ix_assets_id ON assets (id);
CREATE INDEX ix_data_sources_id ON data_sources (id);
CREATE INDEX ix_revisions_id ON revisions (id);
CREATE TRIGGER cannot_insert_num_null_if_num_exists
BEFORE INSERT ON data_source_asset_association
WHEN NEW.num IS NULL
BEGIN
SELECT RAISE(ABORT, 'Can only insert num=NULL if no other row exists for the same parameter')
WHERE EXISTS
(
SELECT 1
FROM data_source_asset_association
WHERE parameter = NEW.parameter
AND data_source_id = NEW.data_source_id
);
END;
CREATE TRIGGER cannot_insert_num_int_if_num_null_exists
BEFORE INSERT ON data_source_asset_association
WHEN NEW.num IS NOT NULL
BEGIN
SELECT RAISE(ABORT, 'Can only insert INTEGER num if no NULL row exists for the same parameter')
WHERE EXISTS
(
SELECT 1
FROM data_source_asset_association
WHERE parameter = NEW.parameter
AND num IS NULL
AND data_source_id = NEW.data_source_id
);
END;
COMMIT;
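
The dump above ends at Alembic revision e756b9381c14. The commit adds a new revision, ed3a4223a600 (registered in tiled/catalog/core.py below), which creates the FTS5 virtual table and backfills it so that metadata predating FullText support gets indexed. The migration script itself is not shown in this diff; the following is a hedged sketch of what such an upgrade step looks like, not the actual file:

from alembic import op

# Revision identifiers, taken from tiled/catalog/core.py in this commit.
revision = "ed3a4223a600"
down_revision = "e756b9381c14"


def upgrade():
    # PostgreSQL uses tsvector instead, so the FTS5 table is SQLite-only.
    if op.get_bind().dialect.name == "sqlite":
        op.execute(
            "CREATE VIRTUAL TABLE metadata_fts5 "
            "USING fts5(metadata, content='nodes', content_rowid='id')"
        )
        # "Catch all extant files": index rows that existed before the upgrade.
        op.execute(
            "INSERT INTO metadata_fts5 (rowid, metadata) "
            "SELECT id, metadata FROM nodes"
        )
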
65 changes: 52 additions & 13 deletions tiled/_tests/test_queries.py
@@ -29,7 +29,7 @@
)
from ..server.app import build_app
from .conftest import TILED_TEST_POSTGRESQL_URI
from .utils import fail_with_status_code, temp_postgres
from .utils import fail_with_status_code, sqlite_from_dump, temp_postgres

keys = list(string.ascii_lowercase)
mapping = {
@@ -50,6 +50,10 @@
numpy.ones(10), metadata={"color": "purple"}
)

mapping["full_text_test_case_urple"] = ArrayAdapter.from_array(
numpy.ones(10), metadata={"color": "urple"}
)

mapping["specs_foo_bar"] = ArrayAdapter.from_array(numpy.ones(10), specs=["foo", "bar"])
mapping["specs_foo_bar_baz"] = ArrayAdapter.from_array(
numpy.ones(10), specs=["foo", "bar", "baz"]
@@ -163,18 +167,53 @@ def test_contains(client):


def test_full_text(client):
if client.metadata["backend"] in {"sqlite"}:

def cm():
return fail_with_status_code(HTTP_400_BAD_REQUEST)

else:
cm = nullcontext
with cm():
assert list(client.search(FullText("z"))) == ["z", "does_contain_z"]
# plainto_tsquery fails to find certain words, weirdly, so it is a useful
# test that we are using tsquery
assert list(client.search(FullText("purple"))) == ["full_text_test_case"]
"Basic test of FullText query"
assert list(client.search(FullText("z"))) == ["z", "does_contain_z"]
# plainto_tsquery fails to find certain words, weirdly, so it is a useful
# test that we are using tsquery
assert list(client.search(FullText("purple"))) == ["full_text_test_case"]
assert list(client.search(FullText("urple"))) == ["full_text_test_case_urple"]


def test_full_text_after_migration():
# Load a SQL database created by an older version of Tiled, predating FullText
# support, and verify that the migration indexes the pre-existing metadata.
with sqlite_from_dump("before_creating_fts5_virtual_table.sql") as database_path:
subprocess.check_call(
[sys.executable]
+ f"-m tiled catalog upgrade-database sqlite+aiosqlite:///{database_path}".split()
)
catalog = from_uri(database_path)
app = build_app(catalog)
with Context.from_app(app) as context:
client = from_context(context)
assert list(client.search(FullText("blue"))) == ["x"]
assert list(client.search(FullText("red"))) == [] # does not exist


def test_full_text_update(client):
if client.metadata["backend"] == "map":
pytest.skip("Updating not supported")
# Update the fulltext index and check that it is current with the main data.
try:
client["full_text_test_case"].update_metadata({"color": "red"})
assert list(client.search(FullText("purple"))) == []
assert list(client.search(FullText("red"))) == ["full_text_test_case"]
finally:
# Reset case in the event tests are run out of order.
client["full_text_test_case"].update_metadata({"color": "purple"})


def test_full_text_delete(client):
if client.metadata["backend"] == "map":
pytest.skip("Updating not supported")
# Delete a record and check that the fulltext index stays current with the main data.
client.write_array(numpy.ones(10), metadata={"item": "toaster"}, key="test_delete")
# Assert that the data was written
assert list(client.search(FullText("toaster"))) == ["test_delete"]
client.delete("test_delete")
assert list(client.search(FullText("purple"))) == ["full_text_test_case"]
assert list(client.search(FullText("toaster"))) == []


def test_regex(client):
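The test_full_text_update and test_full_text_delete tests above exercise the bookkeeping that keeps the FTS index current with the nodes table; the commit wires this through a hook in the catalog adapter ("Refine hook to connect to related table"). For reference, the pure-SQLite alternative from the FTS5 documentation is a set of triggers on the content table — shown here only to illustrate what must stay in sync, not as Tiled's mechanism:

import sqlite3

conn = sqlite3.connect(":memory:")
conn.executescript(
    """
    CREATE TABLE nodes (id INTEGER PRIMARY KEY, metadata JSON NOT NULL);
    CREATE VIRTUAL TABLE metadata_fts5 USING fts5(
        metadata, content='nodes', content_rowid='id'
    );
    -- Standard FTS5 external-content sync triggers, per the SQLite docs.
    CREATE TRIGGER nodes_ai AFTER INSERT ON nodes BEGIN
        INSERT INTO metadata_fts5(rowid, metadata) VALUES (new.id, new.metadata);
    END;
    CREATE TRIGGER nodes_ad AFTER DELETE ON nodes BEGIN
        INSERT INTO metadata_fts5(metadata_fts5, rowid, metadata)
        VALUES ('delete', old.id, old.metadata);
    END;
    CREATE TRIGGER nodes_au AFTER UPDATE ON nodes BEGIN
        INSERT INTO metadata_fts5(metadata_fts5, rowid, metadata)
        VALUES ('delete', old.id, old.metadata);
        INSERT INTO metadata_fts5(rowid, metadata) VALUES (new.id, new.metadata);
    END;
    """
)
conn.close()
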
25 changes: 25 additions & 0 deletions tiled/_tests/utils.py
@@ -1,7 +1,11 @@
import contextlib
import getpass
import sqlite3
import sys
import tempfile
import uuid
from enum import IntEnum
from pathlib import Path

import httpx
import pytest
@@ -11,6 +15,11 @@
from ..client import context
from ..client.base import BaseClient

if sys.version_info < (3, 9):
import importlib_resources as resources
else:
from importlib import resources # Python >= 3.9 only


@contextlib.contextmanager
def fail_with_status_code(status_code):
@@ -64,3 +73,19 @@ class URL_LIMITS(IntEnum):
HUGE = 80_000
DEFAULT = BaseClient.URL_CHARACTER_LIMIT
TINY = 10


@contextlib.contextmanager
def sqlite_from_dump(filename):
"""Create a SQLite db in a temporary directory, loading a SQL script.
The SQL script is given as a filename, assumed to be in tiled/_tests/sql/.
"""
with tempfile.TemporaryDirectory() as directory:
database_path = Path(directory, "catalog.db")
conn = sqlite3.connect(database_path)
ref = resources.files("tiled._tests.sql") / filename
with resources.as_file(ref) as path:
conn.executescript(path.read_text())
conn.close()
yield database_path
41 changes: 24 additions & 17 deletions tiled/catalog/adapter.py
@@ -35,6 +35,7 @@
from sqlalchemy.orm import selectinload
from sqlalchemy.pool import AsyncAdaptedQueuePool
from sqlalchemy.sql.expression import cast
from sqlalchemy.sql.sqltypes import MatchType
from starlette.status import HTTP_404_NOT_FOUND, HTTP_415_UNSUPPORTED_MEDIA_TYPE

from tiled.queries import (
@@ -371,12 +372,24 @@ def structure(self):
return self.data_sources[0].structure
return None

def apply_conditions(self, statement):
# If this is a SQLite database and we are doing a full-text MATCH
# query, we need a JOIN with the FTS5 virtual table.
if (self.context.engine.dialect.name == "sqlite") and any(
isinstance(condition.type, MatchType) for condition in self.conditions
):
statement = statement.join(
orm.metadata_fts5, orm.metadata_fts5.c.rowid == orm.Node.id
)
for condition in self.conditions:
statement = statement.filter(condition)
return statement

async def async_len(self):
statement = select(func.count(orm.Node.key)).filter(
orm.Node.ancestors == self.segments
)
for condition in self.conditions:
statement = statement.filter(condition)
statement = self.apply_conditions(statement)
async with self.context.session() as db:
return (await db.execute(statement)).scalar_one()

@@ -398,17 +411,13 @@ async def lookup_adapter(
# Search queries and access controls apply only at the top level.
assert not first_level.conditions
return await first_level.lookup_adapter(segments[1:])
statement = (
select(orm.Node)
.filter(orm.Node.ancestors == self.segments + ancestors)
.options(
selectinload(orm.Node.data_sources).selectinload(
orm.DataSource.structure
)
)
statement = select(orm.Node)
statement = self.apply_conditions(statement)
statement = statement.filter(
orm.Node.ancestors == self.segments + ancestors
).options(
selectinload(orm.Node.data_sources).selectinload(orm.DataSource.structure)
)
for condition in self.conditions:
statement = statement.filter(condition)
async with self.context.session() as db:
node = (await db.execute(statement.filter(orm.Node.key == key))).scalar()
if node is None:
@@ -953,8 +962,7 @@ async def keys_range(self, offset, limit):
(offset + limit) if limit is not None else None, # noqa: E203
)
statement = select(orm.Node.key).filter(orm.Node.ancestors == self.segments)
for condition in self.conditions:
statement = statement.filter(condition)
statement = self.apply_conditions(statement)
async with self.context.session() as db:
return (
(
Expand All @@ -976,8 +984,7 @@ async def items_range(self, offset, limit):
(offset + limit) if limit is not None else None, # noqa: E203
)
statement = select(orm.Node).filter(orm.Node.ancestors == self.segments)
for condition in self.conditions:
statement = statement.filter(condition)
statement = self.apply_conditions(statement)
async with self.context.session() as db:
nodes = (
(
@@ -1187,7 +1194,7 @@ def contains(query, tree):
def full_text(query, tree):
dialect_name = tree.engine.url.get_dialect().name
if dialect_name == "sqlite":
raise UnsupportedQueryType("full_text")
condition = orm.metadata_fts5.c.metadata.match(query.text)
elif dialect_name == "postgresql":
tsvector = func.jsonb_to_tsvector(
cast("simple", REGCONFIG), orm.Node.metadata_, cast(["string"], JSONB)
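To see why apply_conditions adds a JOIN: on SQLite, SQLAlchemy's .match() compiles to a MATCH expression against the FTS5 virtual table, which the nodes query can only reach through rowid. A standalone sketch with simplified tables (not Tiled's ORM models):

import sqlalchemy as sa
from sqlalchemy.dialects import sqlite
from sqlalchemy.sql.sqltypes import MatchType

md = sa.MetaData()
nodes = sa.Table("nodes", md, sa.Column("id", sa.Integer, primary_key=True))
metadata_fts5 = sa.Table(
    "metadata_fts5",
    md,
    sa.Column("rowid", sa.Integer),
    sa.Column("metadata", sa.Text),
)

condition = metadata_fts5.c.metadata.match("blue")
assert isinstance(condition.type, MatchType)  # the check apply_conditions uses

statement = (
    sa.select(nodes.c.id)
    .join(metadata_fts5, metadata_fts5.c.rowid == nodes.c.id)
    .where(condition)
)
print(statement.compile(dialect=sqlite.dialect()))
# SELECT nodes.id FROM nodes JOIN metadata_fts5 ON metadata_fts5.rowid = nodes.id
# WHERE metadata_fts5.metadata MATCH ?
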
1 change: 1 addition & 0 deletions tiled/catalog/core.py
@@ -5,6 +5,7 @@

# This is list of all valid revisions (from current to oldest).
ALL_REVISIONS = [
"ed3a4223a600",
"e756b9381c14",
"2ca16566d692",
"1cd99c02d0c7",