From e62418abfde43b4c540ba3ace791e867e48c1fbc Mon Sep 17 00:00:00 2001 From: Adam Hendel <15756360+ChuckHend@users.noreply.github.com> Date: Mon, 23 Oct 2023 19:50:31 -0500 Subject: [PATCH] add index --- Cargo.toml | 2 +- Trunk.toml | 2 +- src/api.rs | 4 +++- src/init.rs | 20 +++++++++++++++++--- src/search.rs | 1 + 5 files changed, 23 insertions(+), 6 deletions(-) diff --git a/Cargo.toml b/Cargo.toml index 23d2b5e..a8ae3a7 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "vectorize" -version = "0.1.2" +version = "0.2.0" edition = "2021" publish = false diff --git a/Trunk.toml b/Trunk.toml index b8006b5..ab131ec 100644 --- a/Trunk.toml +++ b/Trunk.toml @@ -6,7 +6,7 @@ description = "The simplest implementation of LLM-backed vector search on Postgr homepage = "https://github.com/tembo-io/pg_vectorize" documentation = "https://github.com/tembo-io/pg_vectorize" categories = ["orchestration", "machine_learning"] -version = "0.1.2" +version = "0.2.0" [build] postgres_version = "15" diff --git a/src/api.rs b/src/api.rs index a3677ac..52b3d9c 100644 --- a/src/api.rs +++ b/src/api.rs @@ -97,7 +97,9 @@ fn table( ); let ran: Result<_, spi::Error> = Spi::connect(|mut c| { - let _r = c.update(&init_embed_q, None, None)?; + for q in init_embed_q { + let _r = c.update(&q, None, None)?; + } Ok(()) }); if let Err(e) = ran { diff --git a/src/init.rs b/src/init.rs index c7e9d2f..f320dee 100644 --- a/src/init.rs +++ b/src/init.rs @@ -65,7 +65,7 @@ pub fn init_embedding_table_query( transformer: &types::Transformer, search_alg: &types::SimilarityAlg, transform_method: &TableMethod, -) -> String { +) -> Vec<String> { // TODO: when adding support for other models, add the output dimension to the transformer attributes // so that they can be read here, not hard-coded here below // currently only supports the text-embedding-ada-002 embedding model - output dim 1536 @@ -80,8 +80,15 @@ pub fn init_embedding_table_query( (types::Transformer::openai, 
types::SimilarityAlg::pgv_cosine_similarity) => "vector(1536)", }; match transform_method { - TableMethod::append => append_embedding_column(job_name, schema, table, col_type), - TableMethod::join => create_embedding_table(job_name, col_type), + TableMethod::append => { + vec![ + append_embedding_column(job_name, schema, table, col_type), + create_hnsw_cosine_index(job_name, schema, table), + ] + } + TableMethod::join => { + vec![create_embedding_table(job_name, col_type)] + } } } @@ -97,6 +104,13 @@ fn create_embedding_table(job_name: &str, col_type: &str) -> String { ) } +fn create_hnsw_cosine_index(job_name: &str, schema: &str, table: &str) -> String { + format!( + "CREATE INDEX IF NOT EXISTS {job_name}_idx ON {schema}.{table} USING hnsw ({job_name}_embeddings vector_cosine_ops); + ", + ) +} + fn append_embedding_column(job_name: &str, schema: &str, table: &str, col_type: &str) -> String { // TODO: when adding support for other models, add the output dimension to the transformer attributes // so that they can be read here, not hard-coded here below diff --git a/src/search.rs b/src/search.rs index 44aca68..12a3b23 100644 --- a/src/search.rs +++ b/src/search.rs @@ -15,6 +15,7 @@ pub fn cosine_similarity_search( 1 - ({project}_embeddings <=> '{emb}'::vector) AS cosine_similarity, * FROM {schema}.{table} + WHERE {project}_updated_at is NOT NULL ORDER BY cosine_similarity DESC LIMIT {num_results}; "