From 57fe4aa73b1b98dd8eac87c6440e0f2a0c66d4e8 Mon Sep 17 00:00:00 2001 From: Timon Vonk Date: Fri, 6 Sep 2024 19:15:40 +0200 Subject: [PATCH] feat(indexing)!: Use UUIDv3 for indexing node ids (#277) Use UUIDv3 to generate node ids for storage and cache. This is more reliable than the previous u64 hashing, with less chance for collision. Additionally, the previous hash algorithm changes over Rust releases and should not be used. Closes #272 and needed for proper Rust 1.81 support as in #275 BREAKING CHANGE: All generated ids are now UUIDs, meaning all persisted data needs to be purged or manually updated, as default upserts will fail. There is no backwards compatibility. --- Cargo.lock | 1 + Cargo.toml | 1 + swiftide-core/Cargo.toml | 2 +- swiftide-core/src/node.rs | 30 ++++++++++++------- swiftide-integrations/src/fluvio/loader.rs | 2 ++ swiftide-integrations/src/lancedb/mod.rs | 9 +++++- swiftide-integrations/src/lancedb/persist.rs | 11 ++++--- .../src/qdrant/indexing_node.rs | 18 +++++------ swiftide-integrations/src/redis/mod.rs | 4 +-- swiftide-integrations/src/redis/node_cache.rs | 2 +- swiftide-integrations/src/redis/persist.rs | 8 ++--- 11 files changed, 56 insertions(+), 32 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 10701ec1..5800faf5 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -8683,6 +8683,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "81dfa00651efa65069b0b6b651f4aaa31ba9e3c3ce0137aaad053604ee7e0314" dependencies = [ "getrandom", + "md-5", "serde", ] diff --git a/Cargo.toml b/Cargo.toml index 7bf2fb2a..64ba47b6 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -46,6 +46,7 @@ lazy_static = { version = "1.5.0" } chrono = { version = "0.4" } indoc = { version = "2.0" } regex = { version = "1.10.6" } +uuid = { version = "1.10", features = ["v3", "v4", "serde"] } # Integrations spider = { version = "2.2" } diff --git a/swiftide-core/Cargo.toml b/swiftide-core/Cargo.toml index 46dc63c0..02a1e0bb 100644 --- a/swiftide-core/Cargo.toml +++ b/swiftide-core/Cargo.toml @@ -28,7 +28,7 @@ lazy_static = { workspace = true } derive_builder = { workspace = true } tera = { version = "1.20", default-features = false } -uuid = { version = "1.10", features = ["v4"] } +uuid = { workspace = true, features = ["v4", "v3"] } [dev-dependencies] test-case = { workspace = true } diff --git a/swiftide-core/src/node.rs b/swiftide-core/src/node.rs index 76fa8701..f27c6e49 100644 --- a/swiftide-core/src/node.rs +++ b/swiftide-core/src/node.rs @@ -21,6 +21,7 @@ use std::{ collections::HashMap, fmt::Debug, hash::{Hash, Hasher}, + os::unix::ffi::OsStrExt, path::PathBuf, }; @@ -37,7 +38,7 @@ use crate::{metadata::Metadata, Embedding, SparseEmbedding}; #[derive(Default, Clone, Serialize, Deserialize, PartialEq)] pub struct Node { /// Optional identifier for the node. - pub id: Option, + pub id: Option, /// File path associated with the node. pub path: PathBuf, /// Data chunk contained in the node. @@ -162,17 +163,26 @@ impl Node { format!("{}\n{}", metadata, self.chunk) } - /// Calculates a hash value for the node based on its path and chunk. + /// Retrieve the identifier of the node. /// - /// The hash value is calculated using the default hasher provided by the standard library. + /// Calculates the identifier of the node based on its path and chunk as bytes, returning a + /// UUID (v3). /// - /// # Returns - /// - /// A 64-bit hash value representing the node. - pub fn calculate_hash(&self) -> u64 { - let mut hasher = std::collections::hash_map::DefaultHasher::new(); - self.hash(&mut hasher); - hasher.finish() + /// If previously calculated, the identifier is returned directly. If not, a new identifier is + /// calculated without storing it. + pub fn id(&self) -> uuid::Uuid { + if let Some(id) = self.id { + return id; + } + + let bytes = [self.path.as_os_str().as_bytes(), self.chunk.as_bytes()].concat(); + uuid::Uuid::new_v3(&uuid::Uuid::NAMESPACE_OID, &bytes) + } + + /// Updates the identifier of the node. + pub fn update_id(&mut self) { + self.id = None; + self.id = Some(self.id()); } } diff --git a/swiftide-integrations/src/fluvio/loader.rs b/swiftide-integrations/src/fluvio/loader.rs index 9fcbdb37..9173d783 100644 --- a/swiftide-integrations/src/fluvio/loader.rs +++ b/swiftide-integrations/src/fluvio/loader.rs @@ -84,6 +84,7 @@ mod tests { .with_wait_for(testcontainers::core::WaitFor::message_on_stdout( "started successfully", )) + .with_wait_for(testcontainers::core::WaitFor::seconds(1)) .with_network(NETWORK_NAME) .with_container_name("sc") .with_cmd("./fluvio-run sc --local /fluvio/metadata".split(' ')) @@ -96,6 +97,7 @@ mod tests { .with_wait_for(testcontainers::core::WaitFor::message_on_stdout( "started successfully", )) + .with_wait_for(testcontainers::core::WaitFor::seconds(1)) .with_network(NETWORK_NAME) .with_container_name("spu") .with_cmd(format!("./fluvio-run spu -i 5001 -p spu:{SPU_PORT1} -v spu:{SPU_PORT2} --sc-addr sc:9004 --log-base-dir /fluvio/data").split(' ')) diff --git a/swiftide-integrations/src/lancedb/mod.rs b/swiftide-integrations/src/lancedb/mod.rs index 4715fcf3..a79dd30a 100644 --- a/swiftide-integrations/src/lancedb/mod.rs +++ b/swiftide-integrations/src/lancedb/mod.rs @@ -157,7 +157,14 @@ impl LanceDBBuilder { fields.push(Field::new(field.field_name(), DataType::Utf8, false)); } FieldConfig::ID => { - fields.push(Field::new(field.field_name(), DataType::UInt64, false)); + fields.push(Field::new( + field.field_name(), + DataType::FixedSizeList( + Arc::new(Field::new("item", DataType::UInt8, true)), + 16, + ), + false, + )); } } } diff --git a/swiftide-integrations/src/lancedb/persist.rs b/swiftide-integrations/src/lancedb/persist.rs index 323b4cec..c578f473 100644 --- a/swiftide-integrations/src/lancedb/persist.rs +++ b/swiftide-integrations/src/lancedb/persist.rs @@ -3,12 +3,11 @@ use std::sync::Arc; use anyhow::Context as _; use anyhow::Result; use arrow_array::types::Float32Type; -use arrow_array::types::UInt64Type; +use arrow_array::types::UInt8Type; use arrow_array::types::Utf8Type; use arrow_array::Array; use arrow_array::FixedSizeListArray; use arrow_array::GenericByteArray; -use arrow_array::PrimitiveArray; use arrow_array::RecordBatch; use arrow_array::RecordBatchIterator; use async_trait::async_trait; @@ -138,10 +137,14 @@ impl LanceDB { FieldConfig::ID => { let mut row = Vec::with_capacity(nodes.len()); for node in nodes { - let data = Some(node.calculate_hash()); + let data = Some(node.id().as_bytes().map(Some)); row.push(data); } - batches.push(Arc::new(PrimitiveArray::::from_iter(row))); + batches.push(Arc::new(FixedSizeListArray::from_iter_primitive::< + UInt8Type, + _, + _, + >(row, 16))); } } } diff --git a/swiftide-integrations/src/qdrant/indexing_node.rs b/swiftide-integrations/src/qdrant/indexing_node.rs index 8fb5e4b1..1fc0310a 100644 --- a/swiftide-integrations/src/qdrant/indexing_node.rs +++ b/swiftide-integrations/src/qdrant/indexing_node.rs @@ -35,7 +35,7 @@ impl TryInto for NodeWithVectors<'_> { fn try_into(self) -> Result { let node = self.node; // Calculate a unique identifier for the node. - let id = node.calculate_hash(); + let id = node.id(); // Extend the metadata with additional information. // TODO: The node is already cloned in the `NodeWithVectors` constructor. @@ -64,7 +64,7 @@ impl TryInto for NodeWithVectors<'_> { try_create_vectors(&self.vector_fields, vectors, node.sparse_vectors.clone())?; // Construct the `qdrant::PointStruct` and return it. - Ok(qdrant::PointStruct::new(id, vectors, payload)) + Ok(qdrant::PointStruct::new(id.to_string(), vectors, payload)) } } @@ -121,10 +121,10 @@ mod tests { use crate::qdrant::indexing_node::NodeWithVectors; - static EXPECTED_VECTOR_ID: u64 = 17_298_870_094_173_045_322; + static EXPECTED_UUID: &str = "d42d252d-671d-37ef-a157-8e85d0710610"; #[test_case( - Node { id: Some(1), path: "/path".into(), chunk: "data".into(), + Node { id: None, path: "/path".into(), chunk: "data".into(), vectors: Some(HashMap::from([(EmbeddedField::Chunk, vec![1.0])])), original_size: 4, offset: 0, @@ -133,7 +133,7 @@ mod tests { ..Default::default() }, HashSet::from([EmbeddedField::Combined]), - PointStruct { id: Some(PointId::from(EXPECTED_VECTOR_ID)), payload: HashMap::from([ + PointStruct { id: Some(PointId::from(EXPECTED_UUID)), payload: HashMap::from([ ("content".into(), Value::from("data")), ("path".into(), Value::from("/path")), ("m1".into(), Value::from("mv1"))]), @@ -142,7 +142,7 @@ mod tests { "Node with single vector creates struct with unnamed vector" )] #[test_case( - Node { id: Some(1), path: "/path".into(), chunk: "data".into(), + Node { id: None, path: "/path".into(), chunk: "data".into(), vectors: Some(HashMap::from([ (EmbeddedField::Chunk, vec![1.0]), (EmbeddedField::Metadata("m1".into()), vec![2.0]) @@ -154,7 +154,7 @@ mod tests { ..Default::default() }, HashSet::from([EmbeddedField::Chunk, EmbeddedField::Metadata("m1".into())]), - PointStruct { id: Some(PointId::from(EXPECTED_VECTOR_ID)), payload: HashMap::from([ + PointStruct { id: Some(PointId::from(EXPECTED_UUID)), payload: HashMap::from([ ("content".into(), Value::from("data")), ("path".into(), Value::from("/path")), ("m1".into(), Value::from("mv1"))]), @@ -170,7 +170,7 @@ mod tests { "Node with multiple vectors creates struct with named vectors" )] #[test_case( - Node { id: Some(1), path: "/path".into(), chunk: "data".into(), + Node { id: None, path: "/path".into(), chunk: "data".into(), vectors: Some(HashMap::from([ (EmbeddedField::Chunk, vec![1.0]), (EmbeddedField::Combined, vec![1.0]), @@ -184,7 +184,7 @@ mod tests { ..Default::default() }, HashSet::from([EmbeddedField::Combined]), - PointStruct { id: Some(PointId::from(EXPECTED_VECTOR_ID)), payload: HashMap::from([ + PointStruct { id: Some(PointId::from(EXPECTED_UUID)), payload: HashMap::from([ ("content".into(), Value::from("data")), ("path".into(), Value::from("/path")), ("m1".into(), Value::from("mv1")), diff --git a/swiftide-integrations/src/redis/mod.rs b/swiftide-integrations/src/redis/mod.rs index d3fce4ed..545e23b9 100644 --- a/swiftide-integrations/src/redis/mod.rs +++ b/swiftide-integrations/src/redis/mod.rs @@ -123,7 +123,7 @@ impl Redis { /// /// A `String` representing the Redis key for the node. fn cache_key_for_node(&self, node: &Node) -> String { - format!("{}:{}", self.cache_key_prefix, node.calculate_hash()) + format!("{}:{}", self.cache_key_prefix, node.id()) } /// Generates a key for a given node to be persisted in Redis. @@ -131,7 +131,7 @@ impl Redis { if let Some(key_fn) = self.persist_key_fn { key_fn(node) } else { - let hash = node.calculate_hash(); + let hash = node.id(); Ok(format!("{}:{}", node.path.to_string_lossy(), hash)) } } diff --git a/swiftide-integrations/src/redis/node_cache.rs b/swiftide-integrations/src/redis/node_cache.rs index 92c93ae5..02c8249e 100644 --- a/swiftide-integrations/src/redis/node_cache.rs +++ b/swiftide-integrations/src/redis/node_cache.rs @@ -99,7 +99,7 @@ mod tests { cache.reset_cache().await; let node = Node { - id: Some(1), + id: None, path: "test".into(), chunk: "chunk".into(), ..Default::default() diff --git a/swiftide-integrations/src/redis/persist.rs b/swiftide-integrations/src/redis/persist.rs index 84549ae6..85bc79fc 100644 --- a/swiftide-integrations/src/redis/persist.rs +++ b/swiftide-integrations/src/redis/persist.rs @@ -109,7 +109,7 @@ mod tests { .unwrap(); let node = Node { - id: Some(1), + id: None, path: "test".into(), chunk: "chunk".into(), ..Default::default() @@ -134,12 +134,12 @@ mod tests { .unwrap(); let nodes = vec![ Node { - id: Some(1), + id: None, path: "test".into(), ..Default::default() }, Node { - id: Some(2), + id: None, path: "other".into(), ..Default::default() }, @@ -168,7 +168,7 @@ mod tests { .build() .unwrap(); let node = Node { - id: Some(1), + id: None, ..Default::default() };