Skip to content

Commit

Permalink
feat(indexing)!: Use UUIDv3 for indexing node ids (#277)
Browse files Browse the repository at this point in the history
Use UUIDv3 to generate node ids for storage and cache. This is more
reliable than the previous u64 hashing, with less chance for collision.
Additionally, the previous hash algorithm changes over Rust releases and
should not be used.

Closes #272 and needed for proper Rust 1.81 support as in #275 

BREAKING CHANGE: All generated ids are now UUIDs, meaning all persisted
data needs to be purged or manually updated, as default upserts will
fail. There is no backwards compatibility.
  • Loading branch information
timonv committed Sep 6, 2024
1 parent 5a724df commit 57fe4aa
Show file tree
Hide file tree
Showing 11 changed files with 56 additions and 32 deletions.
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@ lazy_static = { version = "1.5.0" }
chrono = { version = "0.4" }
indoc = { version = "2.0" }
regex = { version = "1.10.6" }
uuid = { version = "1.10", features = ["v3", "v4", "serde"] }

# Integrations
spider = { version = "2.2" }
Expand Down
2 changes: 1 addition & 1 deletion swiftide-core/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ lazy_static = { workspace = true }
derive_builder = { workspace = true }

tera = { version = "1.20", default-features = false }
uuid = { version = "1.10", features = ["v4"] }
uuid = { workspace = true, features = ["v4", "v3"] }

[dev-dependencies]
test-case = { workspace = true }
Expand Down
30 changes: 20 additions & 10 deletions swiftide-core/src/node.rs
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ use std::{
collections::HashMap,
fmt::Debug,
hash::{Hash, Hasher},
os::unix::ffi::OsStrExt,
path::PathBuf,
};

Expand All @@ -37,7 +38,7 @@ use crate::{metadata::Metadata, Embedding, SparseEmbedding};
#[derive(Default, Clone, Serialize, Deserialize, PartialEq)]
pub struct Node {
/// Optional identifier for the node.
pub id: Option<u64>,
pub id: Option<uuid::Uuid>,
/// File path associated with the node.
pub path: PathBuf,
/// Data chunk contained in the node.
Expand Down Expand Up @@ -162,17 +163,26 @@ impl Node {
format!("{}\n{}", metadata, self.chunk)
}

/// Calculates a hash value for the node based on its path and chunk.
/// Retrieve the identifier of the node.
///
/// The hash value is calculated using the default hasher provided by the standard library.
/// Calculates the identifier of the node based on its path and chunk as bytes, returning a
/// UUID (v3).
///
/// # Returns
///
/// A 64-bit hash value representing the node.
pub fn calculate_hash(&self) -> u64 {
let mut hasher = std::collections::hash_map::DefaultHasher::new();
self.hash(&mut hasher);
hasher.finish()
/// If previously calculated, the identifier is returned directly. If not, a new identifier is
/// calculated without storing it.
pub fn id(&self) -> uuid::Uuid {
if let Some(id) = self.id {
return id;
}

let bytes = [self.path.as_os_str().as_bytes(), self.chunk.as_bytes()].concat();
uuid::Uuid::new_v3(&uuid::Uuid::NAMESPACE_OID, &bytes)
}

/// Updates the identifier of the node.
pub fn update_id(&mut self) {
self.id = None;
self.id = Some(self.id());
}
}

Expand Down
2 changes: 2 additions & 0 deletions swiftide-integrations/src/fluvio/loader.rs
Original file line number Diff line number Diff line change
Expand Up @@ -84,6 +84,7 @@ mod tests {
.with_wait_for(testcontainers::core::WaitFor::message_on_stdout(
"started successfully",
))
.with_wait_for(testcontainers::core::WaitFor::seconds(1))
.with_network(NETWORK_NAME)
.with_container_name("sc")
.with_cmd("./fluvio-run sc --local /fluvio/metadata".split(' '))
Expand All @@ -96,6 +97,7 @@ mod tests {
.with_wait_for(testcontainers::core::WaitFor::message_on_stdout(
"started successfully",
))
.with_wait_for(testcontainers::core::WaitFor::seconds(1))
.with_network(NETWORK_NAME)
.with_container_name("spu")
.with_cmd(format!("./fluvio-run spu -i 5001 -p spu:{SPU_PORT1} -v spu:{SPU_PORT2} --sc-addr sc:9004 --log-base-dir /fluvio/data").split(' '))
Expand Down
9 changes: 8 additions & 1 deletion swiftide-integrations/src/lancedb/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -157,7 +157,14 @@ impl LanceDBBuilder {
fields.push(Field::new(field.field_name(), DataType::Utf8, false));
}
FieldConfig::ID => {
fields.push(Field::new(field.field_name(), DataType::UInt64, false));
fields.push(Field::new(
field.field_name(),
DataType::FixedSizeList(
Arc::new(Field::new("item", DataType::UInt8, true)),
16,
),
false,
));
}
}
}
Expand Down
11 changes: 7 additions & 4 deletions swiftide-integrations/src/lancedb/persist.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,11 @@ use std::sync::Arc;
use anyhow::Context as _;
use anyhow::Result;
use arrow_array::types::Float32Type;
use arrow_array::types::UInt64Type;
use arrow_array::types::UInt8Type;
use arrow_array::types::Utf8Type;
use arrow_array::Array;
use arrow_array::FixedSizeListArray;
use arrow_array::GenericByteArray;
use arrow_array::PrimitiveArray;
use arrow_array::RecordBatch;
use arrow_array::RecordBatchIterator;
use async_trait::async_trait;
Expand Down Expand Up @@ -138,10 +137,14 @@ impl LanceDB {
FieldConfig::ID => {
let mut row = Vec::with_capacity(nodes.len());
for node in nodes {
let data = Some(node.calculate_hash());
let data = Some(node.id().as_bytes().map(Some));
row.push(data);
}
batches.push(Arc::new(PrimitiveArray::<UInt64Type>::from_iter(row)));
batches.push(Arc::new(FixedSizeListArray::from_iter_primitive::<
UInt8Type,
_,
_,
>(row, 16)));
}
}
}
Expand Down
18 changes: 9 additions & 9 deletions swiftide-integrations/src/qdrant/indexing_node.rs
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ impl TryInto<qdrant::PointStruct> for NodeWithVectors<'_> {
fn try_into(self) -> Result<qdrant::PointStruct> {
let node = self.node;
// Calculate a unique identifier for the node.
let id = node.calculate_hash();
let id = node.id();

// Extend the metadata with additional information.
// TODO: The node is already cloned in the `NodeWithVectors` constructor.
Expand Down Expand Up @@ -64,7 +64,7 @@ impl TryInto<qdrant::PointStruct> for NodeWithVectors<'_> {
try_create_vectors(&self.vector_fields, vectors, node.sparse_vectors.clone())?;

// Construct the `qdrant::PointStruct` and return it.
Ok(qdrant::PointStruct::new(id, vectors, payload))
Ok(qdrant::PointStruct::new(id.to_string(), vectors, payload))
}
}

Expand Down Expand Up @@ -121,10 +121,10 @@ mod tests {

use crate::qdrant::indexing_node::NodeWithVectors;

static EXPECTED_VECTOR_ID: u64 = 17_298_870_094_173_045_322;
static EXPECTED_UUID: &str = "d42d252d-671d-37ef-a157-8e85d0710610";

#[test_case(
Node { id: Some(1), path: "/path".into(), chunk: "data".into(),
Node { id: None, path: "/path".into(), chunk: "data".into(),
vectors: Some(HashMap::from([(EmbeddedField::Chunk, vec![1.0])])),
original_size: 4,
offset: 0,
Expand All @@ -133,7 +133,7 @@ mod tests {
..Default::default()
},
HashSet::from([EmbeddedField::Combined]),
PointStruct { id: Some(PointId::from(EXPECTED_VECTOR_ID)), payload: HashMap::from([
PointStruct { id: Some(PointId::from(EXPECTED_UUID)), payload: HashMap::from([
("content".into(), Value::from("data")),
("path".into(), Value::from("/path")),
("m1".into(), Value::from("mv1"))]),
Expand All @@ -142,7 +142,7 @@ mod tests {
"Node with single vector creates struct with unnamed vector"
)]
#[test_case(
Node { id: Some(1), path: "/path".into(), chunk: "data".into(),
Node { id: None, path: "/path".into(), chunk: "data".into(),
vectors: Some(HashMap::from([
(EmbeddedField::Chunk, vec![1.0]),
(EmbeddedField::Metadata("m1".into()), vec![2.0])
Expand All @@ -154,7 +154,7 @@ mod tests {
..Default::default()
},
HashSet::from([EmbeddedField::Chunk, EmbeddedField::Metadata("m1".into())]),
PointStruct { id: Some(PointId::from(EXPECTED_VECTOR_ID)), payload: HashMap::from([
PointStruct { id: Some(PointId::from(EXPECTED_UUID)), payload: HashMap::from([
("content".into(), Value::from("data")),
("path".into(), Value::from("/path")),
("m1".into(), Value::from("mv1"))]),
Expand All @@ -170,7 +170,7 @@ mod tests {
"Node with multiple vectors creates struct with named vectors"
)]
#[test_case(
Node { id: Some(1), path: "/path".into(), chunk: "data".into(),
Node { id: None, path: "/path".into(), chunk: "data".into(),
vectors: Some(HashMap::from([
(EmbeddedField::Chunk, vec![1.0]),
(EmbeddedField::Combined, vec![1.0]),
Expand All @@ -184,7 +184,7 @@ mod tests {
..Default::default()
},
HashSet::from([EmbeddedField::Combined]),
PointStruct { id: Some(PointId::from(EXPECTED_VECTOR_ID)), payload: HashMap::from([
PointStruct { id: Some(PointId::from(EXPECTED_UUID)), payload: HashMap::from([
("content".into(), Value::from("data")),
("path".into(), Value::from("/path")),
("m1".into(), Value::from("mv1")),
Expand Down
4 changes: 2 additions & 2 deletions swiftide-integrations/src/redis/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -123,15 +123,15 @@ impl Redis {
///
/// A `String` representing the Redis key for the node.
fn cache_key_for_node(&self, node: &Node) -> String {
format!("{}:{}", self.cache_key_prefix, node.calculate_hash())
format!("{}:{}", self.cache_key_prefix, node.id())
}

/// Generates a key for a given node to be persisted in Redis.
fn persist_key_for_node(&self, node: &Node) -> Result<String> {
if let Some(key_fn) = self.persist_key_fn {
key_fn(node)
} else {
let hash = node.calculate_hash();
let hash = node.id();
Ok(format!("{}:{}", node.path.to_string_lossy(), hash))
}
}
Expand Down
2 changes: 1 addition & 1 deletion swiftide-integrations/src/redis/node_cache.rs
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ mod tests {
cache.reset_cache().await;

let node = Node {
id: Some(1),
id: None,
path: "test".into(),
chunk: "chunk".into(),
..Default::default()
Expand Down
8 changes: 4 additions & 4 deletions swiftide-integrations/src/redis/persist.rs
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,7 @@ mod tests {
.unwrap();

let node = Node {
id: Some(1),
id: None,
path: "test".into(),
chunk: "chunk".into(),
..Default::default()
Expand All @@ -134,12 +134,12 @@ mod tests {
.unwrap();
let nodes = vec![
Node {
id: Some(1),
id: None,
path: "test".into(),
..Default::default()
},
Node {
id: Some(2),
id: None,
path: "other".into(),
..Default::default()
},
Expand Down Expand Up @@ -168,7 +168,7 @@ mod tests {
.build()
.unwrap();
let node = Node {
id: Some(1),
id: None,
..Default::default()
};

Expand Down

0 comments on commit 57fe4aa

Please sign in to comment.