Commit

Merge branch 'main' into default-fast-field-tokenizer

fulmicoton authored Jul 19, 2023
2 parents a132547 + e3986b7 commit 77b7c0c
Showing 31 changed files with 741 additions and 388 deletions.
21 changes: 12 additions & 9 deletions docs/reference/rest-api.md
@@ -24,7 +24,7 @@ GET [..]/search?query=barack%20obama

Successful requests return a 2xx HTTP status code.

-Failed requests return a 4xx HTTP status code. The response body of failed requests holds a JSON object containing an `message` field that describes the error.
+Failed requests return a 4xx HTTP status code. The response body of failed requests holds a JSON object containing a `message` field that describes the error.

```json
{
@@ -37,7 +37,7 @@ Failed requests return a 4xx HTTP status code. The response body of failed reque
### Search in an index

Search for documents matching a query in the given index `api/v1/<index id>/search`. This endpoint is available as long as you have at least one node running a searcher service in the cluster.
-The search endpoint accepts `GET` and `POST` requests. The [parameters](#get-parameters) are URL parameters in case of `GET` or JSON key value pairs in case of `POST`.
+The search endpoint accepts `GET` and `POST` requests. The [parameters](#get-parameters) are URL parameters for `GET` requests or JSON key-value pairs for `POST` requests.

```
GET api/v1/<index id>/search?query=searchterm
@@ -91,10 +91,10 @@ The response is a JSON object, and the content type is `application/json; charse
GET api/v1/<index id>/search/stream?query=searchterm&fast_field=my_id
```
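Clients issuing these `GET` requests must URL-encode the query string, as in the `barack%20obama` example earlier in this file. A minimal sketch of building such a URL — the helper name and index id are hypothetical, and only spaces are escaped here:

```rust
// Hypothetical helper: builds the GET search URL for an index.
// Only spaces are escaped in this sketch; a real client should
// percent-encode every reserved character per RFC 3986.
fn search_url(index_id: &str, query: &str) -> String {
    let encoded = query.replace(' ', "%20");
    format!("api/v1/{index_id}/search?query={encoded}")
}

fn main() {
    // Prints: api/v1/wikipedia/search?query=barack%20obama
    println!("{}", search_url("wikipedia", "barack obama"));
}
```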

-Streams field values from ALL documents matching a search query in the given index `<index id>`, in a specified output format among the following:
+Streams field values from ALL documents matching a search query in the target index `<index id>`, in a specified output format among the following:

- [CSV](https://datatracker.ietf.org/doc/html/rfc4180)
-- [ClickHouse RowBinary](https://clickhouse.tech/docs/en/interfaces/formats/#rowbinary). If `partition_by_field` is set, Quickwit returns chunks of data for a each partition field value. Each chunk starts with 16 bytes being partition value and content length and then the `fast_field` values in `RowBinary` format.
+- [ClickHouse RowBinary](https://clickhouse.tech/docs/en/interfaces/formats/#rowbinary). If `partition_by_field` is set, Quickwit returns chunks of data for each partition field value. Each chunk starts with 16 bytes being partition value and content length and then the `fast_field` values in `RowBinary` format.

`fast_field` and `partition_by_field` must be fast fields of type `i64` or `u64`.
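A consumer of the partitioned stream could decode one chunk as follows. This is a sketch under stated assumptions: the 16-byte header is taken to be an 8-byte partition value followed by an 8-byte content length, both little-endian `u64`, and the payload a packed run of little-endian `u64` fast field values — the exact wire layout should be checked against the Quickwit source.

```rust
use std::convert::TryInto;

// Sketch: decode one stream chunk into (partition value, fast field values).
// Assumed header layout: 8-byte partition value + 8-byte content length,
// both little-endian u64; payload: packed little-endian u64 values.
fn decode_chunk(buf: &[u8]) -> (u64, Vec<u64>) {
    let partition = u64::from_le_bytes(buf[0..8].try_into().unwrap());
    let content_len = u64::from_le_bytes(buf[8..16].try_into().unwrap()) as usize;
    let values = buf[16..16 + content_len]
        .chunks_exact(8) // each RowBinary u64 is 8 bytes
        .map(|chunk| u64::from_le_bytes(chunk.try_into().unwrap()))
        .collect();
    (partition, values)
}

fn main() {
    let mut buf = Vec::new();
    buf.extend_from_slice(&7u64.to_le_bytes());    // partition value
    buf.extend_from_slice(&16u64.to_le_bytes());   // content length in bytes
    buf.extend_from_slice(&42u64.to_le_bytes());   // first fast field value
    buf.extend_from_slice(&1337u64.to_le_bytes()); // second fast field value
    let (partition, values) = decode_chunk(&buf);
    assert_eq!((partition, values), (7, vec![42, 1337]));
}
```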

@@ -104,7 +104,7 @@ This endpoint is available as long as you have at least one node running a searc

:::note

-The endpoint will return 10 million values if 10 million documents match the query. This is expected, this endpoint is made to support queries matching millions of document and return field values in a reasonable response time.
+The endpoint will return 10 million values if 10 million documents match the query. This is expected, this endpoint is made to support queries matching millions of documents and return field values in a reasonable response time.

:::

@@ -345,24 +345,27 @@ Delete index of ID `index id`.

#### Response

-The response is the list of delete split files, and the content type is `application/json; charset=UTF-8.`
+The response is the list of deleted split files; the content type is `application/json; charset=UTF-8.`

```json
[
{
"split_id": "01GK1XNAECH7P14850S9VV6P94",
"num_docs": 1337,
"uncompressed_docs_size_bytes": 23933408,
"file_name": "01GK1XNAECH7P14850S9VV6P94.split",
"file_size_in_bytes": 2991676
"file_size_bytes": 2991676
}
]
```
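The `file_size_bytes` values in this response can be tallied the same way the CLI reports reclaimed storage (`deleted_bytes / 1_000_000`). A sketch with a minimal stand-in type, not the real response struct:

```rust
// Minimal stand-in for one entry of the delete response; the real type
// lives in Quickwit and carries more fields.
struct SplitInfo {
    file_size_bytes: u64,
}

// Sum the deleted bytes and convert to whole megabytes, mirroring the
// `deleted_bytes / 1_000_000` arithmetic in the CLI output.
fn deleted_megabytes(splits: &[SplitInfo]) -> u64 {
    splits.iter().map(|split| split.file_size_bytes).sum::<u64>() / 1_000_000
}

fn main() {
    let splits = vec![SplitInfo { file_size_bytes: 2_991_676 }];
    // Prints: 2MB of storage deleted.
    println!("{}MB of storage deleted.", deleted_megabytes(&splits));
}
```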

-### Get all indexes metadatas
+### Get all indexes metadata

```
GET api/v1/indexes
```

-Get the indexes metadatas of all indexes present in the metastore.
+Retrieve the metadata of all indexes present in the metastore.

#### Response

4 changes: 2 additions & 2 deletions quickwit/quickwit-cli/src/index.rs
@@ -891,8 +891,8 @@ pub async fn delete_index_cli(args: DeleteIndexArgs) -> anyhow::Result<()> {
"The following files will be removed from the index `{}`",
args.index_id
);
-for file_entry in affected_files {
-println!(" - {}", file_entry.file_name);
+for split_info in affected_files {
+println!(" - {}", split_info.file_name.display());
}
return Ok(());
}
20 changes: 10 additions & 10 deletions quickwit/quickwit-cli/src/tool.rs
@@ -509,47 +509,47 @@ pub async fn garbage_collect_index_cli(args: GarbageCollectIndexArgs) -> anyhow:
let removal_info = index_service
.garbage_collect_index(&args.index_id, args.grace_period, args.dry_run)
.await?;
-if removal_info.removed_split_entries.is_empty() && removal_info.failed_split_ids.is_empty() {
+if removal_info.removed_split_entries.is_empty() && removal_info.failed_splits.is_empty() {
println!("No dangling files to garbage collect.");
return Ok(());
}

if args.dry_run {
println!("The following files will be garbage collected.");
-for file_entry in removal_info.removed_split_entries {
-println!(" - {}", file_entry.file_name);
+for split_info in removal_info.removed_split_entries {
+println!(" - {}", split_info.file_name.display());
}
return Ok(());
}

-if !removal_info.failed_split_ids.is_empty() {
+if !removal_info.failed_splits.is_empty() {
println!("The following splits were attempted to be removed, but failed.");
-for split_id in removal_info.failed_split_ids.iter() {
-println!(" - {split_id}");
+for split_info in &removal_info.failed_splits {
+println!(" - {}", split_info.split_id);
}
println!(
"{} Splits were unable to be removed.",
-removal_info.failed_split_ids.len()
+removal_info.failed_splits.len()
);
}

let deleted_bytes: u64 = removal_info
.removed_split_entries
.iter()
-.map(|entry| entry.file_size_in_bytes)
+.map(|split_info| split_info.file_size_bytes.get_bytes())
.sum();
println!(
"{}MB of storage garbage collected.",
deleted_bytes / 1_000_000
);

-if removal_info.failed_split_ids.is_empty() {
+if removal_info.failed_splits.is_empty() {
println!(
"{} Index successfully garbage collected.",
"✔".color(GREEN_COLOR)
);
} else if removal_info.removed_split_entries.is_empty()
-&& !removal_info.failed_split_ids.is_empty()
+&& !removal_info.failed_splits.is_empty()
{
println!("{} Failed to garbage collect index.", "✘".color(RED_COLOR));
} else {
11 changes: 10 additions & 1 deletion quickwit/quickwit-cluster/src/node.rs
@@ -64,10 +64,12 @@ impl ClusterNode {
port: u16,
is_self_node: bool,
enabled_services: &[&str],
+indexing_tasks: &[IndexingTask],
) -> Self {
use itertools::Itertools;
use quickwit_common::tower::make_channel;

-use crate::member::{ENABLED_SERVICES_KEY, GRPC_ADVERTISE_ADDR_KEY};
+use crate::member::{ENABLED_SERVICES_KEY, GRPC_ADVERTISE_ADDR_KEY, INDEXING_TASK_PREFIX};

let gossip_advertise_addr = ([127, 0, 0, 1], port).into();
let grpc_advertise_addr = ([127, 0, 0, 1], port + 1).into();
@@ -76,6 +78,13 @@ impl ClusterNode {
let mut node_state = NodeState::default();
node_state.set(ENABLED_SERVICES_KEY, enabled_services.join(","));
node_state.set(GRPC_ADVERTISE_ADDR_KEY, grpc_advertise_addr.to_string());

+for (indexing_task, indexing_tasks_group) in
+indexing_tasks.iter().group_by(|&task| task).into_iter()
+{
+let key = format!("{INDEXING_TASK_PREFIX}:{}", indexing_task.to_string());
+node_state.set(key, indexing_tasks_group.count().to_string());
+}
Self::try_new(chitchat_id, &node_state, channel, is_self_node).unwrap()
}
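The loop above relies on `itertools::group_by`, which groups only consecutive equal elements, so it assumes equal tasks are adjacent in the slice. An order-independent sketch of the same key/value encoding with a plain `HashMap` — here `Task` and the `indexing_task:` key format are stand-ins for the real `IndexingTask` type and `INDEXING_TASK_PREFIX`:

```rust
use std::collections::HashMap;

// Stand-in for the real IndexingTask type.
#[derive(Hash, PartialEq, Eq)]
struct Task {
    index_id: String,
    source_id: String,
}

// One gossip key per distinct task; the value is how many pipelines run it.
fn encode_tasks(tasks: &[Task]) -> HashMap<String, String> {
    let mut counts: HashMap<&Task, usize> = HashMap::new();
    for task in tasks {
        *counts.entry(task).or_insert(0) += 1;
    }
    counts
        .into_iter()
        .map(|(task, count)| {
            let key = format!("indexing_task:{}:{}", task.index_id, task.source_id);
            (key, count.to_string())
        })
        .collect()
}

fn main() {
    let tasks = vec![
        Task { index_id: "wikipedia".into(), source_id: "kafka-source".into() },
        Task { index_id: "wikipedia".into(), source_id: "kafka-source".into() },
    ];
    let state = encode_tasks(&tasks);
    // Two identical tasks collapse into one key with count "2".
    assert_eq!(state["indexing_task:wikipedia:kafka-source"], "2");
}
```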

29 changes: 0 additions & 29 deletions quickwit/quickwit-common/src/file_entry.rs

This file was deleted.

2 changes: 0 additions & 2 deletions quickwit/quickwit-common/src/lib.rs
@@ -22,7 +22,6 @@
mod coolid;

pub mod binary_heap;
-mod file_entry;
pub mod fs;
pub mod io;
mod kill_switch;
@@ -49,7 +48,6 @@ use std::ops::{Range, RangeInclusive};
use std::str::FromStr;

pub use coolid::new_coolid;
-pub use file_entry::FileEntry;
pub use kill_switch::KillSwitch;
pub use progress::{Progress, ProtectedZoneGuard};
pub use stream_utils::{BoxStream, ServiceStream};
3 changes: 3 additions & 0 deletions quickwit/quickwit-core/Cargo.toml
@@ -39,3 +39,6 @@ quickwit-storage = { workspace = true }
[dev-dependencies]
mockall = { workspace = true }
serde_yaml = { workspace = true }

+quickwit-common = { workspace = true, features = ["testsuite"] }
+quickwit-metastore = { workspace = true, features = ["testsuite"] }
