
Merge branch 'main' into trinity--ui-sort-order
fmassot authored Oct 21, 2023
2 parents eae6550 + 340b189 commit 0ad7454
Showing 173 changed files with 11,746 additions and 7,949 deletions.
2 changes: 2 additions & 0 deletions .github/actions/cargo-build-macos-binary/action.yml
@@ -20,6 +20,8 @@ runs:
node-version: 20
cache: "yarn"
cache-dependency-path: quickwit/quickwit-ui/yarn.lock
- run: yarn global add node-gyp
shell: bash
- run: make build-ui
shell: bash
- name: Install protoc
2 changes: 2 additions & 0 deletions .github/actions/cross-build-binary/action.yml
@@ -20,6 +20,8 @@ runs:
node-version: 20
cache: "yarn"
cache-dependency-path: quickwit/quickwit-ui/yarn.lock
- run: yarn global add node-gyp
shell: bash
- run: make build-ui
shell: bash
- name: Install rustup
2 changes: 2 additions & 0 deletions .github/workflows/publish_nightly_packages.yml
@@ -10,6 +10,7 @@ jobs:
name: Build ${{ matrix.target }}
runs-on: macos-latest
strategy:
fail-fast: false
matrix:
target: [x86_64-apple-darwin, aarch64-apple-darwin]
steps:
@@ -21,6 +22,7 @@ jobs:
token: ${{ secrets.GITHUB_TOKEN }}
build-linux-binaries:
strategy:
fail-fast: false
matrix:
target: [x86_64-unknown-linux-gnu, aarch64-unknown-linux-gnu]
name: Build ${{ matrix.target }}
@@ -16,7 +16,7 @@ doc_mapping:
input_formats:
- unix_timestamp
output_format: unix_timestamp_secs
precision: seconds
fast_precision: seconds
fast: true
- name: event_type
type: text
2 changes: 1 addition & 1 deletion config/tutorials/gh-archive/index-config.yaml
@@ -37,7 +37,7 @@ doc_mapping:
fast: true
input_formats:
- rfc3339
precision: seconds
fast_precision: seconds
timestamp_field: created_at

indexing_settings:
2 changes: 1 addition & 1 deletion config/tutorials/hdfs-logs/index-config-partitioned.yaml
@@ -13,7 +13,7 @@ doc_mapping:
input_formats:
- unix_timestamp
output_format: unix_timestamp_secs
precision: seconds
fast_precision: seconds
fast: true
- name: tenant_id
type: u64
@@ -13,7 +13,7 @@ doc_mapping:
input_formats:
- unix_timestamp
output_format: unix_timestamp_secs
precision: seconds
fast_precision: seconds
fast: true
- name: tenant_id
type: u64
2 changes: 1 addition & 1 deletion config/tutorials/hdfs-logs/index-config.yaml
@@ -13,7 +13,7 @@ doc_mapping:
input_formats:
- unix_timestamp
output_format: unix_timestamp_secs
precision: seconds
fast_precision: seconds
fast: true
- name: tenant_id
type: u64
2 changes: 1 addition & 1 deletion config/tutorials/otel-trace/index-config.yaml
@@ -34,7 +34,7 @@ doc_mapping:
- name: span_start_timestamp_secs
type: datetime
indexed: true
precision: seconds
fast_precision: seconds
fast: true
input_formats: [unix_timestamp]
output_format: unix_timestamp_secs
2 changes: 1 addition & 1 deletion config/tutorials/stackoverflow/index-config.yaml
@@ -22,7 +22,7 @@ doc_mapping:
fast: true
input_formats:
- rfc3339
precision: seconds
fast_precision: seconds
timestamp_field: creationDate

search_settings:
8 changes: 4 additions & 4 deletions docs/configuration/index-config.md
@@ -34,7 +34,7 @@ doc_mapping:
input_formats:
- unix_timestamp
output_format: unix_timestamp_secs
precision: seconds
fast_precision: seconds
fast: true
- name: severity_text
type: text
@@ -222,7 +222,7 @@ The timezone name format specifier (`%Z`) is not supported currently.
Converting timestamps from float to integer values may incur a loss of precision.
:::

When a `datetime` field is stored as a fast field, the `precision` parameter indicates the precision used to truncate the values before encoding, which improves compression (truncation here means zeroing). The `precision` parameter can take the following values: `seconds`, `milliseconds`, `microseconds`, or `nanoseconds`. It only affects what is stored in fast fields when a `datetime` field is marked as "fast". Finally, operations on `datetime` fast fields, e.g. via aggregations, need to be done at the nanosecond level.
When a `datetime` field is stored as a fast field, the `fast_precision` parameter indicates the precision used to truncate the values before encoding, which improves compression (truncation here means zeroing). The `fast_precision` parameter can take the following values: `seconds`, `milliseconds`, `microseconds`, or `nanoseconds`. It only affects what is stored in fast fields when a `datetime` field is marked as "fast". Finally, operations on `datetime` fast fields, e.g. via aggregations, need to be done at the nanosecond level.
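The truncation described above can be sketched in a few lines of Python. This is only an illustration of the zeroing behavior, not Quickwit's actual fast-field encoding (which lives in tantivy); the `truncate` helper and its names are hypothetical:

```python
# Illustrative sketch: fast-field datetimes are held as nanosecond timestamps,
# and fast_precision zeroes everything below the chosen unit before encoding.

PRECISION_NANOS = {
    "seconds": 1_000_000_000,
    "milliseconds": 1_000_000,
    "microseconds": 1_000,
    "nanoseconds": 1,
}

def truncate(ts_nanos: int, fast_precision: str) -> int:
    """Zero out the sub-precision part of a nanosecond timestamp."""
    unit = PRECISION_NANOS[fast_precision]
    return ts_nanos - ts_nanos % unit

ts = 1_469_687_755_123_456_789  # a nanosecond-resolution timestamp
print(truncate(ts, "seconds"))       # 1469687755000000000
print(truncate(ts, "milliseconds"))  # 1469687755123000000
```

Because the truncated values share long runs of trailing zeros, they compress much better, at the cost of sub-unit resolution in aggregations.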

:::info
Internally `datetime` is stored in `nanoseconds` in fast fields and in the docstore, and in `seconds` in the term dictionary.
@@ -248,7 +248,7 @@ output_format: unix_timestamp_secs
stored: true
indexed: true
fast: true
precision: milliseconds
fast_precision: milliseconds
```

**Parameters for datetime field**
@@ -260,7 +260,7 @@ precision: milliseconds
| `stored` | Whether the field values are stored in the document store | `true` |
| `indexed` | Whether the field values are indexed | `true` |
| `fast` | Whether the field values are stored in a fast field | `false` |
| `precision` | The precision (`seconds`, `milliseconds`, `microseconds`, or `nanoseconds`) used to store the fast values. | `seconds` |
| `fast_precision` | The precision (`seconds`, `milliseconds`, `microseconds`, or `nanoseconds`) used to store the fast values. | `seconds` |

#### `bool` type

2 changes: 1 addition & 1 deletion docs/distributed-tracing/otel-service.md
@@ -84,7 +84,7 @@ doc_mapping:
input_formats: [unix_timestamp]
indexed: false
fast: true
precision: seconds
fast_precision: seconds
stored: false
- name: span_duration_millis
type: u64
2 changes: 1 addition & 1 deletion docs/get-started/quickstart.md
@@ -108,7 +108,7 @@ doc_mapping:
fast: true
input_formats:
- rfc3339
precision: seconds
fast_precision: seconds
timestamp_field: creationDate

search_settings:
@@ -68,7 +68,7 @@ doc_mapping:
input_formats:
- unix_timestamp
output_format: unix_timestamp_secs
precision: seconds
fast_precision: seconds
fast: true
- name: event_type
type: text
@@ -106,7 +106,7 @@ doc_mapping:
input_formats:
- unix_timestamp
output_format: unix_timestamp_secs
precision: seconds
fast_precision: seconds
fast: true
- name: tenant_id
type: u64
@@ -151,7 +151,8 @@ gunzip -c hdfs-logs-multitenants.json.gz | ./quickwit index ingest --index hdfs-
:::note
4GB of RAM is enough to index this dataset; an instance like `t4g.medium` with 4GB and 2 vCPU indexed this dataset in 20 minutes.
8GB of RAM is enough to index this dataset; an instance like `t4g.large` with 8GB and 2 vCPU indexed this dataset in less than 10 minutes
(provided that you have some CPU credits).
This step can also be done on your local machine.
The `ingest` subcommand generates locally [splits](../../overview/architecture) of 10 million documents and will upload
@@ -165,7 +166,34 @@ You can check it's working by using `search` subcommand and look for `ERROR` in
./quickwit index search --index hdfs-logs --query "severity_text:ERROR"
```
Now that we have indexed the logs and can search from one instance, It's time to configure and start two other instances to form a cluster.
which returns the json
```json
{
"num_hits": 345,
"hits": [
{
"attributes": {
"class": "org.apache.hadoop.hdfs.server.datanode.DataNode"
},
"body": "RECEIVED SIGNAL 15: SIGTERM",
"resource": {
"service": "datanode/16"
},
"severity_text": "ERROR",
"tenant_id": 51,
"timestamp": 1469687755
},
...
],
"elapsed_time_micros": 522542
}
```
You can see that this query has 345 hits. On this first run, the server responded in 523 milliseconds.
Subsequent runs use the cached metastore and can be resolved in under 100 milliseconds.
Now that we have indexed the logs and can search from one instance, it's time to configure and start two other instances to form a cluster.
## Start two more instances
@@ -244,49 +272,6 @@ Now that you have a search cluster, ideally, you will want to load balance exter
This can quickly be done by adding an AWS load balancer to listen to incoming HTTP or HTTPS traffic and forward it to a target group.
You can now play with your cluster, kill processes randomly, add/remove new instances, and keep calm.
## Use time pruning
Let's execute a simple query that returns only `ERROR` entries on field `severity_text`:
```bash
curl -v 'http://your-load-balancer/api/v1/hdfs-logs/search?query=severity_text:ERROR
```

which returns the json

```json
{
"num_hits": 10000,
"hits": [
{
"body": "Receiving BP-108841162-10.10.34.11-1440074360971:blk_1073836032_95208 src: /10.10.34.20:60300 dest: /10.10.34.13:50010",
"resource": {
"service": "datanode/03"
},
"severity_text": "INFO",
"tenant_id": 58,
"timestamp": 1440670490
}
...
],
"elapsed_time_micros": 2502
}
```

You can see that this query has 10 000 hits and that the server responds in 2.5 milliseconds.

The index config shows that we can use the timestamp field parameters `start_timestamp` and `end_timestamp` and benefit from time pruning.
Behind the scenes, Quickwit will only query [splits](../../overview/architecture) that have logs in this time range. This can have a significant impact on speed.


```bash
curl -v 'http://your-load-balancer/api/v1/hdfs-logs/search?query=severity_text:ERROR&start_timestamp=1442834249&end_timestamp=1442900000'
```

Returns 6 hits in 0.36 seconds.


## Clean
Let's do some cleanup by deleting the index:
2 changes: 1 addition & 1 deletion docs/get-started/tutorials/tutorial-hdfs-logs.md
@@ -95,7 +95,7 @@ doc_mapping:
input_formats:
- unix_timestamp
output_format: unix_timestamp_secs
precision: seconds
fast_precision: seconds
fast: true
- name: tenant_id
type: u64
8 changes: 4 additions & 4 deletions docs/guides/schemaless.md
@@ -37,7 +37,7 @@ doc_mapping:
input_formats:
- unix_timestamp
output_format: unix_timestamp_secs
precision: seconds
fast_precision: seconds
fast: true
- name: server
type: text
@@ -125,7 +125,7 @@ doc_mapping:
input_formats:
- unix_timestamp
output_format: unix_timestamp_secs
precision: seconds
fast_precision: seconds
fast: true
- name: user_id
type: text
@@ -159,7 +159,7 @@ doc_mapping:
input_formats:
- unix_timestamp
output_format: unix_timestamp_secs
precision: seconds
fast_precision: seconds
fast: true
- name: user_id
type: text
@@ -230,7 +230,7 @@ doc_mapping:
input_formats:
- unix_timestamp
output_format: unix_timestamp_secs
precision: seconds
fast_precision: seconds
fast: true
- name: Attributes
type: json
2 changes: 1 addition & 1 deletion docs/ingest-data/kafka.md
@@ -58,7 +58,7 @@ doc_mapping:
fast: true
input_formats:
- rfc3339
precision: seconds
fast_precision: seconds
timestamp_field: created_at

indexing_settings:
2 changes: 1 addition & 1 deletion docs/ingest-data/kinesis.md
@@ -68,7 +68,7 @@ doc_mapping:
fast: true
input_formats:
- rfc3339
precision: seconds
fast_precision: seconds
timestamp_field: created_at

indexing_settings:
2 changes: 1 addition & 1 deletion docs/ingest-data/pulsar.md
@@ -105,7 +105,7 @@ doc_mapping:
fast: true
input_formats:
- rfc3339
precision: seconds
fast_precision: seconds
timestamp_field: creationDate

search_settings:
2 changes: 1 addition & 1 deletion docs/log-management/otel-service.md
@@ -36,7 +36,7 @@ doc_mapping:
input_formats: [unix_timestamp]
indexed: false
fast: true
precision: seconds
fast_precision: seconds
stored: false
- name: timestamp_nanos
type: u64
2 changes: 1 addition & 1 deletion docs/log-management/send-logs/using-vector.md
@@ -61,7 +61,7 @@ doc_mapping:
output_format: unix_timestamp_nanos
indexed: false
fast: true
precision: milliseconds
fast_precision: milliseconds
- name: observed_timestamp_nanos
type: datetime
input_formats: [unix_timestamp]
2 changes: 1 addition & 1 deletion docs/reference/cli.md
@@ -336,7 +336,7 @@ quickwit index ingest
| `--batch-size-limit` | Size limit of each submitted document batch. |
| `--wait` | Wait for all documents to be committed and available for search before exiting |
| `--force` | Force a commit after the last document is sent, and wait for all documents to be committed and available for search before exiting |
| `--commit-timeout` | Duration of the commit timeout operation. |
| `--commit-timeout` | Timeout for ingest operations that require waiting for the final commit (`--wait` or `--force`). This is different from the `commit_timeout_secs` indexing setting, which sets the maximum time before committing splits after their creation. |

*Examples*

6 changes: 3 additions & 3 deletions docs/reference/rest-api.md
@@ -61,8 +61,8 @@ POST api/v1/<index id>/search
| Variable | Type | Description | Default value |
|---------------------|------------|--------------------------------------------------------------------------------------------------------------------------------------------------------|----------------------------------------------------|
| `query` | `String` | Query text. See the [query language doc](query-language.md) (mandatory) | |
| `start_timestamp` | `i64` | If set, restrict search to documents with a `timestamp >= start_timestamp`. The value must be in seconds. | |
| `end_timestamp` | `i64` | If set, restrict search to documents with a `timestamp < end_timestamp`. The value must be in seconds. | |
| `start_timestamp` | `i64` | If set, restrict search to documents with a `timestamp >= start_timestamp`, taking advantage of potential time pruning opportunities. The value must be in seconds. | |
| `end_timestamp` | `i64` | If set, restrict search to documents with a `timestamp < end_timestamp`, taking advantage of potential time pruning opportunities. The value must be in seconds. | |
| `start_offset` | `Integer` | Number of documents to skip | `0` |
| `max_hits` | `Integer` | Maximum number of hits to return (by default 20) | `20` |
| `search_field` | `[String]` | Fields to search on if no field name is specified in the query. Comma-separated list, e.g. "field1,field2" | index_config.search_settings.default_search_fields |
@@ -262,7 +262,7 @@ curl -XPOST http://0.0.0.0:8080/api/v1/indexes --data @index_config.json -H "Con
"name": "timestamp",
"type": "datetime",
"input_formats": ["unix_timestamp"],
"precision": "seconds",
"fast_precision": "seconds",
"fast": true
},
{