Skip to content

Commit

Permalink
Merge branch 'main' into slop_docs
Browse files Browse the repository at this point in the history
  • Loading branch information
fulmicoton authored Jul 20, 2023
2 parents ca19ad2 + ffc935c commit d00ebc7
Show file tree
Hide file tree
Showing 170 changed files with 5,609 additions and 2,609 deletions.
43 changes: 25 additions & 18 deletions .github/workflows/coverage.yml
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ on:
env:
AWS_ACCESS_KEY_ID: "placeholder"
AWS_SECRET_ACCESS_KEY: "placeholder"
AWS_REGION: "us-east-1"
AWS_REGION: us-east-1
CARGO_INCREMENTAL: 0
QW_DISABLE_TELEMETRY: 1
QW_S3_ENDPOINT: "http://localhost:4566" # Services are exposed as localhost because we are not running coverage in a container.
Expand All @@ -24,7 +24,7 @@ env:
jobs:
test:
name: Coverage
runs-on: ubuntu-latest
runs-on: buildjet-8vcpu-ubuntu-2204 # ubuntu-latest
# Setting a container will require changing QW_S3_ENDPOINT to http://localstack:4566
services:
localstack:
Expand All @@ -34,15 +34,15 @@ jobs:
- "4571:4571"
- "8080:8080"
env:
# `kinesalite` provides a more accurate implementation than
# the default Kinesis provider (`kinesis-mock`).
KINESIS_PROVIDER: kinesalite
SERVICES: kinesis,s3
options: >-
--health-cmd "curl -k https://localhost:4566"
--health-interval 10s
--health-timeout 5s
--health-retries 5
credentials:
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_ACCESS_TOKEN }}

postgres:
image: postgres:latest
Expand All @@ -57,6 +57,9 @@ jobs:
--health-interval 10s
--health-timeout 5s
--health-retries 5
credentials:
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_ACCESS_TOKEN }}

kafka-broker:
image: confluentinc/cp-kafka:7.2.1
Expand All @@ -74,24 +77,33 @@ jobs:
KAFKA_TRANSACTION_STATE_LOG_REPLICATION_FACTOR: 1
KAFKA_JMX_PORT: 9101
KAFKA_JMX_HOSTNAME: localhost
KAFKA_HEAP_OPTS: -Xms256M -Xmx256M
options: >-
--health-cmd "cub kafka-ready -b localhost:9092 1 5"
--health-interval 10s
--health-timeout 5s
--health-retries 5
credentials:
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_ACCESS_TOKEN }}

zookeeper:
image: confluentinc/cp-zookeeper:7.2.1
ports:
- "2181:2181"
env:
KAFKA_HEAP_OPTS: -Xms256M -Xmx256M
ZOOKEEPER_CLIENT_PORT: 2181
ZOOKEEPER_TICK_TIME: 2000
options: >-
--health-cmd "cub zk-ready localhost:2181 5"
--health-interval 10s
--health-timeout 5s
--health-retries 5
credentials:
username: ${{ secrets.DOCKERHUB_USERNAME }}
password: ${{ secrets.DOCKERHUB_ACCESS_TOKEN }}

steps:
- uses: actions/checkout@v3

Expand All @@ -106,7 +118,6 @@ jobs:
path: |
~/.cargo/git
~/.cargo/registry
target
key: ${{ runner.os }}-cargo-test-${{ hashFiles('Cargo.lock') }}
restore-keys: |
${{ runner.os }}-cargo-test-${{ hashFiles('Cargo.lock') }}
Expand All @@ -127,24 +138,20 @@ jobs:
- name: Run Pulsar service
run: DOCKER_SERVICES=pulsar make docker-compose-up

- uses: actions-rs/toolchain@v1
with:
toolchain: 1.62
override: true
components: clippy, llvm-tools-preview, rustfmt
- name: Install Rust
run: rustup update stable

- uses: taiki-e/install-action@v1
- uses: taiki-e/install-action@v2
with:
tool: cargo-llvm-cov,protoc

- uses: Swatinem/rust-cache@v2
tool: cargo-llvm-cov,nextest,protoc

# We limit the number of jobs to 4 to avoid OOM errors when linking the binary.
- name: Generate code coverage
run: |
cargo llvm-cov clean --workspace
cargo llvm-cov --test failpoints --no-report --features fail/failpoints
cargo llvm-cov --no-report --all-features
cargo llvm-cov --no-run --lcov --output-path lcov.info
cargo llvm-cov nextest --no-report --test failpoints --features fail/failpoints --retries 2
CARGO_BUILD_JOBS=4 cargo llvm-cov nextest --no-report --all-features --retries 2
cargo llvm-cov report --lcov --output-path lcov.info
working-directory: ./quickwit

- name: Upload coverage to Codecov
Expand Down
34 changes: 34 additions & 0 deletions config/tutorials/wikipedia/multilang-index-config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
#
# Index config file for multilang wikipedia datasets.
#

version: 0.6

index_id: multilang-wikipedia

doc_mapping:
tokenizers:
- name: multilang
type: multilang
field_mappings:
- name: title
type: text
tokenizer: multilang
record: position
stored: true
fieldnorms: true
- name: body
type: text
tokenizer: multilang
record: position
stored: true
fieldnorms: true
- name: url
type: text
tokenizer: raw

search_settings:
default_search_fields: [title, body]

indexing_settings:
commit_timeout_secs: 10
19 changes: 9 additions & 10 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -37,9 +37,6 @@ services:
- all
- localstack
environment:
# `kinesalite` provides a more accurate implementation than
# the default Kinesis provider (`kinesis-mock`).
KINESIS_PROVIDER: kinesalite
SERVICES: kinesis,s3
PERSISTENCE: 1
volumes:
Expand All @@ -53,7 +50,7 @@ services:

postgres:
# The oldest supported version. EOL November 9, 2023
image: postgres:${POSTGRES_VERSION:-11.19-alpine}
image: postgres:${POSTGRES_VERSION:-11.19-alpine}
container_name: postgres
ports:
- "${MAP_HOST_POSTGRESS:-127.0.0.1}:5432:5432"
Expand Down Expand Up @@ -82,14 +79,14 @@ services:
- "${MAP_HOST_PULSAR:-127.0.0.1}:6650:6650"
- "${MAP_HOST_PULSAR:-127.0.0.1}:8081:8080"
environment:
PULSAR_MEM: " -Xms512m -Xmx512m -XX:MaxDirectMemorySize=2g"
PULSAR_MEM: "-Xms256M -Xmx256M"
profiles:
- all
- pulsar

kafka-broker:
# The oldest supported version with arm64 docker images. EOL October 27, 2023
image: confluentinc/cp-kafka:${CP_VERSION:-7.0.9}
# The oldest supported version with arm64 docker images. EOL October 27, 2023
image: confluentinc/cp-kafka:${CP_VERSION:-7.0.9}
container_name: kafka-broker
depends_on:
- zookeeper
Expand All @@ -110,6 +107,7 @@ services:
KAFKA_TRANSACTION_STATE_LOG_REPLICATION_FACTOR: 1
KAFKA_JMX_PORT: 9101
KAFKA_JMX_HOSTNAME: localhost
KAFKA_HEAP_OPTS: -Xms256M -Xmx256M
healthcheck:
test: ["CMD", "cub", "kafka-ready", "-b", "localhost:9092", "1", "5"]
start_period: 5s
Expand All @@ -118,7 +116,7 @@ services:
retries: 100

zookeeper:
# The oldest supported version with arm64 images. EOL October 27, 2023
# The oldest supported version with arm64 images. EOL October 27, 2023
image: confluentinc/cp-zookeeper:${CP_VERSION:-7.0.9}
container_name: zookeeper
ports:
Expand All @@ -127,6 +125,7 @@ services:
- all
- kafka
environment:
KAFKA_HEAP_OPTS: -Xms256M -Xmx256M
ZOOKEEPER_CLIENT_PORT: 2181
ZOOKEEPER_TICK_TIME: 2000
healthcheck:
Expand Down Expand Up @@ -157,8 +156,8 @@ services:
- grafana
- monitoring
environment:
GF_AUTH_DISABLE_LOGIN_FORM: true
GF_AUTH_ANONYMOUS_ENABLED: true
GF_AUTH_DISABLE_LOGIN_FORM: "true"
GF_AUTH_ANONYMOUS_ENABLED: "true"
GF_AUTH_ANONYMOUS_ORG_ROLE: Admin
volumes:
- ./monitoring/grafana/dashboards:/var/lib/grafana/dashboards
Expand Down
30 changes: 21 additions & 9 deletions docs/configuration/index-config.md
Original file line number Diff line number Diff line change
Expand Up @@ -187,31 +187,43 @@ fast: true

#### `datetime` type

The `datetime` type handles dates and datetimes. Each `datetime` field can be configured to support multiple input formats.
The `datetime` type handles dates and datetimes. Since JSON doesn’t have a date type, the `datetime` field supports multiple input types and formats. The supported input types are:
- floating-point or integer numbers representing a Unix timestamp
- strings containing a formatted date, datetime, or Unix timestamp

The `input_formats` field parameter specifies the accepted date formats. The following input formats are natively supported:
- `iso8601`
- `rfc2822`
- `rfc3339`
- `strptime`
- `unix_timestamp`

**Input formats**

When specifying multiple input formats, the corresponding parsers are attempted in the order they are declared. The following formats are natively supported:
- `iso8601`, `rfc2822`, `rfc3339`: parse dates using standard ISO and RFC formats.
- `strptime`: parse dates using the Unix [strptime](https://man7.org/linux/man-pages/man3/strptime.3.html) format with some variations:
- `strptime` format specifiers: `%C`, `%d`, `%D`, `%e`, `%F`, `%g`, `%G`, `%h`, `%H`, `%I`, `%j`, `%k`, `%l`, `%m`, `%M`, `%n`, `%R`, `%S`, `%t`, `%T`, `%u`, `%U`, `%V`, `%w`, `%W`, `%y`, `%Y`, `%%`.
- `%f` for milliseconds precision support.
- `%z` timezone offsets can be specified as `(+|-)hhmm` or `(+|-)hh:mm`.

- `unix_timestamp`: parse float and integer numbers to Unix timestamps. Floating-point values are converted to timestamps expressed in seconds. Integer values are converted to Unix timestamps whose precision determined in `seconds`, `milliseconds`, `microseconds`, or `nanoseconds` is inferred from the number of input digits. Internally, datetimes are stored as `i64`, and Quickwit only supports timestamp values ranging from `Apr 13, 1972 23:59:55` to `Mar 16, 2242 12:56:31` as a result.
:::warning
The timezone name format specifier (`%Z`) is not supported currently.
:::

- `unix_timestamp`: parse float and integer numbers to Unix timestamps. Floating-point values are converted to timestamps expressed in seconds. Integer values are converted to Unix timestamps whose precision, determined in `seconds`, `milliseconds`, `microseconds`, or `nanoseconds`, is inferred from the number of input digits. Internally, datetimes are converted to UTC (if the time zone is specified) and stored as *i64* integers. As a result, Quickwit only supports timestamp values ranging from `Apr 13, 1972 23:59:55` to `Mar 16, 2242 12:56:31`.

:::warning
We discourage ingesting decimal timestamps because the conversion occurs with a loss of precision in some cases. Prefer integer values instead.
Converting timestamps from float to integer values may occur with a loss of precision.
:::

When a `datetime` field is stored as a fast field, the `precision` parameter indicates the precision used to truncate the values before encoding, which improves compression (truncation here means zeroing). The `precision` parameter can take the following values: `seconds`, `milliseconds`, `microseconds`, or `nanoseconds`. It only affects what is stored in fast fields when a `datetime` field is marked as fast field. Finally, operations on `datetime` fastfields, e.g. via aggregations, need to be done at the nanosecond level.
When a `datetime` field is stored as a fast field, the `precision` parameter indicates the precision used to truncate the values before encoding, which improves compression (truncation here means zeroing). The `precision` parameter can take the following values: `seconds`, `milliseconds`, `microseconds`, or `nanoseconds`. It only affects what is stored in fast fields when a `datetime` field is marked as "fast". Finally, operations on `datetime` fast fields, e.g. via aggregations, need to be done at the nanosecond level.

:::info
Internally `datetime` is stored in `nanoseconds` in fast fields and in the docstore, and in `seconds` in the term dictionary.
:::

:::warning
The timezone name format specifier (`%Z`) is not currently supported in `strptime` format.
:::

In addition, Quickwit supports the `output_format` field option to specify with which precision datetimes are deserialized. This options supports the same value as input formats except for `unix_timestamp` which is replaced by the following formats:
In addition, Quickwit supports the `output_format` field parameter to specify with which precision datetimes are deserialized. This parameter supports the same values as input formats, except for `unix_timestamp`, which is replaced by the following formats:
- `unix_timestamp_secs`: displays timestamps in seconds.
- `unix_timestamp_millis`: displays timestamps in milliseconds.
- `unix_timestamp_micros`: displays timestamps in microseconds.
Expand Down
7 changes: 6 additions & 1 deletion docs/configuration/node-config.md
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ storage:
| Property | Description | Default value |
| --- | --- | --- |
| `flavor` | The optional storage flavor to use. Available flavors are `garage`, `gcs`, and `minio`. | |
| `flavor` | The optional storage flavor to use. Available flavors are `digital_ocean`, `garage`, `gcs`, and `minio`. | |
| `access_key_id` | The AWS access key ID. | |
| `secret_access_key` | The AWS secret access key. | |
| `region` | The AWS region to send requests to. | `us-east-1` (SDK default) |
Expand All @@ -91,10 +91,15 @@ Hardcoding credentials into configuration files is not secure and strongly disco
**Storage flavors**

Storage flavors ensure that Quickwit works correctly with storage providers that deviate from the S3 API by automatically configuring the appropriate settings. The available flavors are:
- `digital_ocean`
- `garage`
- `gcs`
- `minio`

*Digital Ocean flavor*

The Digital Ocean flavor (`digital_ocean`) forces path-style access and turns off multi-object delete requests.

*Garage flavor*

The Garage flavor (`garage`) overrides the `region` parameter to `garage` and forces path-style access.
Expand Down
14 changes: 7 additions & 7 deletions docs/log-management/send-logs/using-otel-collector-with-helm.md
Original file line number Diff line number Diff line change
Expand Up @@ -174,23 +174,23 @@ After a few seconds, you should see logs on your indexer that show indexing has
2022-11-30T18:27:52.733Z INFO index_batch{index_id=otel-log-v0 source_id=_ingest-api-source pipeline_ord=0}:uploader:stage_and_upload{split=01GK4WPTXK8GH3AGTRNBN9A8YG}:store_split: quickwit_indexing::split_store::indexing_split_store: store-split-remote-success split_size_in_megabytes=0.018351 num_docs=22 elapsed_secs=0.07654519 throughput_mb_s=0.23974074 is_mature=false
```

If you see some errors there, it's probably coming from a misconfiguration of your object storage. If you need some help, please open an issue on [GitHub](https://github.com/quickwit-oss/quickwit) or come on our [discord server](https://discord.gg/MT27AG5EVE).
If you see some errors there, they probably come from a misconfiguration of your object storage. If you need some help, please open an issue on [GitHub](https://github.com/quickwit-oss/quickwit) or join our [Discord server](https://discord.gg/MT27AG5EVE).


### Ready to search logs

You are now ready to search, wait 30 seconds and you will see the first indexed logs: just [open the UI](http://localhost:7280/ui/search?query=*&index_id=otel-logs-v0&max_hits=10&sort_by_field=-timestamp_secs) and play with it. Funny thing you will see quickwit logs in it :).
You are now ready to search. Wait 30 seconds and you will see the first indexed logs: just [open the UI](http://localhost:7280/ui/search?query=*&index_id=otel-logs-v0&max_hits=10&sort_by=-timestamp_secs) and play with it. Funny thing: you will see Quickwit logs in it :).

Example of queries:

- [body.message:quickwit](http://localhost:7280/ui/search?query=body.message:quickwit&index_id=otel-logs-v0&max_hits=10&sort_by_field=-timestamp_secs)
- [resource_attributes.k8s.container.name:quickwit](http://localhost:7280/ui/search?query=resource_attributes.k8s.container.name%3Aquickwit&index_id=otel-logs-v0&max_hits=10&sort_by_field=-timestamp_secs)
- [resource_attributes.k8s.container.restart_count:1](http://localhost:7280/ui/search?query=resource_attributes.k8s.container.restart_count%3A1&index_id=otel-logs-v0&max_hits=10&sort_by_field=-timestamp_secs)
- [body.message:quickwit](http://localhost:7280/ui/search?query=body.message:quickwit&index_id=otel-logs-v0&max_hits=10&sort_by=-timestamp_secs)
- [resource_attributes.k8s.container.name:quickwit](http://localhost:7280/ui/search?query=resource_attributes.k8s.container.name%3Aquickwit&index_id=otel-logs-v0&max_hits=10&sort_by=-timestamp_secs)
- [resource_attributes.k8s.container.restart_count:1](http://localhost:7280/ui/search?query=resource_attributes.k8s.container.restart_count%3A1&index_id=otel-logs-v0&max_hits=10&sort_by=-timestamp_secs)



![UI screenshot](../../assets/screenshot-ui-otel-logs.png)

And that's all folks!
That's all, folks!

### Clean up

Expand Down
2 changes: 1 addition & 1 deletion docs/overview/concepts/querying.md
Original file line number Diff line number Diff line change
Expand Up @@ -104,4 +104,4 @@ Quickwit does caching in many places to deliver a highly performing query engine

### Scoring

Quickwit supports sorting docs by their BM25 scores. In order to query by score, [fieldnorms](../../configuration/index-config.md#text-type) must be enabled for the field. By default BM25 scoring is disabled to improve query times but it can be opt-in by setting `sort_by_field` option to `_score` in queries.
Quickwit supports sorting docs by their BM25 scores. In order to sort by score, [fieldnorms](../../configuration/index-config.md#text-type) must be enabled for the field. By default, BM25 scoring is disabled to improve query latencies, but you can opt in by setting the `sort_by` option to `_score` in queries.
Loading

0 comments on commit d00ebc7

Please sign in to comment.