diff --git a/.github/workflows/helm-release.yaml b/.github/workflows/helm-release.yaml index b0ee40c36568..c303939fdf39 100644 --- a/.github/workflows/helm-release.yaml +++ b/.github/workflows/helm-release.yaml @@ -1,7 +1,13 @@ name: helm-release on: - workflow_dispatch: # must be invoked manually + workflow_dispatch: # for manual testing + push: + branches: + - main + - k[0-9]+ + paths: + - 'production/helm/loki/Chart.yaml' jobs: call-update-helm-repo: diff --git a/.github/workflows/helm-tagged-release-pr.yaml b/.github/workflows/helm-tagged-release-pr.yaml index 1a5e6bdeccff..fb6ed4315497 100644 --- a/.github/workflows/helm-tagged-release-pr.yaml +++ b/.github/workflows/helm-tagged-release-pr.yaml @@ -1,15 +1,23 @@ -name: helm-weekly-release-pr +name: Helm tagged release PR on: release: types: - released + workflow_dispatch: # for manual testing + jobs: weekly-release-pr: runs-on: ubuntu-latest + env: + RELEASE_VERSION: "${{ github.event.release.tag_name || 'test' }}" + BUILD_IN_CONTAINER: false steps: - uses: actions/checkout@v4 + - uses: gabe565/setup-helm-docs-action@v1 + with: + version: v1.11.2 - id: "get_github_app_token" name: "get github app token" @@ -21,13 +29,14 @@ jobs: - name: Update/regenerate files id: update - run: bash .github/workflows/scripts/helm-tagged-release.sh ${{ github.event.release.tag_name }} + run: | + bash .github/workflows/scripts/helm-tagged-release.sh ${RELEASE_VERSION} - name: Create Pull Request uses: peter-evans/create-pull-request@v5 with: token: ${{ steps.get_github_app_token.outputs.token }} - title: Release loki Helm chart ${{ steps.update.outputs.new_chart_version }} + title: "chore: release loki helm chart ${{ steps.update.outputs.new_chart_version }}" body: Automated PR created by [helm-tagged-release-pr.yaml](https://github.com/grafana/loki/blob/main/.github/workflows/helm-tagged-release-pr.yaml) commit-message: Update loki chart to ${{ steps.update.outputs.new_chart_version }} branch: helm-chart-tagged-${{ steps.update.outputs.new_chart_version }} diff --git a/.github/workflows/helm-weekly-release-pr.yaml b/.github/workflows/helm-weekly-release-pr.yaml index 7ac88b7b9584..343a9fae3539 100644 --- a/.github/workflows/helm-weekly-release-pr.yaml +++ b/.github/workflows/helm-weekly-release-pr.yaml @@ -1,4 +1,4 @@ -name: helm-weekly-release-pr +name: Helm weekly release PR on: schedule: @@ -6,12 +6,21 @@ on: workflow_dispatch: # for manual testing +permissions: + contents: "read" + id-token: "write" + pull-requests: "write" + jobs: weekly-release-pr: runs-on: ubuntu-latest + env: + BUILD_IN_CONTAINER: false steps: - uses: actions/checkout@v4 - - uses: imjasonh/setup-crane@v0.4 + - uses: gabe565/setup-helm-docs-action@v1 + with: + version: v1.11.2 - id: "get_github_app_token" name: "get github app token" @@ -21,15 +30,37 @@ jobs: owner: "${{ github.repository_owner }}" private-key: "${{ secrets.APP_PRIVATE_KEY }}" - - name: Update/regenerate files + - name: "Login to DockerHub (from vault)" + uses: "grafana/shared-workflows/actions/dockerhub-login@main" + + - uses: imjasonh/setup-crane@v0.4 + + - name: Update/regenerate files for k release + id: update-k + run: | + bash .github/workflows/scripts/helm-weekly-release.sh -k + + - name: Create Pull Request + uses: peter-evans/create-pull-request@v5 + with: + token: ${{ steps.get_github_app_token.outputs.token }} + title: "chore: release loki helm chart ${{ steps.update-k.outputs.new_chart_version }}" + body: Automated PR created by 
[helm-weekly-release-pr.yaml](https://github.com/grafana/loki/blob/main/.github/workflows/helm-weekly-release-pr.yaml) + commit-message: Update loki chart to ${{ steps.update-k.outputs.new_chart_version }} + branch: helm-chart-weekly-${{ steps.update-k.outputs.new_chart_version }} + base: ${{ steps.update-k.outputs.weekly }} + labels: helm + + - name: Update/regenerate files for standard release id: update - run: bash .github/workflows/scripts/helm-weekly-release.sh + run: | + bash .github/workflows/scripts/helm-weekly-release.sh - name: Create Pull Request uses: peter-evans/create-pull-request@v5 with: token: ${{ steps.get_github_app_token.outputs.token }} - title: Release loki Helm chart ${{ steps.update.outputs.new_chart_version }} + title: "chore: release loki helm chart ${{ steps.update.outputs.new_chart_version }}" body: Automated PR created by [helm-weekly-release-pr.yaml](https://github.com/grafana/loki/blob/main/.github/workflows/helm-weekly-release-pr.yaml) commit-message: Update loki chart to ${{ steps.update.outputs.new_chart_version }} branch: helm-chart-weekly-${{ steps.update.outputs.new_chart_version }} diff --git a/.github/workflows/logql-analyzer.yml b/.github/workflows/logql-analyzer.yml new file mode 100644 index 000000000000..d78d90fb805a --- /dev/null +++ b/.github/workflows/logql-analyzer.yml @@ -0,0 +1,114 @@ +name: LogQL Analyzer + +on: + workflow_dispatch: + release: + types: + - released + +permissions: + contents: read + id-token: write + +jobs: + analyze: + runs-on: ubuntu-latest + + env: + BUILD_TIMEOUT: 60 + IMAGE_PREFIX: "grafana" + RELEASE_VERSION: "${{ github.event.release.tag_name || 'test' }}" + steps: + - uses: actions/checkout@v4 + with: + fetch-tags: true + path: loki + + - name: prepare + id: prepare + env: + MAJOR_MINOR_VERSION_REGEXP: '([0-9]+\\.[0-9]+)' + RELEASE_TAG_REGEXP: '^([0-9]+\\.[0-9]+\\.[0-9]+)$' + working-directory: loki + run: | + echo "$(./tools/image-tag)" > .tag + if [[ "$RELEASE_VERSION" == "test" ]]; then + echo "RELEASE_VERSION is not set, using image tag" + RELEASE_VERSION="$(cat .tag)" + fi + echo "RELEASE_VERSION: $RELEASE_VERSION" + + # if the tag matches the pattern `D.D.D` then RELEASE_NAME="D-D-x", otherwise RELEASE_NAME="next" + RELEASE_NAME=$([[ $RELEASE_VERSION =~ $RELEASE_TAG_REGEXP ]] && echo $RELEASE_TAG | grep -oE $MAJOR_MINOR_VERSION_REGEXP | sed "s/\\./-/g" | sed "s/$/-x/" || echo "next") + echo "RELEASE_NAME: $RELEASE_NAME" + + echo "release_version=${RELEASE_VERSION}" >> "$GITHUB_OUTPUT" + echo "release_name=${RELEASE_NAME}" >> "$GITHUB_OUTPUT" + + - id: "get-github-app-token" + name: "get github app token" + uses: "actions/create-github-app-token@v1" + with: + app-id: "${{ secrets.APP_ID }}" + owner: "${{ github.repository_owner }}" + private-key: "${{ secrets.APP_PRIVATE_KEY }}" + + - name: "Set up QEMU" + uses: "docker/setup-qemu-action@v3" + - name: "set up docker buildx" + uses: "docker/setup-buildx-action@v3" + - name: "Login to DockerHub (from vault)" + uses: "grafana/shared-workflows/actions/dockerhub-login@main" + + - name: "Build and push" + timeout-minutes: "${{ fromJSON(env.BUILD_TIMEOUT) }}" + uses: "docker/build-push-action@v6" + with: + build-args: "IMAGE_TAG=${{ steps.prepare.outputs.release_version }}" + context: loki + file: "loki/cmd/logql-analyzer/Dockerfile" + platforms: "linux/amd64" + push: true + tags: "grafana/logql-analyzer:${{ steps.prepare.outputs.release_version }}" + + - name: Log in to Google Artifact Registry + uses: grafana/shared-workflows/actions/login-to-gar@main + with: + 
registry: "us-docker.pkg.dev" + environment: "prod" + + - name: Update to latest image + env: + GITHUB_TOKEN: ${{ steps.get-github-app-token.outputs.token }} + RELEASE_NAME: ${{ steps.prepare.outputs.release_name }} + RELEASE_VERSION: ${{ steps.prepare.outputs.release_version }} + run: | + set -e -o pipefail + + cat << EOF > config.json + { + "repo_name": "deployment_tools", + "destination_branch": "master", + "git_author_email": "119986603+updater-for-ci[bot]@users.noreply.github.com", + "git_author_name": "version_bumper[bot]", + "git_committer_email": "119986603+updater-for-ci[bot]@users.noreply.github.com", + "git_committer_name": "version_bumper[bot]", + "pull_request_branch_prefix": "logql-analyzer/updater", + "pull_request_enabled": true, + "pull_request_existing_strategy": "replace", + "pull_request_title_prefix": "[logql-analyzer updater] ", + "pull_request_message": "Add logql-analyzer version to ${RELEASE_VERSION} to supported versions", + "update_jsonnet_attribute_configs": [ + { + "file_path": "ksonnet/environments/logql-analyzer/supported-versions.libsonnet", + "jsonnet_key": "${RELEASE_NAME}", + "jsonnet_value": "grafana/logql-analyzer:${RELEASE_VERSION}-amd64", + "upsert": true + } + ] + } + EOF + + docker run --rm \ + -e GITHUB_TOKEN="$GITHUB_TOKEN" \ + -e CONFIG_JSON="$(cat config.json)" us-docker.pkg.dev/grafanalabs-global/docker-deployment-tools-prod/updater |& tee updater-output.log diff --git a/.github/workflows/scripts/common.sh b/.github/workflows/scripts/common.sh old mode 100644 new mode 100755 index b5cba118af71..e6d37b25c617 --- a/.github/workflows/scripts/common.sh +++ b/.github/workflows/scripts/common.sh @@ -22,24 +22,20 @@ get_yaml_node() { # Increments the part of the semver string # $1: version itself # $2: number of part: 0 – major, 1 – minor, 2 – patch +# shellcheck disable=SC2207,SC2046,SC2248,SC2250 increment_semver() { local delimiter=. - local array=("$(echo "$1" | tr "${delimiter}" '\n')") - array[$2]=$((array[$2] + 1)) - echo "$( - local IFS=${delimiter} - echo "${array[*]}" - )" + local array=($(echo "$1" | tr $delimiter '\n')) + array[$2]=$((array[$2]+1)) + echo $(local IFS=$delimiter ; echo "${array[*]}") } # Sets the patch segment of a semver to 0 # $1: version itself +# shellcheck disable=SC2207,SC2046,SC2248,SC2250 set_semver_patch_to_zero() { local delimiter=. 
- local array=("$(echo "$1" | tr "${delimiter}" '\n')") + local array=($(echo "$1" | tr $delimiter '\n')) array[2]="0" - echo "$( - local IFS=${delimiter} - echo "${array[*]}" - )" + echo $(local IFS=$delimiter ; echo "${array[*]}") } diff --git a/.github/workflows/scripts/helm-tagged-release.sh b/.github/workflows/scripts/helm-tagged-release.sh index fd6c06f520d0..4a0d90eec59a 100755 --- a/.github/workflows/scripts/helm-tagged-release.sh +++ b/.github/workflows/scripts/helm-tagged-release.sh @@ -51,4 +51,5 @@ sed --in-place \ make TTY='' helm-docs -echo "::set-output name=new_chart_version::${new_chart_version}" +# shellcheck disable=SC2154,SC2250 +echo "new_chart_version=${new_chart_version}" >> "$GITHUB_OUTPUT" diff --git a/.github/workflows/scripts/helm-weekly-release.sh b/.github/workflows/scripts/helm-weekly-release.sh index 64d5f29f4557..6b3d6043b604 100755 --- a/.github/workflows/scripts/helm-weekly-release.sh +++ b/.github/workflows/scripts/helm-weekly-release.sh @@ -11,7 +11,11 @@ source "${script_dir}/common.sh" find_latest_image_tag() { local docker_hub_repo=$1 local regExp="^(k|weekly-k)\d+-[a-z0-9]+" - crane ls "${docker_hub_repo}" | grep -P "${regExp}" | sed -E "s/([weekly-]*k[[:digit:]]*)-([^-]*).*/\1-\2/g" | uniq | sort -Vur | head -1 + local crane_results + crane_results="$(crane ls "${docker_hub_repo}" | grep -P "${regExp}" | sed -E "s/([weekly-]*k[[:digit:]]*)-([^-]*).*/\1-\2/g" | sort -Vur)" + set +o pipefail + echo "${crane_results}" | head -1 + set -o pipefail } # takes k197-abcdef and returns r197, k197-abcdef-arm64 and returns k197, weekly-k197-abcdef and returns k197 @@ -22,6 +26,7 @@ extract_k_version() { calculate_next_chart_version() { local current_chart_version=$1 local latest_image_tag=$2 + local k_release=$3 local current_chart_semver current_chart_semver=$(echo "${current_chart_version}" | grep -P -o '^(\d+.){2}\d+') @@ -35,7 +40,12 @@ calculate_next_chart_version() { # Also reset the patch release number to 0. 
new_chart_semver=$(set_semver_patch_to_zero "${new_chart_semver}") fi - echo "${new_chart_semver}-weekly.${new_chart_weekly}" + + if ${k_release}; then + echo "${new_chart_semver}-weekly.${new_chart_weekly}" + else + echo "${new_chart_semver}" + fi } validate_version_update() { @@ -60,25 +70,45 @@ validate_version_update() { fi } +k_release=false +if [[ "$1" == "-k" ]]; then + k_release=true + shift +fi + values_file=production/helm/loki/values.yaml chart_file=production/helm/loki/Chart.yaml latest_loki_tag=$(find_latest_image_tag grafana/loki) latest_gel_tag=$(find_latest_image_tag grafana/enterprise-logs) current_chart_version=$(get_yaml_node "${chart_file}" .version) -new_chart_version=$(calculate_next_chart_version "${current_chart_version}" "${latest_loki_tag}") +new_chart_version=$(calculate_next_chart_version "${current_chart_version}" "${latest_loki_tag}" "${k_release}") validate_version_update "${new_chart_version}" "${current_chart_version}" "${latest_gel_tag}" "${latest_loki_tag}" -update_yaml_node "${values_file}" .loki.image.tag "${latest_loki_tag}" -update_yaml_node "${values_file}" .enterprise.image.tag "${latest_gel_tag}" -update_yaml_node "${chart_file}" .appVersion "$(extract_k_version "${latest_loki_tag}")" +if ${k_release}; then + update_yaml_node "${values_file}" .loki.image.tag "${latest_loki_tag}" + update_yaml_node "${values_file}" .enterprise.image.tag "${latest_gel_tag}" + update_yaml_node "${chart_file}" .appVersion "$(extract_k_version "${latest_loki_tag}")" +fi + update_yaml_node "${chart_file}" .version "${new_chart_version}" -sed --in-place \ - --regexp-extended \ - "s/(.*\.*)/\1\n\n## ${new_chart_version}\n\n- \[CHANGE\] Changed version of Grafana Loki to ${latest_loki_tag}\n- \[CHANGE\] Changed version of Grafana Enterprise Logs to ${latest_gel_tag}/g" production/helm/loki/CHANGELOG.md +if ${k_release}; then + sed --in-place \ + --regexp-extended \ + "s/(.*\.*)/\1\n\n## ${new_chart_version}\n\n- \[CHANGE\] Changed version of Grafana Loki to ${latest_loki_tag}\n- \[CHANGE\] Changed version of Grafana Enterprise Logs to ${latest_gel_tag}/g" production/helm/loki/CHANGELOG.md +else + sed --in-place \ + --regexp-extended \ + "s/(.*\.*)/\1\n\n## ${new_chart_version}/g" production/helm/loki/CHANGELOG.md +fi make TTY='' helm-docs -echo "::set-output name=new_chart_version::${new_chart_version}" +# shellcheck disable=SC2154,SC2250 +echo "new_chart_version=${new_chart_version}" >> "$GITHUB_OUTPUT" +if ${k_release}; then + # shellcheck disable=SC2154,SC2250 + echo "weekly=$(extract_k_version "${latest_loki_tag}")" >> "$GITHUB_OUTPUT" +fi diff --git a/docs/Makefile b/docs/Makefile index 4bed302d7179..ec038bd7a056 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -17,7 +17,12 @@ ifeq ($(BUILD_IN_CONTAINER),true) -c /helm-docs/production/helm/ \ -t reference.md.gotmpl \ -o reference.md + $(PODMAN) run --rm --volume "$(realpath ..):/helm-docs" -u "$$(id -u)" "docker.io/jnorwood/helm-docs:v1.11.0" \ + -c /helm-docs/production/helm/ \ + -t README.md.gotmpl \ + -o README.md else helm-docs -c ../production/helm/ -t reference.md.gotmpl -o reference.md + helm-docs -c ../production/helm/ -t README.md.gotmpl -o README.md endif mv "$(basename $<)" "$@" diff --git a/docs/sources/send-data/fluentbit/_index.md b/docs/sources/send-data/fluentbit/_index.md index ea2af6a4ac4b..5a6884efd5db 100644 --- a/docs/sources/send-data/fluentbit/_index.md +++ b/docs/sources/send-data/fluentbit/_index.md @@ -1,282 +1,24 @@ --- -title: Fluent Bit client +title: Fluent Bit menuTitle: Fluent Bit 
description: Provides instructions for how to install, configure, and use the Fluent Bit client to send logs to Loki. aliases: - ../clients/fluentbit/ weight: 500 --- -# Fluent Bit client +# Fluent Bit -[Fluent Bit](https://fluentbit.io/) is a fast and lightweight logs and metrics processor and forwarder that can be configured with the Grafana Fluent Bit Plugin described here or with the [Fluent-bit Loki output plugin](https://docs.fluentbit.io/manual/pipeline/outputs/loki) to ship logs to Loki. -This plugin has more configuration options compared to the built-in Fluent Bit Loki plugin. -You can define which log files you want to collect using the [`Tail`](https://docs.fluentbit.io/manual/pipeline/inputs/tail) or [`Stdin`](https://docs.fluentbit.io/manual/pipeline/inputs/standard-input) data pipeline input. Additionally, Fluent Bit supports multiple `Filter` and `Parser` plugins (`Kubernetes`, `JSON`, etc.) to structure and alter log lines. +[Fluent Bit](https://fluentbit.io/) is a fast, lightweight logs and metrics agent. It is a CNCF graduated sub-project under the umbrella of Fluentd. Fluent Bit is licensed under the terms of the Apache License v2.0. -{{< youtube id="s43IBSVyTpQ" >}} +When using Fluent Bit to ship logs to Loki, you can define which log files you want to collect using the [`Tail`](https://docs.fluentbit.io/manual/pipeline/inputs/tail) or [`Stdin`](https://docs.fluentbit.io/manual/pipeline/inputs/standard-input) data pipeline input. Additionally, Fluent Bit supports multiple `Filter` and `Parser` plugins (`Kubernetes`, `JSON`, etc.) to structure and alter log lines. -## Usage +There are two Fluent Bit plugins for Loki: -### Docker +1. The integrated `loki` [plugin](https://grafana.com/docs/loki//send-data/fluentbit/fluent-bit-plugin/), which is officially maintained by the Fluent Bit project. +2. The `grafana-loki` [plugin](https://grafana.com/docs/loki//send-data/fluentbit/community-plugin/), an alternative community plugin by Grafana Labs. -You can run a Fluent Bit container with Loki output plugin pre-installed using our [Docker Hub](https://hub.docker.com/r/grafana/fluent-bit-plugin-loki) image: +We recommend using the `loki` plugin as this provides the most complete feature set and is actively maintained by the Fluent Bit project. -```bash -docker run -v /var/log:/var/log \ - -e LOG_PATH="/var/log/*.log" -e LOKI_URL="http://localhost:3100/loki/api/v1/push" \ - grafana/fluent-bit-plugin-loki:latest -``` +## Tutorial -Or, an alternative is to run the fluent-bit container using [Docker Hub](https://hub.docker.com/r/fluent/fluent-bit) image: - -### Docker Container Logs - -To ship logs from Docker containers to Grafana Cloud using Fluent Bit, you can use the Fluent Bit Docker image and configure it to forward logs directly to Grafana Cloud's Loki. Below is a step-by-step guide on setting up Fluent Bit for this purpose. - -#### Prerequisites - -- Docker is installed on your machine. -- You have a Grafana Cloud account with access to Loki. - -#### Configuration - -1. 
Create a Fluent Bit configuration file named `fluent-bit.conf` with the following content, which defines the input from Docker container logs and sets up the output to send logs to your Grafana Cloud Loki instance: - - ```ini - [SERVICE] - Flush 1 - Log_Level info - - [INPUT] - Name tail - Path /var/lib/docker/containers/*/*.log - Parser docker - Tag docker.* - - [OUTPUT] - Name loki - Match * - Host logs-prod-006.grafana.net - Port 443 - TLS On - TLS.Verify On - HTTP_User 478625 - HTTP_Passwd YOUR_GRAFANA_CLOUD_API_KEY - Labels job=fluentbit - -### Kubernetes - -You can run Fluent Bit as a [Daemonset](https://kubernetes.io/docs/concepts/workloads/controllers/daemonset/) to collect all your Kubernetes workload logs. - -To do so you can use the [Fluent Bit helm chart](https://github.com/fluent/helm-charts) with the following `values.yaml` changing the value of `FLUENT_LOKI_URL`: - -```yaml -image: - # Here we use the Docker image which has the plugin installed - repository: grafana/fluent-bit-plugin-loki - tag: main-e2ed1c0 - -args: - - "-e" - - "/fluent-bit/bin/out_grafana_loki.so" - - --workdir=/fluent-bit/etc - - --config=/fluent-bit/etc/conf/fluent-bit.conf - -env: - # Note that for security reasons you should fetch the credentials through a Kubernetes Secret https://kubernetes.io/docs/concepts/configuration/secret/ . You may use the envFrom for this. - - name: FLUENT_LOKI_URL - value: https://user:pass@your-loki.endpoint/loki/api/v1/push - -config: - inputs: | - [INPUT] - Name tail - Tag kube.* - Path /var/log/containers/*.log - # Be aware that local clusters like docker-desktop or kind use the docker log format and not the cri (https://docs.fluentbit.io/manual/installation/kubernetes#container-runtime-interface-cri-parser) - multiline.parser docker, cri - Mem_Buf_Limit 5MB - Skip_Long_Lines On - - outputs: | - [Output] - Name grafana-loki - Match kube.* - Url ${FLUENT_LOKI_URL} - Labels {job="fluent-bit"} - LabelKeys level,app # this sets the values for actual Loki streams and the other labels are converted to structured_metadata https://grafana.com/docs/loki//get-started/labels/structured-metadata/ - BatchWait 1 - BatchSize 1001024 - LineFormat json - LogLevel info - AutoKubernetesLabels true -``` - -```bash -helm repo add fluent https://fluent.github.io/helm-charts -helm repo update -helm install fluent-bit fluent/fluent-bit -f values.yaml -``` - -By default it will collect all containers logs and extract labels from Kubernetes API (`container_name`, `namespace`, etc..). - -If you also want to host your Loki instance inside the cluster install the [official Loki helm chart](https://grafana.com/docs/loki//setup/install/helm/). - -### AWS Elastic Container Service (ECS) - -You can use fluent-bit Loki Docker image as a Firelens log router in AWS ECS. -For more information about this see our [AWS documentation]({{< relref "../promtail/cloud/ecs" >}}) - -### Local - -First, you need to follow the [instructions](https://github.com/grafana/loki/blob/main/clients/cmd/fluent-bit/README.md) in order to build the plugin dynamic library. 
- -Assuming you have Fluent Bit installed in your `$PATH`, you can run the plugin using: - -```bash -fluent-bit -e /path/to/built/out_grafana_loki.so -c fluent-bit.conf -``` - -You can also adapt your plugins.conf, removing the need to change the command line options: - -```conf -[PLUGINS] - Path /path/to/built/out_grafana_loki.so -``` - -## Configuration Options - -| Key | Description | Default | -|----------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|----------------------------------------| -| Url | Url of Loki server API endpoint. | http://localhost:3100/loki/api/v1/push | -| TenantID | The tenant ID used by default to push logs to Loki. If omitted or empty it assumes Loki is running in single-tenant mode and no `X-Scope-OrgID` header is sent. | "" | -| BatchWait | Time to wait before send a log batch to Loki, full or not. | 1s | -| BatchSize | Log batch size to send a log batch to Loki (unit: Bytes). | 10 KiB (10 * 1024 Bytes) | -| Timeout | Maximum time to wait for Loki server to respond to a request. | 10s | -| MinBackoff | Initial backoff time between retries. | 500ms | -| MaxBackoff | Maximum backoff time between retries. | 5m | -| MaxRetries | Maximum number of retries when sending batches. Setting it to `0` will retry indefinitely. | 10 | -| Labels | labels for API requests. | {job="fluent-bit"} | -| LogLevel | LogLevel for plugin logger. | "info" | -| RemoveKeys | Specify removing keys. | none | -| AutoKubernetesLabels | If set to true, it will add all Kubernetes labels to Loki labels | false | -| LabelKeys | Comma separated list of keys to use as stream labels. All other keys will be placed into the log line. LabelKeys is deactivated when using `LabelMapPath` label mapping configuration. | none | -| LineFormat | Format to use when flattening the record to a log line. Valid values are "json" or "key_value". If set to "json" the log line sent to Loki will be the fluentd record (excluding any keys extracted out as labels) dumped as json. If set to "key_value", the log line will be each item in the record concatenated together (separated by a single space) in the format =. | json | -| DropSingleKey | If set to true and after extracting label_keys a record only has a single key remaining, the log line sent to Loki will just be the value of the record key. | true | -| LabelMapPath | Path to a json file defining how to transform nested records. | none | -| Buffer | Enable buffering mechanism | false | -| BufferType | Specify the buffering mechanism to use (currently only dque is implemented). | dque | -| DqueDir | Path to the directory for queued logs | /tmp/flb-storage/loki | -| DqueSegmentSize | Segment size in terms of number of records per segment | 500 | -| DqueSync | Whether to fsync each queue change. Specify no fsync with "normal", and fsync with "full". | "normal" | -| DqueName | Queue name, must be uniq per output | dque | - -### Labels - -Labels are used to [query logs]({{< relref "../../query" >}}) `{container_name="nginx", cluster="us-west1"}`, they are usually metadata about the workload producing the log stream (`instance`, `container_name`, `region`, `cluster`, `level`). 
In Loki labels are indexed consequently you should be cautious when choosing them (high cardinality label values can have performance drastic impact). - -You can use `Labels`, `RemoveKeys` , `LabelKeys` and `LabelMapPath` to how the output plugin will perform labels extraction. - -### AutoKubernetesLabels - -If set to true, it will add all Kubernetes labels to Loki labels automatically and ignore parameters `LabelKeys`, LabelMapPath. - -### LabelMapPath - -When using the `Parser` and `Filter` plugins Fluent Bit can extract and add data to the current record/log data. While Loki labels are key value pair, record data can be nested structures. -You can pass a JSON file that defines how to extract labels from each record. Each json key from the file will be matched with the log record to find label values. Values from the configuration are used as label names. - -Considering the record below : - -```json -{ - "kubernetes": { - "container_name": "promtail", - "pod_name": "promtail-xxx", - "namespace_name": "prod", - "labels" : { - "team": "x-men" - } - }, - "HOSTNAME": "docker-desktop", - "log" : "a log line", - "time": "20190926T152206Z" -} -``` - -and a LabelMap file as follow : - -```json -{ - "kubernetes": { - "container_name": "container", - "pod_name": "pod", - "namespace_name": "namespace", - "labels" : { - "team": "team" - } - } -} -``` - -The labels extracted will be `{team="x-men", container="promtail", pod="promtail-xxx", namespace="prod"}`. - -If you don't want the `kubernetes` and `HOSTNAME` fields to appear in the log line you can use the `RemoveKeys` configuration field. (e.g. `RemoveKeys kubernetes,HOSTNAME`). - -### Buffering - -Buffering refers to the ability to store the records somewhere, and while they are processed and delivered, still be able to store more. The Loki output plugin can be blocked by the Loki client because of its design: - -- If the BatchSize is over the limit, the output plugin pauses receiving new records until the pending batch is successfully sent to the server -- If the Loki server is unreachable (retry 429s, 500s and connection-level errors), the output plugin blocks new records until the Loki server is available again, and the pending batch is successfully sent to the server or as long as the maximum number of attempts has been reached within configured back-off mechanism - -The blocking state with some of the input plugins is not acceptable, because it can have an undesirable side effect on the part that generates the logs. Fluent Bit implements a buffering mechanism that is based on parallel processing. Therefore, it cannot send logs in order. There are two ways of handling the out-of-order logs: - -- Configure Loki to [accept out-of-order writes](https://grafana.com/docs/loki//configure/#accept-out-of-order-writes). 
- -- Configure the Loki output plugin to use the buffering mechanism based on [`dque`](https://github.com/joncrlsn/dque), which is compatible with the Loki server strict time ordering: - - ```properties - [Output] - Name grafana-loki - Match * - Url http://localhost:3100/loki/api/v1/push - Buffer true - DqueSegmentSize 8096 - DqueDir /tmp/flb-storage/buffer - DqueName loki.0 - ``` - -### Configuration examples - -To configure the Loki output plugin add this section to fluent-bit.conf - -```properties -[Output] - Name grafana-loki - Match * - Url http://localhost:3100/loki/api/v1/push - BatchWait 1s - BatchSize 30720 - # (30KiB) - Labels {test="fluent-bit-go", lang="Golang"} - RemoveKeys key1,key2 - LabelKeys key3,key4 - LineFormat key_value -``` - -```properties -[Output] - Name grafana-loki - Match * - Url http://localhost:3100/loki/api/v1/push - BatchWait 1s - BatchSize 30720 # (30KiB) - AutoKubernetesLabels true - RemoveKeys key1,key2 -``` - -A full [example configuration file](https://github.com/grafana/loki/blob/main/clients/cmd/fluent-bit/fluent-bit.conf) is also available in the Loki repository. - -### Running multiple plugin instances - -You can run multiple plugin instances in the same fluent-bit process, for example if you want to push to different Loki servers or route logs into different Loki tenant IDs. To do so, add additional `[Output]` sections. +To get started with the `loki` plugin, follow the [Sending logs to Loki using Fluent Bit tutorial](https://grafana.com/docs/loki//send-data/fluentbit/fluent-bit-loki-tutorial/). diff --git a/docs/sources/send-data/fluentbit/community-plugin.md b/docs/sources/send-data/fluentbit/community-plugin.md new file mode 100644 index 000000000000..60dd5fef74a3 --- /dev/null +++ b/docs/sources/send-data/fluentbit/community-plugin.md @@ -0,0 +1,281 @@ +--- +title: Fluent Bit community plugin +menuTitle: Fluent Bit Community Plugin +description: Provides instructions for how to install, configure, and use the Fluent Bit Community plugin to send logs to Loki. +aliases: +- ../clients/fluentbit/ +weight: 500 +--- +# Fluent Bit community plugin + +{{< admonition type="warning" >}} + +We recommend using the official [Fluent Bit Loki plugin](https://grafana.com/docs/loki//send-data/fluentbit/fluent-bit-plugin/). The official plugin is more feature-rich and has better support for features such as structured metadata. The community plugin is still available for use, but it's no longer actively maintained. + +{{< /admonition >}} + +The Fluent Bit community plugin by Grafana Labs (`grafana-loki`) provided an alternative way to send logs to Loki. Although very similar to the [official plugin](https://grafana.com/docs/loki//send-data/fluentbit/fluent-bit-plugin/) there are some differences in the configuration options. This page provides instructions for how to install, configure, and use the Fluent Bit community plugin to send logs to Loki. Although the plugin is no longer actively maintained, this documentation is still available for reference. 
+ +{{< youtube id="s43IBSVyTpQ" >}} + +## Usage + +### Docker + +You can run a Fluent Bit container with Loki output plugin pre-installed using our [Docker Hub](https://hub.docker.com/r/grafana/fluent-bit-plugin-loki) image: + +```bash +docker run -v /var/log:/var/log \ + -e LOG_PATH="/var/log/*.log" -e LOKI_URL="http://localhost:3100/loki/api/v1/push" \ + grafana/fluent-bit-plugin-loki:latest +``` + +Or, an alternative is to run the fluent-bit container using [Docker Hub](https://hub.docker.com/r/fluent/fluent-bit) image: + +### Docker container logs + +To ship logs from Docker containers to Grafana Cloud using Fluent Bit, you can use the Fluent Bit Docker image and configure it to forward logs directly to Grafana Loki. Below is a step-by-step guide on setting up Fluent Bit for this purpose. + +#### Prerequisites + +- Docker is installed on your machine. +- Running instance of Loki OSS. + +#### Configuration + +1. Create a Fluent Bit configuration file named `fluent-bit.conf` with the following content, which defines the input from Docker container logs and sets up the output to send logs to your Grafana Cloud Loki instance: + + ```ini + [SERVICE] + Flush 1 + Log_Level info + + [INPUT] + Name tail + Path /var/lib/docker/containers/*/*.log + Parser docker + Tag docker.* + + [OUTPUT] + Name grafana-loki + Match * + Url http://localhost:3100/loki/api/v1/push + Labels {job="fluentbit"} + +### Kubernetes + +You can run Fluent Bit as a [daemonset](https://kubernetes.io/docs/concepts/workloads/controllers/daemonset/) to collect all your Kubernetes workload logs. + +To do so you can use the [Fluent Bit Helm chart](https://github.com/fluent/helm-charts) with the following `values.yaml` changing the value of `FLUENT_LOKI_URL`: + +```yaml +image: + # Here we use the Docker image which has the plugin installed + repository: grafana/fluent-bit-plugin-loki + tag: main-e2ed1c0 + +args: + - "-e" + - "/fluent-bit/bin/out_grafana_loki.so" + - --workdir=/fluent-bit/etc + - --config=/fluent-bit/etc/conf/fluent-bit.conf + +env: + # Note that for security reasons you should fetch the credentials through a Kubernetes Secret https://kubernetes.io/docs/concepts/configuration/secret/ . You may use the envFrom for this. + - name: FLUENT_LOKI_URL + value: https://user:pass@your-loki.endpoint/loki/api/v1/push + +config: + inputs: | + [INPUT] + Name tail + Tag kube.* + Path /var/log/containers/*.log + # Be aware that local clusters like docker-desktop or kind use the docker log format and not the cri (https://docs.fluentbit.io/manual/installation/kubernetes#container-runtime-interface-cri-parser) + multiline.parser docker, cri + Mem_Buf_Limit 5MB + Skip_Long_Lines On + + outputs: | + [Output] + Name grafana-loki + Match kube.* + Url ${FLUENT_LOKI_URL} + Labels {job="fluent-bit"} + LabelKeys level,app # this sets the values for actual Loki streams and the other labels are converted to structured_metadata https://grafana.com/docs/loki//get-started/labels/structured-metadata/ + BatchWait 1 + BatchSize 1001024 + LineFormat json + LogLevel info + AutoKubernetesLabels true +``` + +```bash +helm repo add fluent https://fluent.github.io/helm-charts +helm repo update +helm install fluent-bit fluent/fluent-bit -f values.yaml +``` + +By default it will collect all containers logs and extract labels from Kubernetes API (`container_name`, `namespace`, etc.). + +If you also want to host your Loki instance inside the cluster install the [official Loki Helm chart](https://grafana.com/docs/loki//setup/install/helm/). 
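
If you don't already run Loki in the cluster, here is a minimal sketch of installing it with the chart linked above (the release name `loki` and the namespace are illustrative; a real deployment needs values chosen per the linked Helm installation guide):

```bash
# Add the Grafana Helm repository and install the Loki chart into its own namespace.
helm repo add grafana https://grafana.github.io/helm-charts
helm repo update
helm install loki grafana/loki --namespace loki --create-namespace
```

The Fluent Bit output `Url` then points at the in-cluster service, for example `http://loki-gateway.loki.svc.cluster.local/loki/api/v1/push` (the exact service name depends on the chart's deployment mode).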
+ +### AWS Elastic Container Service (ECS) + +You can use the fluent-bit Loki Docker image as a Firelens log router in AWS ECS. +For more information about this see our [AWS documentation](https://grafana.com/docs/loki//send-data/promtail/cloud/ecs/). + +### Local + +First, you need to follow the [instructions](https://github.com/grafana/loki/blob/main/clients/cmd/fluent-bit/README.md) in order to build the plugin dynamic library. + +Assuming you have Fluent Bit installed in your `$PATH` you can run the plugin using: + +```bash +fluent-bit -e /path/to/built/out_grafana_loki.so -c fluent-bit.conf +``` + +You can also adapt your plugins.conf, removing the need to change the command line options: + +```conf +[PLUGINS] + Path /path/to/built/out_grafana_loki.so +``` + +## Configuration options + +| Key | Description | Default | +|----------------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|----------------------------------------| +| Url | Url of Loki server API endpoint. | http://localhost:3100/loki/api/v1/push | +| TenantID | The tenant ID used by default to push logs to Loki. If omitted or empty it assumes Loki is running in single-tenant mode and no `X-Scope-OrgID` header is sent. | "" | +| BatchWait | Time to wait before send a log batch to Loki, full or not. | 1s | +| BatchSize | Log batch size to send a log batch to Loki (unit: Bytes). | 10 KiB (10 * 1024 Bytes) | +| Timeout | Maximum time to wait for Loki server to respond to a request. | 10s | +| MinBackoff | Initial backoff time between retries. | 500ms | +| MaxBackoff | Maximum backoff time between retries. | 5m | +| MaxRetries | Maximum number of retries when sending batches. Setting it to `0` will retry indefinitely. | 10 | +| Labels | Labels for API requests. | {job="fluent-bit"} | +| LogLevel | LogLevel for plugin logger. | `info` | +| RemoveKeys | Specify removing keys. | none | +| AutoKubernetesLabels | If set to `true`, it will add all Kubernetes labels to Loki labels. | false | +| LabelKeys | Comma separated list of keys to use as stream labels. All other keys will be placed into the log line. LabelKeys is deactivated when using `LabelMapPath` label mapping configuration. | none | +| LineFormat | Format to use when flattening the record to a log line. Valid values are `json` or `key_value`. If set to `json` the log line sent to Loki will be the fluentd record (excluding any keys extracted out as labels) dumped as json. If set to `key_value`, the log line will be each item in the record concatenated together (separated by a single space) in the format <key>=<value>. | json | +| DropSingleKey | If set to true and after extracting label_keys a record only has a single key remaining, the log line sent to Loki will just be the value of the record key. | true | +| LabelMapPath | Path to a json file defining how to transform nested records. | none | +| Buffer | Enable buffering mechanism. | false | +| BufferType | Specify the buffering mechanism to use (currently only `dque` is implemented). | dque | +| DqueDir | Path to the directory for queued logs. | /tmp/flb-storage/loki | +| DqueSegmentSize | Segment size in terms of number of records per segment. 
| 500 | +| DqueSync | Whether to fsync each queue change. Specify no fsync with `normal`, and fsync with `full`. | `normal` | +| DqueName | Queue name, must be unique per output. | dque | + +### Labels + +Labels, for example `{container_name="nginx", cluster="us-west1"}`, are used to [query logs](https://grafana.com/docs/loki//query/). Labels are usually metadata about the workload producing the log stream (`instance`, `container_name`, `region`, `cluster`, `level`). In Loki labels are indexed, so you should be cautious when choosing them. High cardinality label values can have drastic impact on query performance. + +You can use the config parameters `Labels`, `RemoveKeys` , `LabelKeys` and `LabelMapPath` to instruct the output plugin how to perform labels extraction from your log entries or to add static labels to all log entries. + +### AutoKubernetesLabels + +If set to `true`, `AutoKubernetesLabels` will add all Kubernetes labels to Loki labels automatically and ignore parameters `LabelKeys`, `LabelMapPath`. + +### LabelMapPath + +When using the `Parser` and `Filter` plugins Fluent Bit can extract and add data to the current record/log data. While Loki labels are key value pairs, record data can be nested structures. +You can pass a JSON file that defines how to extract labels from each record. Each JSON key from the file will be matched with the log record to find label values. Values from the configuration are used as label names. + +Considering the record below : + +```json +{ + "kubernetes": { + "container_name": "promtail", + "pod_name": "promtail-xxx", + "namespace_name": "prod", + "labels" : { + "team": "x-men" + } + }, + "HOSTNAME": "docker-desktop", + "log" : "a log line", + "time": "20190926T152206Z" +} +``` + +and a LabelMap file as follows : + +```json +{ + "kubernetes": { + "container_name": "container", + "pod_name": "pod", + "namespace_name": "namespace", + "labels" : { + "team": "team" + } + } +} +``` + +The labels extracted will be `{team="x-men", container="promtail", pod="promtail-xxx", namespace="prod"}`. + +If you don't want the `kubernetes` and `HOSTNAME` fields to appear in the log line you can use the `RemoveKeys` configuration field. For example, `RemoveKeys kubernetes,HOSTNAME`. + +### Buffering + +Buffering refers to the ability to store the records somewhere, and while they are processed and delivered, still be able to continue storing more records. The Loki output plugin can be blocked by the Loki client because of its design: + +- If the BatchSize is over the limit, the output plugin pauses receiving new records until the pending batch is successfully sent to the server +- If the Loki server is unreachable (retry 429s, 500s and connection-level errors), the output plugin blocks new records until the Loki server is available again, and the pending batch is successfully sent to the server or as long as the maximum number of attempts has been reached within configured back-off mechanism + +The blocking state with some of the input plugins is not acceptable, because it can have an undesirable side effect on the part that generates the logs. Fluent Bit implements a buffering mechanism that is based on parallel processing. Therefore, it cannot send logs in order. There are two ways of handling the out-of-order logs: + +- Configure Loki to [accept out-of-order writes](https://grafana.com/docs/loki//configure/#accept-out-of-order-writes). 
+ +- Configure the Loki output plugin to use the buffering mechanism based on [`dque`](https://github.com/joncrlsn/dque), which is compatible with the Loki server strict time ordering: + + ```properties + [Output] + Name grafana-loki + Match * + Url http://localhost:3100/loki/api/v1/push + Buffer true + DqueSegmentSize 8096 + DqueDir /tmp/flb-storage/buffer + DqueName loki.0 + ``` + +### Configuration examples + +To configure the Loki output plugin add this section to your luent-bit.conf file. + +```properties +[Output] + Name grafana-loki + Match * + Url http://localhost:3100/loki/api/v1/push + BatchWait 1s + BatchSize 30720 + # (30KiB) + Labels {test="fluent-bit-go", lang="Golang"} + RemoveKeys key1,key2 + LabelKeys key3,key4 + LineFormat key_value +``` + +```properties +[Output] + Name grafana-loki + Match * + Url http://localhost:3100/loki/api/v1/push + BatchWait 1s + BatchSize 30720 # (30KiB) + AutoKubernetesLabels true + RemoveKeys key1,key2 +``` + +A full [example configuration file](https://github.com/grafana/loki/blob/main/clients/cmd/fluent-bit/fluent-bit.conf) is also available in the Loki repository. + +### Running multiple plugin instances + +You can run multiple plugin instances in the same fluent-bit process, for example if you want to push to different Loki servers or route logs into different Loki tenant IDs. To do so, add additional `[Output]` sections. diff --git a/docs/sources/send-data/fluentbit/fluent-bit-loki-tutorial.md b/docs/sources/send-data/fluentbit/fluent-bit-loki-tutorial.md new file mode 100644 index 000000000000..67e2a583e044 --- /dev/null +++ b/docs/sources/send-data/fluentbit/fluent-bit-loki-tutorial.md @@ -0,0 +1,268 @@ +--- +title: Sending logs to Loki using Fluent Bit tutorial +menuTitle: Fluent Bit tutorial +description: Sending logs to Loki using Fluent Bit using the official Fluent Bit Loki output plugin. +weight: 250 +killercoda: + title: Sending logs to Loki using Fluent Bit tutorial + description: Sending logs to Loki using Fluent Bit using the official Fluent Bit Loki output plugin. + preprocessing: + substitutions: + - regexp: loki-fundamentals-fluent-bit-1 + replacement: loki-fundamentals_fluent-bit_1 + - regexp: docker compose + replacement: docker-compose + backend: + imageid: ubuntu +--- + + + +# Sending logs to Loki using Fluent Bit tutorial + +In this tutorial, you will learn how to send logs to Loki using Fluent Bit. Fluent Bit is a lightweight and fast log processor and forwarder that can collect, process, and deliver logs to various destinations. We will use the official Fluent Bit Loki output plugin to send logs to Loki. + + + + +## Dependencies + +Before you begin, ensure you have the following to run the demo: + +- Docker +- Docker Compose + +{{< admonition type="tip" >}} +Alternatively, you can try out this example in our interactive learning environment: [Sending logs to Loki using Fluent Bit tutorial](https://killercoda.com/grafana-labs/course/loki/fluentbit-loki-tutorial). + +It's a fully configured environment with all the dependencies already installed. + +![Interactive](/media/docs/loki/loki-ile.svg) + +Provide feedback, report bugs, and raise issues in the [Grafana Killercoda repository](https://github.com/grafana/killercoda). +{{< /admonition >}} + + + +## Scenario + +In this scenario, we have a microservices application called the Carnivorous Greenhouse. This application consists of the following services: + +- **User Service:** Manages user data and authentication for the application. 
Such as creating users and logging in. +- **Plant Service:** Manages the creation of new plants and updates other services when a new plant is created. +- **Simulation Service:** Generates sensor data for each plant. +- **Websocket Service:** Manages the websocket connections for the application. +- **Bug Service:** A service that when enabled, randomly causes services to fail and generate additional logs. +- **Main App:** The main application that ties all the services together. +- **Database:** A database that stores user and plant data. + +Each service has been instrumented with the Fluent Bit logging framework to generate logs. If you would like to learn more about how the Carnivorous Greenhouse application was instrumented with Fluent Bit, refer to the [Carnivorous Greenhouse repository](https://github.com/grafana/loki-fundamentals/blob/fluentbit-official/greenhouse/loggingfw.py). + + + + + +## Step 1: Environment setup + +In this step, we will set up our environment by cloning the repository that contains our demo application and spinning up our observability stack using Docker Compose. + +1. To get started, clone the repository that contains our demo application: + + ```bash + git clone -b fluentbit-official https://github.com/grafana/loki-fundamentals.git + ``` + +1. Next we will spin up our observability stack using Docker Compose: + + ```bash + docker compose -f loki-fundamentals/docker-compose.yml up -d + ``` + + This will spin up the following services: + ```console + ✔ Container loki-fundamentals-grafana-1 Started + ✔ Container loki-fundamentals-loki-1 Started + ✔ Container loki-fundamentals-fluent-bit-1 Started + ``` +Once we have finished configuring the Fluent Bit agent and sending logs to Loki, we will be able to view the logs in Grafana. To check if Grafana is up and running, navigate to the following URL: [http://localhost:3000](http://localhost:3000) + + + + +## Step 2: Configure Fluent Bit to send logs to Loki + +To configure Fluent Bit to receive logs from our application, we need to provide a configuration file. This configuration file will define the components and their relationships. We will build the entire observability pipeline within this configuration file. + +### Open your code editor and locate the `fluent-bit.conf` file + +Fluent Bit requires a configuration file to define the components and their relationships. The configuration file is written using Fluent Bit configuration syntax. We will build the entire observability pipeline within this configuration file. To start, we will open the `fluent-bit.conf` file in the code editor: + +{{< docs/ignore >}} +> Note: Killercoda has an inbuilt Code editor which can be accessed via the `Editor` tab. +1. Expand the `loki-fundamentals` directory in the file explorer of the `Editor` tab. +1. Locate the `fluent-bit.conf` file in the top level directory, `loki-fundamentals`. +1. Click on the `fluent-bit.conf` file to open it in the code editor. +{{< /docs/ignore >}} + + +1. Open the `loki-fundamentals` directory in a code editor of your choice. +1. Locate the `fluent-bit.conf` file in the `loki-fundamentals` directory (Top level directory). +1. Click on the `fluent-bit.conf` file to open it in the code editor. + + +You will copy all of the configuration snippets into the `fluent-bit.conf` file. + +### Receiving Fluent Bit protocal logs + +The first step is to configure Fluent Bit to receive logs from the Carnivorous Greenhouse application. 
Since the application is instrumented with Fluent Bit logging framework, it will send logs using the forward protocol (unique to Fluent Bit). We will use the `forward` input plugin to receive logs from the application. + +Now add the following configuration to the `fluent-bit.conf` file: +```conf +[INPUT] + Name forward + Listen 0.0.0.0 + Port 24224 +``` + +In this configuration: +- `Name`: The name of the input plugin. In this case, we are using the `forward` input plugin. +- `Listen`: The IP address to listen on. In this case, we are listening on all IP addresses. +- `Port`: The port to listen on. In this case, we are listening on port `24224`. + +For more information on the `forward` input plugin, see the [Fluent Bit Forward documentation](https://docs.fluentbit.io/manual/pipeline/inputs/forward). + + + +### Export logs to Loki using the official Loki output plugin + +Lastly, we will configure Fluent Bit to export logs to Loki using the official Loki output plugin. The Loki output plugin allows you to send logs or events to a Loki service. It supports data enrichment with Kubernetes labels, custom label keys, and structured metadata. + +Add the following configuration to the `fluent-bit.conf` file: +```conf +[OUTPUT] + name loki + match service.** + host loki + port 3100 + labels agent=fluent-bit + label_map_path /fluent-bit/etc/conf/logmap.json +``` + +In this configuration: +- `name`: The name of the output plugin. In this case, we are using the `loki` output plugin. +- `match`: The tag to match. In this case, we are matching all logs with the tag `service.**`. +- `host`: The hostname of the Loki service. In this case, we are using the hostname `loki`. +- `port`: The port of the Loki service. In this case, we are using port `3100`. +- `labels`: Additional labels to add to the logs. In this case, we are adding the label `agent=fluent-bit`. +- `label_map_path`: The path to the label map file. In this case, we are using the file `logmap.json`. + +For more information on the `loki` output plugin, see the [Fluent Bit Loki documentation](https://docs.fluentbit.io/manual/pipeline/outputs/loki). + +#### `logmap.json` file + +The `logmap.json` file is used to map the log fields to the Loki labels. In this tutorial we have pre-filled the `logmap.json` file with the following configuration: +```json +{ +"service": "service_name", +"instance_id": "instance_id" + } +``` +This configuration maps the `service` field to the Loki label `service_name` and the `instance_id` field to the Loki label `instance_id`. + + +### Reload the Fluent Bit configuration + +After adding the configuration to the `fluent-bit.conf` file, you will need to reload the Fluent Bit configuration. To reload the configuration, run the following command: + +```bash +docker restart loki-fundamentals-fluent-bit-1 +``` +To verify that the configuration has been loaded successfully, you can check the Fluent Bit logs by running the following command: + +```bash +docker logs loki-fundamentals-fluent-bit-1 +``` + +## Stuck? Need help? + +If you get stuck or need help creating the configuration, you can copy and replace the entire `config.alloy` using the completed configuration file: + +```bash +cp loki-fundamentals/completed/fluent-bit.conf loki-fundamentals/fluent-bit.conf +docker restart loki-fundamentals-fluent-bit-1 +``` + + + + + +## Step 3: Start the Carnivorous Greenhouse + +In this step, we will start the Carnivorous Greenhouse application. 
To start the application, run the following command: + +{{< admonition type="note" >}} +This docker-compose file relies on the `loki-fundamentals_loki` Docker network. If you have not started the observability stack, you will need to start it first. +{{< /admonition >}} + + +{{< docs/ignore >}} + +> Note: This docker-compose file relies on the `loki-fundamentals_loki` docker network. If you have not started the observability stack, you will need to start it first. + +{{< /docs/ignore >}} + +```bash +docker compose -f loki-fundamentals/greenhouse/docker-compose-micro.yml up -d --build +``` + +This will start the following services: +```bash + ✔ Container greenhouse-db-1 Started + ✔ Container greenhouse-websocket_service-1 Started + ✔ Container greenhouse-bug_service-1 Started + ✔ Container greenhouse-user_service-1 Started + ✔ Container greenhouse-plant_service-1 Started + ✔ Container greenhouse-simulation_service-1 Started + ✔ Container greenhouse-main_app-1 Started +``` + +Once started, you can access the Carnivorous Greenhouse application at [http://localhost:5005](http://localhost:5005). Generate some logs by interacting with the application in the following ways: + +1. Create a user. +1. Log in. +1. Create a few plants to monitor. +1. Enable bug mode to activate the bug service. This will cause services to fail and generate additional logs. + +Finally to view the logs in Loki, navigate to the Loki Logs Explore view in Grafana at [http://localhost:3000/a/grafana-lokiexplore-app/explore](http://localhost:3000/a/grafana-lokiexplore-app/explore). + + + + + + +## Summary + +In this tutorial, you learned how to send logs to Loki using Fluent Bit. You configured Fluent Bit to receive logs from the Carnivorous Greenhouse application and export logs to Loki using the official Loki output plugin. Where to next? + +{{< docs/ignore >}} + +### Back to Docs +Head back to where you started from to continue with the [Loki documentation](https://grafana.com/docs/loki/latest/send-data/alloy). + +{{< /docs/ignore >}} + + +## Further reading + +For more information on Fluent Bit, refer to the following resources: +- [Fluent Bit documentation](https://docs.fluentbit.io/manual/) +- [Other examples of Fluent Bit configurations](https://grafana.com/docs/loki/latest/send-data/fluentbit/) + +## Complete metrics, logs, traces, and profiling example + +If you would like to use a demo that includes Mimir, Loki, Tempo, and Grafana, you can use [Introduction to Metrics, Logs, Traces, and Profiling in Grafana](https://github.com/grafana/intro-to-mlt). `Intro-to-mltp` provides a self-contained environment for learning about Mimir, Loki, Tempo, and Grafana. + +The project includes detailed explanations of each component and annotated configurations for a single-instance deployment. Data from `intro-to-mltp` can also be pushed to Grafana Cloud. + + + diff --git a/docs/sources/send-data/fluentbit/fluent-bit-plugin.md b/docs/sources/send-data/fluentbit/fluent-bit-plugin.md new file mode 100644 index 000000000000..7d0cca739370 --- /dev/null +++ b/docs/sources/send-data/fluentbit/fluent-bit-plugin.md @@ -0,0 +1,148 @@ +--- +title: Fluent Bit Loki output plugin +menuTitle: Fluent Bit +description: Provides instructions for how to install, configure, and use the Fluent Bit client to send logs to Loki. 
+aliases: +- ../clients/fluentbit/ +weight: 500 +--- +# Fluent Bit Loki output plugin + +[Fluent Bit](https://fluentbit.io/) is a fast and lightweight logs and metrics processor and forwarder that can be configured with the [Fluent-bit Loki output plugin](https://docs.fluentbit.io/manual/pipeline/outputs/loki) to ship logs to Loki. + +You can define which log files you want to collect using the [`Tail`](https://docs.fluentbit.io/manual/pipeline/inputs/tail) or [`Stdin`](https://docs.fluentbit.io/manual/pipeline/inputs/standard-input) data pipeline input. Additionally, Fluent Bit supports multiple `Filter` and `Parser` plugins (`Kubernetes`, `JSON`, etc.) to structure and alter log lines. + +{{< admonition type="note" >}} +There are two Fluent Bit plugins for Loki: the officially maintained plugin `loki` and the `grafana-loki` plugin. We recommend using the `loki` plugin described within this page as it's officially maintained by the Fluent Bit project. + +For more information, see the [Fluent Bit Loki output plugin documentation](https://docs.fluentbit.io/manual/pipeline/outputs/loki). Note that the `grafana-loki` plugin is no longer actively maintained. +{{< /admonition >}} + +## Configuration + +All configuration options for the Fluent Bit Loki output plugin are documented in the [Fluent Bit Loki output plugin documentation](https://docs.fluentbit.io/manual/pipeline/outputs/loki#configuration-parameters). + +Here is a generic example for connecting Fluent Bit to Loki hosted on Grafana Cloud: + +```conf + [OUTPUT] + Name loki + Match * + Host YourHostname.company.com + port 443 + tls on + tls.verify on + http_user XXX + http_passwd XXX +``` + +Replace `Host`, `http_user`, and `http_passwd` with your Grafana Cloud Loki endpoint and credentials. + + +## Usage examples + +Here are some examples of how to use Fluent Bit to send logs to Loki. + +### Tail Docker logs + +Here is an example to run Fluent Bit in a Docker container, collect Docker logs, and send them to a local Loki instance. + +```bash +docker run -v /var/lib/docker/containers:/var/lib/docker/containers fluent/fluent-bit:latest /fluent-bit/bin/fluent-bit -i tail -p Path="/var/lib/docker/containers/*/*.log" -p Parser=docker -p Tag="docker.*" -o loki -p host=loki -p port=3100 -p labels="agent=fluend-bit,env=docker" +``` + +In this example, we are using the `tail` input plugin to collect Docker logs and the `loki` output plugin to send logs to Loki. Note it is recommended to use a configuration file to define the input and output plugins. The `-p` flag is used to pass configuration parameters to the plugins. + +#### Configuration file (Alternative to command line arguments) + +Create a configuration file `fluent-bit.conf` with the following content: + +```conf +[INPUT] + Name tail + Path /var/lib/docker/containers/*/*.log + Parser docker + Tag docker.* + +[OUTPUT] + Name loki + Match * + Host loki + Port 3100 + Labels agent=fluend-bit,env=docker +``` + +Run Fluent Bit with the configuration file: + +```bash +docker run -v /var/lib/docker/containers:/var/lib/docker/containers -v $(pwd)/fluent-bit.conf:/fluent-bit/etc/fluent-bit.conf fluent/fluent-bit:latest /fluent-bit/bin/fluent-bit -c /fluent-bit/etc/fluent-bit.conf +``` + +### Collect Docker events + +Here is an example to run Fluent Bit in a Docker container, collect docker events, and send them to a local Loki instance. 
+
+### Collect Docker events
+
+Here is an example to run Fluent Bit in a Docker container, collect Docker events, and send them to a local Loki instance.
+
+```bash
+docker run -v /var/run/docker.sock:/var/run/docker.sock fluent/fluent-bit:latest /fluent-bit/bin/fluent-bit -i docker_events -o loki -p host=loki -p port=3100 -p labels="agent=fluent-bit,env=docker"
+```
+
+In this example, we are using the `docker_events` input plugin to collect Docker events and the `loki` output plugin to send logs to Loki. The `-p` flag is used to pass configuration parameters to the plugins. Note that we recommend using a configuration file to define the input and output plugins instead of command line arguments.
+
+#### Configuration file (Alternative to command line arguments)
+
+Create a configuration file `fluent-bit.conf` with the following content:
+
+```conf
+[INPUT]
+    Name docker_events
+
+[OUTPUT]
+    Name loki
+    Match *
+    Host loki
+    Port 3100
+    Labels agent=fluent-bit,env=docker
+```
+
+Run Fluent Bit with the configuration file:
+
+```bash
+docker run -v /var/run/docker.sock:/var/run/docker.sock -v $(pwd)/fluent-bit.conf:/fluent-bit/etc/fluent-bit.conf fluent/fluent-bit:latest /fluent-bit/bin/fluent-bit -c /fluent-bit/etc/fluent-bit.conf
+```
+
+### Collect Kubernetes logs
+
+The recommended way to collect logs from Kubernetes with Fluent Bit is to use the Helm chart provided by the Fluent Bit project. The Helm chart is available at [https://github.com/fluent/helm-charts](https://github.com/fluent/helm-charts).
+
+Here is an example of how to deploy the Fluent Bit Helm chart to collect logs from Kubernetes and send them to Loki:
+
+1. Add the Fluent Bit Helm repository:
+
+   ```bash
+   helm repo add fluent https://fluent.github.io/helm-charts
+   ```
+
+1. Create a `values.yaml` file with the following content:
+
+   ```yaml
+   config:
+     outputs: |
+       [OUTPUT]
+           Name loki
+           Match *
+           Host YourHost.Company.net
+           port 443
+           tls on
+           tls.verify on
+           http_user XXX
+           http_passwd XXX
+           Labels agent=fluent-bit
+   ```
+
+   Note that we are only updating the `outputs` section of the Fluent Bit configuration. This replaces the default output plugin with the Loki output plugin. If you need to update other parts of the Fluent Bit configuration, refer to the [Fluent Bit values file reference](https://github.com/fluent/helm-charts/blob/main/charts/fluent-bit/values.yaml).
+
+1. Deploy the Fluent Bit Helm chart:
+
+   ```bash
+   helm install fluent-bit fluent/fluent-bit -f values.yaml
+   ```
+
+## Next steps
+
+- [Sending logs to Loki using Fluent Bit tutorial](https://grafana.com/docs/loki//send-data/fluentbit/fluent-bit-loki-tutorial/)
\ No newline at end of file
diff --git a/docs/sources/setup/install/docker.md b/docs/sources/setup/install/docker.md
index de2006250a7c..723adfc617a7 100644
--- a/docs/sources/setup/install/docker.md
+++ b/docs/sources/setup/install/docker.md
@@ -113,13 +113,9 @@ Run the following commands in your command line. They work for Windows or Linux
 You should see something similar to the following:
 
 ```bash
-    ✔ Container mydevice-minio-1     Started    0.0s
-    ✔ Container mydevice-flog-1      Started    0.0s
-    ✔ Container mydevice-write-1     Started    0.0s
-    ✔ Container mydevice-read-1      Started    0.0s
-    ✔ Container mydevice-gateway-1   Started    0.0s
-    ✔ Container mydevice-grafana-1   Started    0.0s
-    ✔ Container mydevice-promtail-1  Started    0.0s
+    ✔ Container loki-loki-1      Started    0.0s
+    ✔ Container loki-grafana-1   Started    0.0s
+    ✔ Container loki-promtail-1  Started    0.0s
 ```
 
 1. Verify that Loki is up and running.
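
   A minimal check, assuming Loki's default HTTP port 3100 is published on localhost by the compose setup, is to call the readiness endpoint:

   ```bash
   # Loki responds with "ready" once it has finished starting up.
   curl http://localhost:3100/ready
   ```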
diff --git a/docs/sources/shared/configuration.md b/docs/sources/shared/configuration.md index 132de42b8107..28145ea6059e 100644 --- a/docs/sources/shared/configuration.md +++ b/docs/sources/shared/configuration.md @@ -793,6 +793,16 @@ kafka_config: # CLI flag: -kafka.write-timeout [write_timeout: | default = 10s] + # The SASL username for authentication to Kafka using the PLAIN mechanism. + # Both username and password must be set. + # CLI flag: -kafka.sasl-username + [sasl_username: | default = ""] + + # The SASL password for authentication to Kafka using the PLAIN mechanism. + # Both username and password must be set. + # CLI flag: -kafka.sasl-password + [sasl_password: | default = ""] + # The consumer group used by the consumer to track the last consumed offset. # The consumer group must be different for each ingester. If the configured # consumer group contains the '' placeholder, it is replaced with @@ -3754,9 +3764,14 @@ shard_streams: # CLI flag: -bloom-build.enable [bloom_creation_enabled: | default = false] -# Experimental. Number of splits to create for the series keyspace when building -# blooms. The series keyspace is split into this many parts to parallelize bloom -# creation. +# Experimental. Bloom planning strategy to use in bloom creation. Can be one of: +# 'split_keyspace_by_factor' +# CLI flag: -bloom-build.planning-strategy +[bloom_planning_strategy: | default = "split_keyspace_by_factor"] + +# Experimental. Only if `bloom-build.planning-strategy` is 'split'. Number of +# splits to create for the series keyspace when building blooms. The series +# keyspace is split into this many parts to parallelize bloom creation. # CLI flag: -bloom-build.split-keyspace-by [bloom_split_series_keyspace_by: | default = 256] @@ -4049,12 +4064,14 @@ When a memberlist config with atleast 1 join_members is defined, kvstore of type Configures additional object stores for a given storage provider. Supported stores: aws, azure, bos, filesystem, gcs, swift. Example: -storage_config: - named_stores: - aws: - store-1: - endpoint: s3://foo-bucket - region: us-west1 +```yaml + storage_config: + named_stores: + aws: + store-1: + endpoint: s3://foo-bucket + region: us-west1 +``` Named store from this example can be used by setting object_store to store-1 in period_config. ```yaml @@ -5540,12 +5557,14 @@ hedging: # Configures additional object stores for a given storage provider. # Supported stores: aws, azure, bos, filesystem, gcs, swift. # Example: -# storage_config: -# named_stores: -# aws: -# store-1: -# endpoint: s3://foo-bucket -# region: us-west1 +# ```yaml +# storage_config: +# named_stores: +# aws: +# store-1: +# endpoint: s3://foo-bucket +# region: us-west1 +# ``` # Named store from this example can be used by setting object_store to store-1 # in period_config. 
[named_stores: ] diff --git a/flake.nix b/flake.nix index 32ff2f5b25d9..c9e52eb589af 100644 --- a/flake.nix +++ b/flake.nix @@ -75,12 +75,16 @@ devShell = pkgs.mkShell { nativeBuildInputs = with pkgs; [ - (import ./packages/chart-releaser.nix { - inherit (prev) pkgs lib buildGoModule fetchFromGitHub; + (pkgs.callPackage ./nix/packages/chart-releaser.nix { + inherit pkgs; + inherit (pkgs) buildGoModule fetchFromGitHub; + }) + + (pkgs.callPackage ./nix/packages/faillint.nix { + inherit (pkgs) lib buildGoModule fetchFromGitHub; }) chart-testing - faillint gcc go golangci-lint @@ -89,7 +93,6 @@ nettools nixpkgs-fmt statix - systemd yamllint ]; }; diff --git a/operator/internal/manifests/internal/config/build_test.go b/operator/internal/manifests/internal/config/build_test.go index 58d46fc42b69..2979696a1292 100644 --- a/operator/internal/manifests/internal/config/build_test.go +++ b/operator/internal/manifests/internal/config/build_test.go @@ -97,6 +97,7 @@ limits_config: max_line_size: 256000 discover_service_name: [] max_entries_limit_per_query: 5000 + discover_service_name: [] max_global_streams_per_user: 0 max_chunks_per_query: 2000000 max_query_length: 721h @@ -355,6 +356,7 @@ limits_config: max_line_size: 256000 discover_service_name: [] max_entries_limit_per_query: 5000 + discover_service_name: [] max_global_streams_per_user: 0 max_chunks_per_query: 2000000 max_query_length: 721h @@ -782,6 +784,7 @@ limits_config: max_line_size: 256000 discover_service_name: [] max_entries_limit_per_query: 5000 + discover_service_name: [] max_global_streams_per_user: 0 max_chunks_per_query: 2000000 max_query_length: 721h @@ -1141,6 +1144,7 @@ limits_config: max_line_size: 256000 discover_service_name: [] max_entries_limit_per_query: 5000 + discover_service_name: [] max_global_streams_per_user: 0 max_chunks_per_query: 2000000 max_query_length: 721h @@ -1501,6 +1505,7 @@ limits_config: max_line_size: 256000 discover_service_name: [] max_entries_limit_per_query: 5000 + discover_service_name: [] max_global_streams_per_user: 0 max_chunks_per_query: 2000000 max_query_length: 721h @@ -1895,6 +1900,7 @@ limits_config: max_line_size: 256000 discover_service_name: [] max_entries_limit_per_query: 5000 + discover_service_name: [] max_global_streams_per_user: 0 max_chunks_per_query: 2000000 max_query_length: 721h @@ -2231,6 +2237,7 @@ limits_config: max_line_size: 256000 discover_service_name: [] max_entries_limit_per_query: 5000 + discover_service_name: [] max_global_streams_per_user: 0 max_chunks_per_query: 2000000 max_query_length: 721h @@ -2671,6 +2678,7 @@ limits_config: max_line_size: 256000 discover_service_name: [] max_entries_limit_per_query: 5000 + discover_service_name: [] max_global_streams_per_user: 0 max_chunks_per_query: 2000000 max_query_length: 721h @@ -2996,6 +3004,7 @@ limits_config: max_line_size: 256000 discover_service_name: [] max_entries_limit_per_query: 5000 + discover_service_name: [] max_global_streams_per_user: 0 max_chunks_per_query: 2000000 max_query_length: 721h @@ -3494,6 +3503,7 @@ limits_config: max_line_size: 256000 discover_service_name: [] max_entries_limit_per_query: 5000 + discover_service_name: [] max_global_streams_per_user: 0 max_chunks_per_query: 2000000 max_query_length: 721h @@ -3756,6 +3766,7 @@ limits_config: max_line_size: 256000 discover_service_name: [] max_entries_limit_per_query: 5000 + discover_service_name: [] max_global_streams_per_user: 0 max_chunks_per_query: 2000000 max_query_length: 721h @@ -4019,6 +4030,7 @@ limits_config: max_line_size: 256000 
discover_service_name: [] max_entries_limit_per_query: 5000 + discover_service_name: [] max_global_streams_per_user: 0 max_chunks_per_query: 2000000 max_query_length: 721h @@ -4283,6 +4295,7 @@ limits_config: max_line_size: 256000 discover_service_name: [] max_entries_limit_per_query: 5000 + discover_service_name: [] max_global_streams_per_user: 0 max_chunks_per_query: 2000000 max_query_length: 721h @@ -4583,6 +4596,7 @@ limits_config: max_line_size: 256000 discover_service_name: [] max_entries_limit_per_query: 5000 + discover_service_name: [] max_global_streams_per_user: 0 max_chunks_per_query: 2000000 max_query_length: 721h @@ -4881,6 +4895,7 @@ limits_config: max_line_size: 256000 discover_service_name: [] max_entries_limit_per_query: 5000 + discover_service_name: [] max_global_streams_per_user: 0 max_chunks_per_query: 2000000 max_query_length: 721h @@ -5380,6 +5395,7 @@ limits_config: max_line_size: 256000 discover_service_name: [] max_entries_limit_per_query: 5000 + discover_service_name: [] max_global_streams_per_user: 0 max_chunks_per_query: 2000000 max_query_length: 721h @@ -5557,6 +5573,7 @@ limits_config: max_line_size: 256000 discover_service_name: [] max_entries_limit_per_query: 5000 + discover_service_name: [] max_global_streams_per_user: 0 max_chunks_per_query: 2000000 max_query_length: 721h @@ -5727,6 +5744,7 @@ limits_config: max_line_size: 256000 discover_service_name: [] max_entries_limit_per_query: 5000 + discover_service_name: [] max_global_streams_per_user: 0 max_chunks_per_query: 2000000 max_query_length: 721h @@ -6120,6 +6138,7 @@ limits_config: max_line_size: 256000 discover_service_name: [] max_entries_limit_per_query: 5000 + discover_service_name: [] max_global_streams_per_user: 0 max_chunks_per_query: 2000000 max_query_length: 721h diff --git a/operator/internal/manifests/internal/config/loki-config.yaml b/operator/internal/manifests/internal/config/loki-config.yaml index 634da2dd98e2..a12e35caf0f7 100644 --- a/operator/internal/manifests/internal/config/loki-config.yaml +++ b/operator/internal/manifests/internal/config/loki-config.yaml @@ -201,6 +201,7 @@ limits_config: max_line_size: {{ .Stack.Limits.Global.IngestionLimits.MaxLineSize }} discover_service_name: [] max_entries_limit_per_query: {{ .Stack.Limits.Global.QueryLimits.MaxEntriesLimitPerQuery }} + discover_service_name: [] max_global_streams_per_user: {{ .Stack.Limits.Global.IngestionLimits.MaxGlobalStreamsPerTenant }} max_chunks_per_query: {{ .Stack.Limits.Global.QueryLimits.MaxChunksPerQuery }} max_query_length: 721h diff --git a/pkg/bloombuild/planner/config.go b/pkg/bloombuild/planner/config.go index 40ec5707ef71..cfbccd84322d 100644 --- a/pkg/bloombuild/planner/config.go +++ b/pkg/bloombuild/planner/config.go @@ -4,6 +4,8 @@ import ( "flag" "fmt" "time" + + "github.com/grafana/loki/v3/pkg/bloombuild/planner/strategies" ) // Config configures the bloom-planner component. 
@@ -44,8 +46,8 @@ func (cfg *Config) Validate() error { type Limits interface { RetentionLimits + strategies.Limits BloomCreationEnabled(tenantID string) bool - BloomSplitSeriesKeyspaceBy(tenantID string) int BloomBuildMaxBuilders(tenantID string) int BuilderResponseTimeout(tenantID string) time.Duration BloomTaskMaxRetries(tenantID string) int diff --git a/pkg/bloombuild/planner/planner.go b/pkg/bloombuild/planner/planner.go index f66748f1832b..53b3b0df01de 100644 --- a/pkg/bloombuild/planner/planner.go +++ b/pkg/bloombuild/planner/planner.go @@ -17,6 +17,7 @@ import ( "go.uber.org/atomic" "github.com/grafana/loki/v3/pkg/bloombuild/common" + "github.com/grafana/loki/v3/pkg/bloombuild/planner/strategies" "github.com/grafana/loki/v3/pkg/bloombuild/protos" iter "github.com/grafana/loki/v3/pkg/iter/v2" "github.com/grafana/loki/v3/pkg/queue" @@ -254,7 +255,7 @@ func (p *Planner) runOne(ctx context.Context) error { tables := p.tables(time.Now()) level.Debug(p.logger).Log("msg", "loaded tables", "tables", tables.TotalDays()) - work, err := p.loadTenantWork(ctx, tables) + tenantTables, err := p.loadTenantTables(ctx, tables) if err != nil { return fmt.Errorf("error loading work: %w", err) } @@ -265,19 +266,21 @@ func (p *Planner) runOne(ctx context.Context) error { tasksResultForTenantTable := make(map[tenantTable]tenantTableTaskResults) var totalTasks int - for table, tenants := range work { - for tenant, ownershipRanges := range tenants { + for table, tenants := range tenantTables { + for _, tenant := range tenants { logger := log.With(p.logger, "tenant", tenant, "table", table.Addr()) + tt := tenantTable{ tenant: tenant, table: table, } - tasks, existingMetas, err := p.computeTasks(ctx, table, tenant, ownershipRanges) + tasks, existingMetas, err := p.computeTasks(ctx, table, tenant) if err != nil { - level.Error(logger).Log("msg", "error computing tasks", "err", err) + level.Error(logger).Log("msg", "failed to compute tasks", "err", err) continue } + level.Debug(logger).Log("msg", "computed tasks", "tasks", len(tasks), "existingMetas", len(existingMetas)) var tenantTableEnqueuedTasks int @@ -367,17 +370,20 @@ func (p *Planner) runOne(ctx context.Context) error { return nil } -// computeTasks computes the tasks for a given table and tenant and ownership range. -// It returns the tasks to be executed and the metas that are existing relevant for the ownership range. +// computeTasks computes the tasks for a given table and tenant. +// It returns the tasks to be executed and the existing metas. 
func (p *Planner) computeTasks( ctx context.Context, table config.DayTable, tenant string, - ownershipRanges []v1.FingerprintBounds, ) ([]*protos.Task, []bloomshipper.Meta, error) { - var tasks []*protos.Task logger := log.With(p.logger, "table", table.Addr(), "tenant", tenant) + strategy, err := strategies.NewStrategy(tenant, p.limits, p.logger) + if err != nil { + return nil, nil, fmt.Errorf("error creating strategy: %w", err) + } + // Fetch source metas to be used in both build and cleanup of out-of-date metas+blooms metas, err := p.bloomStore.FetchMetas( ctx, @@ -421,22 +427,9 @@ func (p *Planner) computeTasks( } }() - for _, ownershipRange := range ownershipRanges { - logger := log.With(logger, "ownership", ownershipRange.String()) - - // Filter only the metas that overlap in the ownership range - metasInBounds := bloomshipper.FilterMetasOverlappingBounds(metas, ownershipRange) - - // Find gaps in the TSDBs for this tenant/table - gaps, err := p.findOutdatedGaps(ctx, tenant, openTSDBs, ownershipRange, metasInBounds, logger) - if err != nil { - level.Error(logger).Log("msg", "failed to find outdated gaps", "err", err) - continue - } - - for _, gap := range gaps { - tasks = append(tasks, protos.NewTask(table, tenant, ownershipRange, gap.tsdb, gap.gaps)) - } + tasks, err := strategy.Plan(ctx, table, tenant, openTSDBs, metas) + if err != nil { + return nil, nil, fmt.Errorf("failed to plan tasks: %w", err) } return tasks, metas, nil @@ -649,15 +642,12 @@ func (p *Planner) tables(ts time.Time) *dayRangeIterator { return newDayRangeIterator(fromDay, throughDay, p.schemaCfg) } -type work map[config.DayTable]map[string][]v1.FingerprintBounds - -// loadTenantWork loads the work for each tenant and table tuple. -// work is the list of fingerprint ranges that need to be indexed in bloom filters. -func (p *Planner) loadTenantWork( +// loadTenantTables loads all tenants with bloom build enabled for each table. +func (p *Planner) loadTenantTables( ctx context.Context, tables *dayRangeIterator, -) (work, error) { - tenantTableWork := make(map[config.DayTable]map[string][]v1.FingerprintBounds, tables.TotalDays()) +) (map[config.DayTable][]string, error) { + tenantTables := make(map[config.DayTable][]string, tables.TotalDays()) for tables.Next() && tables.Err() == nil && ctx.Err() == nil { table := tables.At() @@ -670,8 +660,8 @@ func (p *Planner) loadTenantWork( level.Debug(p.logger).Log("msg", "loaded tenants", "table", table, "tenants", tenants.Remaining()) // If this is the first this we see this table, initialize the map - if tenantTableWork[table] == nil { - tenantTableWork[table] = make(map[string][]v1.FingerprintBounds, tenants.Remaining()) + if tenantTables[table] == nil { + tenantTables[table] = make([]string, tenants.Remaining()) } for tenants.Next() && tenants.Err() == nil && ctx.Err() == nil { @@ -683,11 +673,6 @@ func (p *Planner) loadTenantWork( continue } - splitFactor := p.limits.BloomSplitSeriesKeyspaceBy(tenant) - bounds := SplitFingerprintKeyspaceByFactor(splitFactor) - - tenantTableWork[table][tenant] = bounds - // Reset progress tracking metrics for this tenant // NOTE(salvacorts): We will reset them multiple times for the same tenant, for each table, but it's not a big deal. // Alternatively, we can use a Counter instead of a Gauge, but I think a Gauge is easier to reason about. 
@@ -695,7 +680,7 @@ func (p *Planner) loadTenantWork( p.metrics.tenantTasksCompleted.WithLabelValues(tenant, statusSuccess).Set(0) p.metrics.tenantTasksCompleted.WithLabelValues(tenant, statusFailure).Set(0) - level.Debug(p.logger).Log("msg", "loading work for tenant", "table", table, "tenant", tenant, "splitFactor", splitFactor) + tenantTables[table] = append(tenantTables[table], tenant) } if err := tenants.Err(); err != nil { level.Error(p.logger).Log("msg", "error iterating tenants", "err", err) @@ -708,7 +693,7 @@ func (p *Planner) loadTenantWork( return nil, fmt.Errorf("error iterating tables: %w", err) } - return tenantTableWork, ctx.Err() + return tenantTables, ctx.Err() } func (p *Planner) tenants(ctx context.Context, table config.DayTable) (*iter.SliceIter[string], error) { @@ -720,178 +705,6 @@ func (p *Planner) tenants(ctx context.Context, table config.DayTable) (*iter.Sli return iter.NewSliceIter(tenants), nil } -// blockPlan is a plan for all the work needed to build a meta.json -// It includes: -// - the tsdb (source of truth) which contains all the series+chunks -// we need to ensure are indexed in bloom blocks -// - a list of gaps that are out of date and need to be checked+built -// - within each gap, a list of block refs which overlap the gap are included -// so we can use them to accelerate bloom generation. They likely contain many -// of the same chunks we need to ensure are indexed, just from previous tsdb iterations. -// This is a performance optimization to avoid expensive re-reindexing -type blockPlan struct { - tsdb tsdb.SingleTenantTSDBIdentifier - gaps []protos.Gap -} - -func (p *Planner) findOutdatedGaps( - ctx context.Context, - tenant string, - tsdbs map[tsdb.SingleTenantTSDBIdentifier]common.ClosableForSeries, - ownershipRange v1.FingerprintBounds, - metas []bloomshipper.Meta, - logger log.Logger, -) ([]blockPlan, error) { - // Determine which TSDBs have gaps in the ownership range and need to - // be processed. - tsdbsWithGaps, err := gapsBetweenTSDBsAndMetas(ownershipRange, tsdbs, metas) - if err != nil { - level.Error(logger).Log("msg", "failed to find gaps", "err", err) - return nil, fmt.Errorf("failed to find gaps: %w", err) - } - - if len(tsdbsWithGaps) == 0 { - level.Debug(logger).Log("msg", "blooms exist for all tsdbs") - return nil, nil - } - - work, err := blockPlansForGaps(ctx, tenant, tsdbsWithGaps, metas) - if err != nil { - level.Error(logger).Log("msg", "failed to create plan", "err", err) - return nil, fmt.Errorf("failed to create plan: %w", err) - } - - return work, nil -} - -// Used to signal the gaps that need to be populated for a tsdb -type tsdbGaps struct { - tsdbIdentifier tsdb.SingleTenantTSDBIdentifier - tsdb common.ClosableForSeries - gaps []v1.FingerprintBounds -} - -// gapsBetweenTSDBsAndMetas returns if the metas are up-to-date with the TSDBs. This is determined by asserting -// that for each TSDB, there are metas covering the entire ownership range which were generated from that specific TSDB. 
-func gapsBetweenTSDBsAndMetas( - ownershipRange v1.FingerprintBounds, - tsdbs map[tsdb.SingleTenantTSDBIdentifier]common.ClosableForSeries, - metas []bloomshipper.Meta, -) (res []tsdbGaps, err error) { - for db, tsdb := range tsdbs { - id := db.Name() - - relevantMetas := make([]v1.FingerprintBounds, 0, len(metas)) - for _, meta := range metas { - for _, s := range meta.Sources { - if s.Name() == id { - relevantMetas = append(relevantMetas, meta.Bounds) - } - } - } - - gaps, err := FindGapsInFingerprintBounds(ownershipRange, relevantMetas) - if err != nil { - return nil, err - } - - if len(gaps) > 0 { - res = append(res, tsdbGaps{ - tsdbIdentifier: db, - tsdb: tsdb, - gaps: gaps, - }) - } - } - - return res, err -} - -// blockPlansForGaps groups tsdb gaps we wish to fill with overlapping but out of date blocks. -// This allows us to expedite bloom generation by using existing blocks to fill in the gaps -// since many will contain the same chunks. -func blockPlansForGaps( - ctx context.Context, - tenant string, - tsdbs []tsdbGaps, - metas []bloomshipper.Meta, -) ([]blockPlan, error) { - plans := make([]blockPlan, 0, len(tsdbs)) - - for _, idx := range tsdbs { - plan := blockPlan{ - tsdb: idx.tsdbIdentifier, - gaps: make([]protos.Gap, 0, len(idx.gaps)), - } - - for _, gap := range idx.gaps { - planGap := protos.Gap{ - Bounds: gap, - } - - seriesItr, err := common.NewTSDBSeriesIter(ctx, tenant, idx.tsdb, gap) - if err != nil { - return nil, fmt.Errorf("failed to load series from TSDB for gap (%s): %w", gap.String(), err) - } - planGap.Series, err = iter.Collect(seriesItr) - if err != nil { - return nil, fmt.Errorf("failed to collect series: %w", err) - } - - for _, meta := range metas { - if meta.Bounds.Intersection(gap) == nil { - // this meta doesn't overlap the gap, skip - continue - } - - for _, block := range meta.Blocks { - if block.Bounds.Intersection(gap) == nil { - // this block doesn't overlap the gap, skip - continue - } - // this block overlaps the gap, add it to the plan - // for this gap - planGap.Blocks = append(planGap.Blocks, block) - } - } - - // ensure we sort blocks so deduping iterator works as expected - sort.Slice(planGap.Blocks, func(i, j int) bool { - return planGap.Blocks[i].Bounds.Less(planGap.Blocks[j].Bounds) - }) - - peekingBlocks := iter.NewPeekIter[bloomshipper.BlockRef]( - iter.NewSliceIter[bloomshipper.BlockRef]( - planGap.Blocks, - ), - ) - // dedupe blocks which could be in multiple metas - itr := iter.NewDedupingIter[bloomshipper.BlockRef, bloomshipper.BlockRef]( - func(a, b bloomshipper.BlockRef) bool { - return a == b - }, - iter.Identity[bloomshipper.BlockRef], - func(a, _ bloomshipper.BlockRef) bloomshipper.BlockRef { - return a - }, - peekingBlocks, - ) - - deduped, err := iter.Collect[bloomshipper.BlockRef](itr) - if err != nil { - return nil, fmt.Errorf("failed to dedupe blocks: %w", err) - } - planGap.Blocks = deduped - - plan.gaps = append(plan.gaps, planGap) - } - - plans = append(plans, plan) - } - - return plans, nil -} - func (p *Planner) addPendingTask(task *QueueTask) { p.pendingTasks.Store(task.ID, task) } diff --git a/pkg/bloombuild/planner/planner_test.go b/pkg/bloombuild/planner/planner_test.go index 9523a4579557..68ed8b9b2190 100644 --- a/pkg/bloombuild/planner/planner_test.go +++ b/pkg/bloombuild/planner/planner_test.go @@ -16,12 +16,10 @@ import ( "github.com/pkg/errors" "github.com/prometheus/client_golang/prometheus" "github.com/prometheus/common/model" - "github.com/prometheus/prometheus/model/labels" 
"github.com/stretchr/testify/require" "go.uber.org/atomic" "google.golang.org/grpc" - "github.com/grafana/loki/v3/pkg/bloombuild/common" "github.com/grafana/loki/v3/pkg/bloombuild/protos" "github.com/grafana/loki/v3/pkg/compression" iter "github.com/grafana/loki/v3/pkg/iter/v2" @@ -33,7 +31,6 @@ import ( "github.com/grafana/loki/v3/pkg/storage/stores/shipper/bloomshipper" bloomshipperconfig "github.com/grafana/loki/v3/pkg/storage/stores/shipper/bloomshipper/config" "github.com/grafana/loki/v3/pkg/storage/stores/shipper/indexshipper/tsdb" - "github.com/grafana/loki/v3/pkg/storage/stores/shipper/indexshipper/tsdb/index" "github.com/grafana/loki/v3/pkg/storage/types" "github.com/grafana/loki/v3/pkg/util/mempool" ) @@ -64,110 +61,6 @@ func genMeta(min, max model.Fingerprint, sources []int, blocks []bloomshipper.Bl return m } -func Test_gapsBetweenTSDBsAndMetas(t *testing.T) { - - for _, tc := range []struct { - desc string - err bool - exp []tsdbGaps - ownershipRange v1.FingerprintBounds - tsdbs map[tsdb.SingleTenantTSDBIdentifier]common.ClosableForSeries - metas []bloomshipper.Meta - }{ - { - desc: "non-overlapping tsdbs and metas", - err: true, - ownershipRange: v1.NewBounds(0, 10), - tsdbs: map[tsdb.SingleTenantTSDBIdentifier]common.ClosableForSeries{ - tsdbID(0): nil, - }, - metas: []bloomshipper.Meta{ - genMeta(11, 20, []int{0}, nil), - }, - }, - { - desc: "single tsdb", - ownershipRange: v1.NewBounds(0, 10), - tsdbs: map[tsdb.SingleTenantTSDBIdentifier]common.ClosableForSeries{ - tsdbID(0): nil, - }, - metas: []bloomshipper.Meta{ - genMeta(4, 8, []int{0}, nil), - }, - exp: []tsdbGaps{ - { - tsdbIdentifier: tsdbID(0), - gaps: []v1.FingerprintBounds{ - v1.NewBounds(0, 3), - v1.NewBounds(9, 10), - }, - }, - }, - }, - { - desc: "multiple tsdbs with separate blocks", - ownershipRange: v1.NewBounds(0, 10), - tsdbs: map[tsdb.SingleTenantTSDBIdentifier]common.ClosableForSeries{ - tsdbID(0): nil, - tsdbID(1): nil, - }, - metas: []bloomshipper.Meta{ - genMeta(0, 5, []int{0}, nil), - genMeta(6, 10, []int{1}, nil), - }, - exp: []tsdbGaps{ - { - tsdbIdentifier: tsdbID(0), - gaps: []v1.FingerprintBounds{ - v1.NewBounds(6, 10), - }, - }, - { - tsdbIdentifier: tsdbID(1), - gaps: []v1.FingerprintBounds{ - v1.NewBounds(0, 5), - }, - }, - }, - }, - { - desc: "multiple tsdbs with the same blocks", - ownershipRange: v1.NewBounds(0, 10), - tsdbs: map[tsdb.SingleTenantTSDBIdentifier]common.ClosableForSeries{ - tsdbID(0): nil, - tsdbID(1): nil, - }, - metas: []bloomshipper.Meta{ - genMeta(0, 5, []int{0, 1}, nil), - genMeta(6, 8, []int{1}, nil), - }, - exp: []tsdbGaps{ - { - tsdbIdentifier: tsdbID(0), - gaps: []v1.FingerprintBounds{ - v1.NewBounds(6, 10), - }, - }, - { - tsdbIdentifier: tsdbID(1), - gaps: []v1.FingerprintBounds{ - v1.NewBounds(9, 10), - }, - }, - }, - }, - } { - t.Run(tc.desc, func(t *testing.T) { - gaps, err := gapsBetweenTSDBsAndMetas(tc.ownershipRange, tc.tsdbs, tc.metas) - if tc.err { - require.Error(t, err) - return - } - require.ElementsMatch(t, tc.exp, gaps) - }) - } -} - func genBlockRef(min, max model.Fingerprint) bloomshipper.BlockRef { startTS, endTS := testDay.Bounds() return bloomshipper.BlockRef{ @@ -214,209 +107,6 @@ func genBlock(ref bloomshipper.BlockRef) (bloomshipper.Block, error) { }, nil } -func Test_blockPlansForGaps(t *testing.T) { - for _, tc := range []struct { - desc string - ownershipRange v1.FingerprintBounds - tsdbs []tsdb.SingleTenantTSDBIdentifier - metas []bloomshipper.Meta - err bool - exp []blockPlan - }{ - { - desc: "single overlapping meta+no overlapping 
block", - ownershipRange: v1.NewBounds(0, 10), - tsdbs: []tsdb.SingleTenantTSDBIdentifier{tsdbID(0)}, - metas: []bloomshipper.Meta{ - genMeta(5, 20, []int{1}, []bloomshipper.BlockRef{genBlockRef(11, 20)}), - }, - exp: []blockPlan{ - { - tsdb: tsdbID(0), - gaps: []protos.Gap{ - { - Bounds: v1.NewBounds(0, 10), - Series: genSeries(v1.NewBounds(0, 10)), - }, - }, - }, - }, - }, - { - desc: "single overlapping meta+one overlapping block", - ownershipRange: v1.NewBounds(0, 10), - tsdbs: []tsdb.SingleTenantTSDBIdentifier{tsdbID(0)}, - metas: []bloomshipper.Meta{ - genMeta(5, 20, []int{1}, []bloomshipper.BlockRef{genBlockRef(9, 20)}), - }, - exp: []blockPlan{ - { - tsdb: tsdbID(0), - gaps: []protos.Gap{ - { - Bounds: v1.NewBounds(0, 10), - Series: genSeries(v1.NewBounds(0, 10)), - Blocks: []bloomshipper.BlockRef{genBlockRef(9, 20)}, - }, - }, - }, - }, - }, - { - // the range which needs to be generated doesn't overlap with existing blocks - // from other tsdb versions since theres an up to date tsdb version block, - // but we can trim the range needing generation - desc: "trims up to date area", - ownershipRange: v1.NewBounds(0, 10), - tsdbs: []tsdb.SingleTenantTSDBIdentifier{tsdbID(0)}, - metas: []bloomshipper.Meta{ - genMeta(9, 20, []int{0}, []bloomshipper.BlockRef{genBlockRef(9, 20)}), // block for same tsdb - genMeta(9, 20, []int{1}, []bloomshipper.BlockRef{genBlockRef(9, 20)}), // block for different tsdb - }, - exp: []blockPlan{ - { - tsdb: tsdbID(0), - gaps: []protos.Gap{ - { - Bounds: v1.NewBounds(0, 8), - Series: genSeries(v1.NewBounds(0, 8)), - }, - }, - }, - }, - }, - { - desc: "uses old block for overlapping range", - ownershipRange: v1.NewBounds(0, 10), - tsdbs: []tsdb.SingleTenantTSDBIdentifier{tsdbID(0)}, - metas: []bloomshipper.Meta{ - genMeta(9, 20, []int{0}, []bloomshipper.BlockRef{genBlockRef(9, 20)}), // block for same tsdb - genMeta(5, 20, []int{1}, []bloomshipper.BlockRef{genBlockRef(5, 20)}), // block for different tsdb - }, - exp: []blockPlan{ - { - tsdb: tsdbID(0), - gaps: []protos.Gap{ - { - Bounds: v1.NewBounds(0, 8), - Series: genSeries(v1.NewBounds(0, 8)), - Blocks: []bloomshipper.BlockRef{genBlockRef(5, 20)}, - }, - }, - }, - }, - }, - { - desc: "multi case", - ownershipRange: v1.NewBounds(0, 10), - tsdbs: []tsdb.SingleTenantTSDBIdentifier{tsdbID(0), tsdbID(1)}, // generate for both tsdbs - metas: []bloomshipper.Meta{ - genMeta(0, 2, []int{0}, []bloomshipper.BlockRef{ - genBlockRef(0, 1), - genBlockRef(1, 2), - }), // tsdb_0 - genMeta(6, 8, []int{0}, []bloomshipper.BlockRef{genBlockRef(6, 8)}), // tsdb_0 - - genMeta(3, 5, []int{1}, []bloomshipper.BlockRef{genBlockRef(3, 5)}), // tsdb_1 - genMeta(8, 10, []int{1}, []bloomshipper.BlockRef{genBlockRef(8, 10)}), // tsdb_1 - }, - exp: []blockPlan{ - { - tsdb: tsdbID(0), - gaps: []protos.Gap{ - // tsdb (id=0) can source chunks from the blocks built from tsdb (id=1) - { - Bounds: v1.NewBounds(3, 5), - Series: genSeries(v1.NewBounds(3, 5)), - Blocks: []bloomshipper.BlockRef{genBlockRef(3, 5)}, - }, - { - Bounds: v1.NewBounds(9, 10), - Series: genSeries(v1.NewBounds(9, 10)), - Blocks: []bloomshipper.BlockRef{genBlockRef(8, 10)}, - }, - }, - }, - // tsdb (id=1) can source chunks from the blocks built from tsdb (id=0) - { - tsdb: tsdbID(1), - gaps: []protos.Gap{ - { - Bounds: v1.NewBounds(0, 2), - Series: genSeries(v1.NewBounds(0, 2)), - Blocks: []bloomshipper.BlockRef{ - genBlockRef(0, 1), - genBlockRef(1, 2), - }, - }, - { - Bounds: v1.NewBounds(6, 7), - Series: genSeries(v1.NewBounds(6, 7)), - Blocks: 
[]bloomshipper.BlockRef{genBlockRef(6, 8)}, - }, - }, - }, - }, - }, - { - desc: "dedupes block refs", - ownershipRange: v1.NewBounds(0, 10), - tsdbs: []tsdb.SingleTenantTSDBIdentifier{tsdbID(0)}, - metas: []bloomshipper.Meta{ - genMeta(9, 20, []int{1}, []bloomshipper.BlockRef{ - genBlockRef(1, 4), - genBlockRef(9, 20), - }), // blocks for first diff tsdb - genMeta(5, 20, []int{2}, []bloomshipper.BlockRef{ - genBlockRef(5, 10), - genBlockRef(9, 20), // same block references in prior meta (will be deduped) - }), // block for second diff tsdb - }, - exp: []blockPlan{ - { - tsdb: tsdbID(0), - gaps: []protos.Gap{ - { - Bounds: v1.NewBounds(0, 10), - Series: genSeries(v1.NewBounds(0, 10)), - Blocks: []bloomshipper.BlockRef{ - genBlockRef(1, 4), - genBlockRef(5, 10), - genBlockRef(9, 20), - }, - }, - }, - }, - }, - }, - } { - t.Run(tc.desc, func(t *testing.T) { - // We add series spanning the whole FP ownership range - tsdbs := make(map[tsdb.SingleTenantTSDBIdentifier]common.ClosableForSeries) - for _, id := range tc.tsdbs { - tsdbs[id] = newFakeForSeries(genSeries(tc.ownershipRange)) - } - - // we reuse the gapsBetweenTSDBsAndMetas function to generate the gaps as this function is tested - // separately and it's used to generate input in our regular code path (easier to write tests this way). - gaps, err := gapsBetweenTSDBsAndMetas(tc.ownershipRange, tsdbs, tc.metas) - require.NoError(t, err) - - plans, err := blockPlansForGaps( - context.Background(), - "fakeTenant", - gaps, - tc.metas, - ) - if tc.err { - require.Error(t, err) - return - } - require.ElementsMatch(t, tc.exp, plans) - }) - } -} - func genSeries(bounds v1.FingerprintBounds) []*v1.Series { series := make([]*v1.Series, 0, int(bounds.Max-bounds.Min+1)) for i := bounds.Min; i <= bounds.Max; i++ { @@ -434,45 +124,6 @@ func genSeries(bounds v1.FingerprintBounds) []*v1.Series { return series } -type fakeForSeries struct { - series []*v1.Series -} - -func newFakeForSeries(series []*v1.Series) *fakeForSeries { - return &fakeForSeries{ - series: series, - } -} - -func (f fakeForSeries) ForSeries(_ context.Context, _ string, ff index.FingerprintFilter, _ model.Time, _ model.Time, fn func(labels.Labels, model.Fingerprint, []index.ChunkMeta) (stop bool), _ ...*labels.Matcher) error { - overlapping := make([]*v1.Series, 0, len(f.series)) - for _, s := range f.series { - if ff.Match(s.Fingerprint) { - overlapping = append(overlapping, s) - } - } - - for _, s := range overlapping { - chunks := make([]index.ChunkMeta, 0, len(s.Chunks)) - for _, c := range s.Chunks { - chunks = append(chunks, index.ChunkMeta{ - MinTime: int64(c.From), - MaxTime: int64(c.Through), - Checksum: c.Checksum, - }) - } - - if fn(labels.EmptyLabels(), s.Fingerprint, chunks) { - break - } - } - return nil -} - -func (f fakeForSeries) Close() error { - return nil -} - func createTasks(n int, resultsCh chan *protos.TaskResult) []*QueueTask { tasks := make([]*QueueTask, 0, n) // Enqueue tasks diff --git a/pkg/bloombuild/planner/strategies/factory.go b/pkg/bloombuild/planner/strategies/factory.go new file mode 100644 index 000000000000..578f74f855d3 --- /dev/null +++ b/pkg/bloombuild/planner/strategies/factory.go @@ -0,0 +1,45 @@ +package strategies + +import ( + "context" + "fmt" + + "github.com/go-kit/log" + + "github.com/grafana/loki/v3/pkg/bloombuild/common" + "github.com/grafana/loki/v3/pkg/bloombuild/protos" + "github.com/grafana/loki/v3/pkg/storage/config" + "github.com/grafana/loki/v3/pkg/storage/stores/shipper/bloomshipper" + 
"github.com/grafana/loki/v3/pkg/storage/stores/shipper/indexshipper/tsdb" +) + +const ( + SplitKeyspaceStrategyName = "split_keyspace_by_factor" +) + +type Limits interface { + BloomPlanningStrategy(tenantID string) string + BloomSplitSeriesKeyspaceBy(tenantID string) int +} + +type TSDBSet = map[tsdb.SingleTenantTSDBIdentifier]common.ClosableForSeries + +type PlanningStrategy interface { + // Plan returns a set of tasks for a given tenant-table tuple and TSDBs. + Plan(ctx context.Context, table config.DayTable, tenant string, tsdbs TSDBSet, metas []bloomshipper.Meta) ([]*protos.Task, error) +} + +func NewStrategy( + tenantID string, + limits Limits, + logger log.Logger, +) (PlanningStrategy, error) { + strategy := limits.BloomPlanningStrategy(tenantID) + + switch strategy { + case SplitKeyspaceStrategyName: + return NewSplitKeyspaceStrategy(limits, logger) + default: + return nil, fmt.Errorf("unknown bloom planning strategy (%s)", strategy) + } +} diff --git a/pkg/bloombuild/planner/strategies/splitkeyspace.go b/pkg/bloombuild/planner/strategies/splitkeyspace.go new file mode 100644 index 000000000000..ea4db8f2e29f --- /dev/null +++ b/pkg/bloombuild/planner/strategies/splitkeyspace.go @@ -0,0 +1,240 @@ +package strategies + +import ( + "context" + "fmt" + "sort" + + "github.com/go-kit/log" + "github.com/go-kit/log/level" + + "github.com/grafana/loki/v3/pkg/bloombuild/common" + "github.com/grafana/loki/v3/pkg/bloombuild/protos" + iter "github.com/grafana/loki/v3/pkg/iter/v2" + v1 "github.com/grafana/loki/v3/pkg/storage/bloom/v1" + "github.com/grafana/loki/v3/pkg/storage/config" + "github.com/grafana/loki/v3/pkg/storage/stores/shipper/bloomshipper" + "github.com/grafana/loki/v3/pkg/storage/stores/shipper/indexshipper/tsdb" +) + +type SplitKeyspaceStrategy struct { + limits Limits + logger log.Logger +} + +func NewSplitKeyspaceStrategy( + limits Limits, + logger log.Logger, +) (*SplitKeyspaceStrategy, error) { + return &SplitKeyspaceStrategy{ + limits: limits, + logger: logger, + }, nil +} + +func (s *SplitKeyspaceStrategy) Plan( + ctx context.Context, + table config.DayTable, + tenant string, + tsdbs TSDBSet, + metas []bloomshipper.Meta, +) ([]*protos.Task, error) { + splitFactor := s.limits.BloomSplitSeriesKeyspaceBy(tenant) + ownershipRanges := SplitFingerprintKeyspaceByFactor(splitFactor) + + logger := log.With(s.logger, "table", table.Addr(), "tenant", tenant) + level.Debug(s.logger).Log("msg", "loading work for tenant", "splitFactor", splitFactor) + + var tasks []*protos.Task + for _, ownershipRange := range ownershipRanges { + logger := log.With(logger, "ownership", ownershipRange.String()) + + // Filter only the metas that overlap in the ownership range + metasInBounds := bloomshipper.FilterMetasOverlappingBounds(metas, ownershipRange) + + // Find gaps in the TSDBs for this tenant/table + gaps, err := s.findOutdatedGaps(ctx, tenant, tsdbs, ownershipRange, metasInBounds, logger) + if err != nil { + level.Error(logger).Log("msg", "failed to find outdated gaps", "err", err) + continue + } + + for _, gap := range gaps { + tasks = append(tasks, protos.NewTask(table, tenant, ownershipRange, gap.tsdb, gap.gaps)) + } + } + + return tasks, nil +} + +// blockPlan is a plan for all the work needed to build a meta.json +// It includes: +// - the tsdb (source of truth) which contains all the series+chunks +// we need to ensure are indexed in bloom blocks +// - a list of gaps that are out of date and need to be checked+built +// - within each gap, a list of block refs which overlap the gap are 
included +// so we can use them to accelerate bloom generation. They likely contain many +// of the same chunks we need to ensure are indexed, just from previous tsdb iterations. +// This is a performance optimization to avoid expensive re-reindexing +type blockPlan struct { + tsdb tsdb.SingleTenantTSDBIdentifier + gaps []protos.Gap +} + +func (s *SplitKeyspaceStrategy) findOutdatedGaps( + ctx context.Context, + tenant string, + tsdbs map[tsdb.SingleTenantTSDBIdentifier]common.ClosableForSeries, + ownershipRange v1.FingerprintBounds, + metas []bloomshipper.Meta, + logger log.Logger, +) ([]blockPlan, error) { + // Determine which TSDBs have gaps in the ownership range and need to + // be processed. + tsdbsWithGaps, err := gapsBetweenTSDBsAndMetas(ownershipRange, tsdbs, metas) + if err != nil { + level.Error(logger).Log("msg", "failed to find gaps", "err", err) + return nil, fmt.Errorf("failed to find gaps: %w", err) + } + + if len(tsdbsWithGaps) == 0 { + level.Debug(logger).Log("msg", "blooms exist for all tsdbs") + return nil, nil + } + + work, err := blockPlansForGaps(ctx, tenant, tsdbsWithGaps, metas) + if err != nil { + level.Error(logger).Log("msg", "failed to create plan", "err", err) + return nil, fmt.Errorf("failed to create plan: %w", err) + } + + return work, nil +} + +// Used to signal the gaps that need to be populated for a tsdb +type tsdbGaps struct { + tsdbIdentifier tsdb.SingleTenantTSDBIdentifier + tsdb common.ClosableForSeries + gaps []v1.FingerprintBounds +} + +// gapsBetweenTSDBsAndMetas returns if the metas are up-to-date with the TSDBs. This is determined by asserting +// that for each TSDB, there are metas covering the entire ownership range which were generated from that specific TSDB. +func gapsBetweenTSDBsAndMetas( + ownershipRange v1.FingerprintBounds, + tsdbs map[tsdb.SingleTenantTSDBIdentifier]common.ClosableForSeries, + metas []bloomshipper.Meta, +) (res []tsdbGaps, err error) { + for db, tsdb := range tsdbs { + id := db.Name() + + relevantMetas := make([]v1.FingerprintBounds, 0, len(metas)) + for _, meta := range metas { + for _, s := range meta.Sources { + if s.Name() == id { + relevantMetas = append(relevantMetas, meta.Bounds) + } + } + } + + gaps, err := FindGapsInFingerprintBounds(ownershipRange, relevantMetas) + if err != nil { + return nil, err + } + + if len(gaps) > 0 { + res = append(res, tsdbGaps{ + tsdbIdentifier: db, + tsdb: tsdb, + gaps: gaps, + }) + } + } + + return res, err +} + +// blockPlansForGaps groups tsdb gaps we wish to fill with overlapping but out of date blocks. +// This allows us to expedite bloom generation by using existing blocks to fill in the gaps +// since many will contain the same chunks. 
+func blockPlansForGaps( + ctx context.Context, + tenant string, + tsdbs []tsdbGaps, + metas []bloomshipper.Meta, +) ([]blockPlan, error) { + plans := make([]blockPlan, 0, len(tsdbs)) + + for _, idx := range tsdbs { + plan := blockPlan{ + tsdb: idx.tsdbIdentifier, + gaps: make([]protos.Gap, 0, len(idx.gaps)), + } + + for _, gap := range idx.gaps { + planGap := protos.Gap{ + Bounds: gap, + } + + seriesItr, err := common.NewTSDBSeriesIter(ctx, tenant, idx.tsdb, gap) + if err != nil { + return nil, fmt.Errorf("failed to load series from TSDB for gap (%s): %w", gap.String(), err) + } + planGap.Series, err = iter.Collect(seriesItr) + if err != nil { + return nil, fmt.Errorf("failed to collect series: %w", err) + } + + for _, meta := range metas { + if meta.Bounds.Intersection(gap) == nil { + // this meta doesn't overlap the gap, skip + continue + } + + for _, block := range meta.Blocks { + if block.Bounds.Intersection(gap) == nil { + // this block doesn't overlap the gap, skip + continue + } + // this block overlaps the gap, add it to the plan + // for this gap + planGap.Blocks = append(planGap.Blocks, block) + } + } + + // ensure we sort blocks so deduping iterator works as expected + sort.Slice(planGap.Blocks, func(i, j int) bool { + return planGap.Blocks[i].Bounds.Less(planGap.Blocks[j].Bounds) + }) + + peekingBlocks := iter.NewPeekIter[bloomshipper.BlockRef]( + iter.NewSliceIter[bloomshipper.BlockRef]( + planGap.Blocks, + ), + ) + // dedupe blocks which could be in multiple metas + itr := iter.NewDedupingIter[bloomshipper.BlockRef, bloomshipper.BlockRef]( + func(a, b bloomshipper.BlockRef) bool { + return a == b + }, + iter.Identity[bloomshipper.BlockRef], + func(a, _ bloomshipper.BlockRef) bloomshipper.BlockRef { + return a + }, + peekingBlocks, + ) + + deduped, err := iter.Collect[bloomshipper.BlockRef](itr) + if err != nil { + return nil, fmt.Errorf("failed to dedupe blocks: %w", err) + } + planGap.Blocks = deduped + + plan.gaps = append(plan.gaps, planGap) + } + + plans = append(plans, plan) + } + + return plans, nil +} diff --git a/pkg/bloombuild/planner/strategies/splitkeyspace_test.go b/pkg/bloombuild/planner/strategies/splitkeyspace_test.go new file mode 100644 index 000000000000..e5f6781d06c0 --- /dev/null +++ b/pkg/bloombuild/planner/strategies/splitkeyspace_test.go @@ -0,0 +1,432 @@ +package strategies + +import ( + "context" + "testing" + "time" + + "github.com/prometheus/common/model" + "github.com/prometheus/prometheus/model/labels" + "github.com/stretchr/testify/require" + + "github.com/grafana/loki/v3/pkg/bloombuild/common" + "github.com/grafana/loki/v3/pkg/bloombuild/protos" + v1 "github.com/grafana/loki/v3/pkg/storage/bloom/v1" + "github.com/grafana/loki/v3/pkg/storage/config" + "github.com/grafana/loki/v3/pkg/storage/stores/shipper/bloomshipper" + "github.com/grafana/loki/v3/pkg/storage/stores/shipper/indexshipper/tsdb" + "github.com/grafana/loki/v3/pkg/storage/stores/shipper/indexshipper/tsdb/index" +) + +var testDay = parseDayTime("2023-09-01") +var testTable = config.NewDayTable(testDay, "index_") + +func Test_gapsBetweenTSDBsAndMetas(t *testing.T) { + + for _, tc := range []struct { + desc string + err bool + exp []tsdbGaps + ownershipRange v1.FingerprintBounds + tsdbs map[tsdb.SingleTenantTSDBIdentifier]common.ClosableForSeries + metas []bloomshipper.Meta + }{ + { + desc: "non-overlapping tsdbs and metas", + err: true, + ownershipRange: v1.NewBounds(0, 10), + tsdbs: map[tsdb.SingleTenantTSDBIdentifier]common.ClosableForSeries{ + tsdbID(0): nil, + }, + metas: 
[]bloomshipper.Meta{ + genMeta(11, 20, []int{0}, nil), + }, + }, + { + desc: "single tsdb", + ownershipRange: v1.NewBounds(0, 10), + tsdbs: map[tsdb.SingleTenantTSDBIdentifier]common.ClosableForSeries{ + tsdbID(0): nil, + }, + metas: []bloomshipper.Meta{ + genMeta(4, 8, []int{0}, nil), + }, + exp: []tsdbGaps{ + { + tsdbIdentifier: tsdbID(0), + gaps: []v1.FingerprintBounds{ + v1.NewBounds(0, 3), + v1.NewBounds(9, 10), + }, + }, + }, + }, + { + desc: "multiple tsdbs with separate blocks", + ownershipRange: v1.NewBounds(0, 10), + tsdbs: map[tsdb.SingleTenantTSDBIdentifier]common.ClosableForSeries{ + tsdbID(0): nil, + tsdbID(1): nil, + }, + metas: []bloomshipper.Meta{ + genMeta(0, 5, []int{0}, nil), + genMeta(6, 10, []int{1}, nil), + }, + exp: []tsdbGaps{ + { + tsdbIdentifier: tsdbID(0), + gaps: []v1.FingerprintBounds{ + v1.NewBounds(6, 10), + }, + }, + { + tsdbIdentifier: tsdbID(1), + gaps: []v1.FingerprintBounds{ + v1.NewBounds(0, 5), + }, + }, + }, + }, + { + desc: "multiple tsdbs with the same blocks", + ownershipRange: v1.NewBounds(0, 10), + tsdbs: map[tsdb.SingleTenantTSDBIdentifier]common.ClosableForSeries{ + tsdbID(0): nil, + tsdbID(1): nil, + }, + metas: []bloomshipper.Meta{ + genMeta(0, 5, []int{0, 1}, nil), + genMeta(6, 8, []int{1}, nil), + }, + exp: []tsdbGaps{ + { + tsdbIdentifier: tsdbID(0), + gaps: []v1.FingerprintBounds{ + v1.NewBounds(6, 10), + }, + }, + { + tsdbIdentifier: tsdbID(1), + gaps: []v1.FingerprintBounds{ + v1.NewBounds(9, 10), + }, + }, + }, + }, + } { + t.Run(tc.desc, func(t *testing.T) { + gaps, err := gapsBetweenTSDBsAndMetas(tc.ownershipRange, tc.tsdbs, tc.metas) + if tc.err { + require.Error(t, err) + return + } + require.ElementsMatch(t, tc.exp, gaps) + }) + } +} + +func Test_blockPlansForGaps(t *testing.T) { + for _, tc := range []struct { + desc string + ownershipRange v1.FingerprintBounds + tsdbs []tsdb.SingleTenantTSDBIdentifier + metas []bloomshipper.Meta + err bool + exp []blockPlan + }{ + { + desc: "single overlapping meta+no overlapping block", + ownershipRange: v1.NewBounds(0, 10), + tsdbs: []tsdb.SingleTenantTSDBIdentifier{tsdbID(0)}, + metas: []bloomshipper.Meta{ + genMeta(5, 20, []int{1}, []bloomshipper.BlockRef{genBlockRef(11, 20)}), + }, + exp: []blockPlan{ + { + tsdb: tsdbID(0), + gaps: []protos.Gap{ + { + Bounds: v1.NewBounds(0, 10), + Series: genSeries(v1.NewBounds(0, 10)), + }, + }, + }, + }, + }, + { + desc: "single overlapping meta+one overlapping block", + ownershipRange: v1.NewBounds(0, 10), + tsdbs: []tsdb.SingleTenantTSDBIdentifier{tsdbID(0)}, + metas: []bloomshipper.Meta{ + genMeta(5, 20, []int{1}, []bloomshipper.BlockRef{genBlockRef(9, 20)}), + }, + exp: []blockPlan{ + { + tsdb: tsdbID(0), + gaps: []protos.Gap{ + { + Bounds: v1.NewBounds(0, 10), + Series: genSeries(v1.NewBounds(0, 10)), + Blocks: []bloomshipper.BlockRef{genBlockRef(9, 20)}, + }, + }, + }, + }, + }, + { + // the range which needs to be generated doesn't overlap with existing blocks + // from other tsdb versions since theres an up to date tsdb version block, + // but we can trim the range needing generation + desc: "trims up to date area", + ownershipRange: v1.NewBounds(0, 10), + tsdbs: []tsdb.SingleTenantTSDBIdentifier{tsdbID(0)}, + metas: []bloomshipper.Meta{ + genMeta(9, 20, []int{0}, []bloomshipper.BlockRef{genBlockRef(9, 20)}), // block for same tsdb + genMeta(9, 20, []int{1}, []bloomshipper.BlockRef{genBlockRef(9, 20)}), // block for different tsdb + }, + exp: []blockPlan{ + { + tsdb: tsdbID(0), + gaps: []protos.Gap{ + { + Bounds: v1.NewBounds(0, 8), + Series: 
genSeries(v1.NewBounds(0, 8)), + }, + }, + }, + }, + }, + { + desc: "uses old block for overlapping range", + ownershipRange: v1.NewBounds(0, 10), + tsdbs: []tsdb.SingleTenantTSDBIdentifier{tsdbID(0)}, + metas: []bloomshipper.Meta{ + genMeta(9, 20, []int{0}, []bloomshipper.BlockRef{genBlockRef(9, 20)}), // block for same tsdb + genMeta(5, 20, []int{1}, []bloomshipper.BlockRef{genBlockRef(5, 20)}), // block for different tsdb + }, + exp: []blockPlan{ + { + tsdb: tsdbID(0), + gaps: []protos.Gap{ + { + Bounds: v1.NewBounds(0, 8), + Series: genSeries(v1.NewBounds(0, 8)), + Blocks: []bloomshipper.BlockRef{genBlockRef(5, 20)}, + }, + }, + }, + }, + }, + { + desc: "multi case", + ownershipRange: v1.NewBounds(0, 10), + tsdbs: []tsdb.SingleTenantTSDBIdentifier{tsdbID(0), tsdbID(1)}, // generate for both tsdbs + metas: []bloomshipper.Meta{ + genMeta(0, 2, []int{0}, []bloomshipper.BlockRef{ + genBlockRef(0, 1), + genBlockRef(1, 2), + }), // tsdb_0 + genMeta(6, 8, []int{0}, []bloomshipper.BlockRef{genBlockRef(6, 8)}), // tsdb_0 + + genMeta(3, 5, []int{1}, []bloomshipper.BlockRef{genBlockRef(3, 5)}), // tsdb_1 + genMeta(8, 10, []int{1}, []bloomshipper.BlockRef{genBlockRef(8, 10)}), // tsdb_1 + }, + exp: []blockPlan{ + { + tsdb: tsdbID(0), + gaps: []protos.Gap{ + // tsdb (id=0) can source chunks from the blocks built from tsdb (id=1) + { + Bounds: v1.NewBounds(3, 5), + Series: genSeries(v1.NewBounds(3, 5)), + Blocks: []bloomshipper.BlockRef{genBlockRef(3, 5)}, + }, + { + Bounds: v1.NewBounds(9, 10), + Series: genSeries(v1.NewBounds(9, 10)), + Blocks: []bloomshipper.BlockRef{genBlockRef(8, 10)}, + }, + }, + }, + // tsdb (id=1) can source chunks from the blocks built from tsdb (id=0) + { + tsdb: tsdbID(1), + gaps: []protos.Gap{ + { + Bounds: v1.NewBounds(0, 2), + Series: genSeries(v1.NewBounds(0, 2)), + Blocks: []bloomshipper.BlockRef{ + genBlockRef(0, 1), + genBlockRef(1, 2), + }, + }, + { + Bounds: v1.NewBounds(6, 7), + Series: genSeries(v1.NewBounds(6, 7)), + Blocks: []bloomshipper.BlockRef{genBlockRef(6, 8)}, + }, + }, + }, + }, + }, + { + desc: "dedupes block refs", + ownershipRange: v1.NewBounds(0, 10), + tsdbs: []tsdb.SingleTenantTSDBIdentifier{tsdbID(0)}, + metas: []bloomshipper.Meta{ + genMeta(9, 20, []int{1}, []bloomshipper.BlockRef{ + genBlockRef(1, 4), + genBlockRef(9, 20), + }), // blocks for first diff tsdb + genMeta(5, 20, []int{2}, []bloomshipper.BlockRef{ + genBlockRef(5, 10), + genBlockRef(9, 20), // same block references in prior meta (will be deduped) + }), // block for second diff tsdb + }, + exp: []blockPlan{ + { + tsdb: tsdbID(0), + gaps: []protos.Gap{ + { + Bounds: v1.NewBounds(0, 10), + Series: genSeries(v1.NewBounds(0, 10)), + Blocks: []bloomshipper.BlockRef{ + genBlockRef(1, 4), + genBlockRef(5, 10), + genBlockRef(9, 20), + }, + }, + }, + }, + }, + }, + } { + t.Run(tc.desc, func(t *testing.T) { + // We add series spanning the whole FP ownership range + tsdbs := make(map[tsdb.SingleTenantTSDBIdentifier]common.ClosableForSeries) + for _, id := range tc.tsdbs { + tsdbs[id] = newFakeForSeries(genSeries(tc.ownershipRange)) + } + + // we reuse the gapsBetweenTSDBsAndMetas function to generate the gaps as this function is tested + // separately and it's used to generate input in our regular code path (easier to write tests this way). 
+ gaps, err := gapsBetweenTSDBsAndMetas(tc.ownershipRange, tsdbs, tc.metas) + require.NoError(t, err) + + plans, err := blockPlansForGaps( + context.Background(), + "fakeTenant", + gaps, + tc.metas, + ) + if tc.err { + require.Error(t, err) + return + } + require.ElementsMatch(t, tc.exp, plans) + }) + } +} + +func genSeries(bounds v1.FingerprintBounds) []*v1.Series { + series := make([]*v1.Series, 0, int(bounds.Max-bounds.Min+1)) + for i := bounds.Min; i <= bounds.Max; i++ { + series = append(series, &v1.Series{ + Fingerprint: i, + Chunks: v1.ChunkRefs{ + { + From: 0, + Through: 1, + Checksum: 1, + }, + }, + }) + } + return series +} + +func genMeta(min, max model.Fingerprint, sources []int, blocks []bloomshipper.BlockRef) bloomshipper.Meta { + m := bloomshipper.Meta{ + MetaRef: bloomshipper.MetaRef{ + Ref: bloomshipper.Ref{ + TenantID: "fakeTenant", + TableName: testTable.Addr(), + Bounds: v1.NewBounds(min, max), + }, + }, + Blocks: blocks, + } + for _, source := range sources { + m.Sources = append(m.Sources, tsdbID(source)) + } + return m +} + +func genBlockRef(min, max model.Fingerprint) bloomshipper.BlockRef { + startTS, endTS := testDay.Bounds() + return bloomshipper.BlockRef{ + Ref: bloomshipper.Ref{ + TenantID: "fakeTenant", + TableName: testTable.Addr(), + Bounds: v1.NewBounds(min, max), + StartTimestamp: startTS, + EndTimestamp: endTS, + Checksum: 0, + }, + } +} + +func tsdbID(n int) tsdb.SingleTenantTSDBIdentifier { + return tsdb.SingleTenantTSDBIdentifier{ + TS: time.Unix(int64(n), 0), + } +} + +func parseDayTime(s string) config.DayTime { + t, err := time.Parse("2006-01-02", s) + if err != nil { + panic(err) + } + return config.DayTime{ + Time: model.TimeFromUnix(t.Unix()), + } +} + +type fakeForSeries struct { + series []*v1.Series +} + +func newFakeForSeries(series []*v1.Series) *fakeForSeries { + return &fakeForSeries{ + series: series, + } +} + +func (f fakeForSeries) ForSeries(_ context.Context, _ string, ff index.FingerprintFilter, _ model.Time, _ model.Time, fn func(labels.Labels, model.Fingerprint, []index.ChunkMeta) (stop bool), _ ...*labels.Matcher) error { + overlapping := make([]*v1.Series, 0, len(f.series)) + for _, s := range f.series { + if ff.Match(s.Fingerprint) { + overlapping = append(overlapping, s) + } + } + + for _, s := range overlapping { + chunks := make([]index.ChunkMeta, 0, len(s.Chunks)) + for _, c := range s.Chunks { + chunks = append(chunks, index.ChunkMeta{ + MinTime: int64(c.From), + MaxTime: int64(c.Through), + Checksum: c.Checksum, + }) + } + + if fn(labels.EmptyLabels(), s.Fingerprint, chunks) { + break + } + } + return nil +} + +func (f fakeForSeries) Close() error { + return nil +} diff --git a/pkg/bloombuild/planner/strategies/util.go b/pkg/bloombuild/planner/strategies/util.go new file mode 100644 index 000000000000..31ce42be154a --- /dev/null +++ b/pkg/bloombuild/planner/strategies/util.go @@ -0,0 +1,125 @@ +package strategies + +import ( + "fmt" + "math" + + "github.com/prometheus/common/model" + + v1 "github.com/grafana/loki/v3/pkg/storage/bloom/v1" +) + +// SplitFingerprintKeyspaceByFactor splits the keyspace covered by model.Fingerprint into contiguous non-overlapping ranges. +func SplitFingerprintKeyspaceByFactor(factor int) []v1.FingerprintBounds { + if factor <= 0 { + return nil + } + + bounds := make([]v1.FingerprintBounds, 0, factor) + + // The keyspace of a Fingerprint is from 0 to max uint64. + keyspaceSize := uint64(math.MaxUint64) + + // Calculate the size of each range. 
+ rangeSize := keyspaceSize / uint64(factor) + + for i := 0; i < factor; i++ { + // Calculate the start and end of the range. + start := uint64(i) * rangeSize + end := start + rangeSize - 1 + + // For the last range, make sure it ends at the end of the keyspace. + if i == factor-1 { + end = keyspaceSize + } + + // Create a FingerprintBounds for the range and add it to the slice. + bounds = append(bounds, v1.FingerprintBounds{ + Min: model.Fingerprint(start), + Max: model.Fingerprint(end), + }) + } + + return bounds +} + +func FindGapsInFingerprintBounds(ownershipRange v1.FingerprintBounds, metas []v1.FingerprintBounds) (gaps []v1.FingerprintBounds, err error) { + if len(metas) == 0 { + return []v1.FingerprintBounds{ownershipRange}, nil + } + + // turn the available metas into a list of non-overlapping metas + // for easier processing + var nonOverlapping []v1.FingerprintBounds + // First, we reduce the metas into a smaller set by combining overlaps. They must be sorted. + var cur *v1.FingerprintBounds + for i := 0; i < len(metas); i++ { + j := i + 1 + + // first iteration (i == 0), set the current meta + if cur == nil { + cur = &metas[i] + } + + if j >= len(metas) { + // We've reached the end of the list. Add the last meta to the non-overlapping set. + nonOverlapping = append(nonOverlapping, *cur) + break + } + + combined := cur.Union(metas[j]) + if len(combined) == 1 { + // There was an overlap between the two tested ranges. Combine them and keep going. + cur = &combined[0] + continue + } + + // There was no overlap between the two tested ranges. Add the first to the non-overlapping set. + // and keep the second for the next iteration. + nonOverlapping = append(nonOverlapping, combined[0]) + cur = &combined[1] + } + + // Now, detect gaps between the non-overlapping metas and the ownership range. + // The left bound of the ownership range will be adjusted as we go. + leftBound := ownershipRange.Min + for _, meta := range nonOverlapping { + + clippedMeta := meta.Intersection(ownershipRange) + // should never happen as long as we are only combining metas + // that intersect with the ownership range + if clippedMeta == nil { + return nil, fmt.Errorf("meta is not within ownership range: %v", meta) + } + + searchRange := ownershipRange.Slice(leftBound, clippedMeta.Max) + // update the left bound for the next iteration + // We do the max to prevent the max bound to overflow from MaxUInt64 to 0 + leftBound = min( + max(clippedMeta.Max+1, clippedMeta.Max), + max(ownershipRange.Max+1, ownershipRange.Max), + ) + + // since we've already ensured that the meta is within the ownership range, + // we know the xor will be of length zero (when the meta is equal to the ownership range) + // or 1 (when the meta is a subset of the ownership range) + xors := searchRange.Unless(*clippedMeta) + if len(xors) == 0 { + // meta is equal to the ownership range. This means the meta + // covers this entire section of the ownership range. + continue + } + + gaps = append(gaps, xors[0]) + } + + // If the leftBound is less than the ownership range max, and it's smaller than MaxUInt64, + // There is a gap between the last meta and the end of the ownership range. 
+ // Note: we check `leftBound < math.MaxUint64` since in the loop above we clamp the + // leftBound to MaxUint64 to prevent an overflow to 0: `max(clippedMeta.Max+1, clippedMeta.Max)` + if leftBound < math.MaxUint64 && leftBound <= ownershipRange.Max { + gaps = append(gaps, v1.NewBounds(leftBound, ownershipRange.Max)) + } + + return gaps, nil +} diff --git a/pkg/distributor/distributor.go b/pkg/distributor/distributor.go index 30383bfcbbbd..892384bef60b 100644 --- a/pkg/distributor/distributor.go +++ b/pkg/distributor/distributor.go @@ -47,6 +47,7 @@ import ( "github.com/grafana/loki/v3/pkg/ingester" "github.com/grafana/loki/v3/pkg/ingester/client" "github.com/grafana/loki/v3/pkg/kafka" + kafka_client "github.com/grafana/loki/v3/pkg/kafka/client" "github.com/grafana/loki/v3/pkg/loghttp/push" "github.com/grafana/loki/v3/pkg/logproto" "github.com/grafana/loki/v3/pkg/logql/log/logfmt" @@ -234,11 +235,11 @@ func New( var kafkaWriter KafkaProducer if cfg.KafkaEnabled { - kafkaClient, err := kafka.NewWriterClient(cfg.KafkaConfig, 20, logger, registerer) + kafkaClient, err := kafka_client.NewWriterClient(cfg.KafkaConfig, 20, logger, registerer) if err != nil { return nil, fmt.Errorf("failed to start kafka client: %w", err) } - kafkaWriter = kafka.NewProducer(kafkaClient, cfg.KafkaConfig.ProducerMaxBufferedBytes, + kafkaWriter = kafka_client.NewProducer(kafkaClient, cfg.KafkaConfig.ProducerMaxBufferedBytes, prometheus.WrapRegistererWithPrefix("_kafka_", registerer)) } @@ -501,7 +502,7 @@ func (d *Distributor) Push(ctx context.Context, req *logproto.PushRequest) (*log } else { logLevel = detectLogLevelFromLogEntry(entry, structuredMetadata) } - if logLevel != constants.LogLevelUnknown && logLevel != "" { + if logLevel != "" { entry.StructuredMetadata = append(entry.StructuredMetadata, logproto.LabelAdapter{ Name: constants.LevelLabel, Value: logLevel, diff --git a/pkg/distributor/distributor_test.go b/pkg/distributor/distributor_test.go index 785d6ce03d0c..ea06eecd4515 100644 --- a/pkg/distributor/distributor_test.go +++ b/pkg/distributor/distributor_test.go @@ -1640,7 +1640,7 @@ func Test_DetectLogLevels(t *testing.T) { require.NoError(t, err) topVal := ingester.Peek() require.Equal(t, `{foo="bar"}`, topVal.Streams[0].Labels) - require.Len(t, topVal.Streams[0].Entries[0].StructuredMetadata, 0) + require.Len(t, topVal.Streams[0].Entries[0].StructuredMetadata, 1) }) t.Run("log level detection enabled and warn logs", func(t *testing.T) { diff --git a/pkg/ingester/ingester.go b/pkg/ingester/ingester.go index adf0cd7b332f..17089efbbf63 100644 --- a/pkg/ingester/ingester.go +++ b/pkg/ingester/ingester.go @@ -611,6 +611,10 @@ func (i *Ingester) starting(ctx context.Context) (err error) { i.setPrepareShutdown() } + // start our flush loop: this needs to start before the partition-reader in order for chunks to be shipped in the case of Kafka catching up. + i.loopDone.Add(1) + go i.loop() + // When kafka ingestion is enabled, we have to make sure that reader catches up replaying the partition // BEFORE the ingester ring lifecycler is started, because once the ingester ring lifecycler will start // it will switch the ingester state in the ring to ACTIVE. 
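// --- Illustrative aside (not part of this patch) ---------------------------
// The new pkg/bloombuild/planner/strategies/util.go above exposes two helpers:
// SplitFingerprintKeyspaceByFactor carves the fingerprint keyspace into
// contiguous, non-overlapping ranges, and FindGapsInFingerprintBounds reports
// which parts of an ownership range are not yet covered by existing metas.
// The sketch below shows how they might be combined; the factor and the meta
// bounds are made-up values, chosen only to demonstrate the call pattern.
package main

import (
	"fmt"

	"github.com/grafana/loki/v3/pkg/bloombuild/planner/strategies"
	v1 "github.com/grafana/loki/v3/pkg/storage/bloom/v1"
)

func main() {
	// Split the keyspace into 4 ownership ranges and look at the first one.
	bounds := strategies.SplitFingerprintKeyspaceByFactor(4)
	ownership := bounds[0]

	// Pretend existing metas only cover the first 1000 fingerprints of it.
	metas := []v1.FingerprintBounds{v1.NewBounds(ownership.Min, ownership.Min+1000)}

	// The uncovered remainder of the ownership range comes back as gaps.
	gaps, err := strategies.FindGapsInFingerprintBounds(ownership, metas)
	if err != nil {
		panic(err)
	}
	fmt.Println(gaps) // one gap: ownership.Min+1001 through ownership.Max
}
// ---------------------------------------------------------------------------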
@@ -646,9 +650,7 @@ func (i *Ingester) starting(ctx context.Context) (err error) { return fmt.Errorf("failed to start partition ring lifecycler: %w", err) } } - // start our loop - i.loopDone.Add(1) - go i.loop() + return nil } diff --git a/pkg/kafka/logger.go b/pkg/kafka/client/logger.go similarity index 98% rename from pkg/kafka/logger.go rename to pkg/kafka/client/logger.go index e055094a4163..3be96839e120 100644 --- a/pkg/kafka/logger.go +++ b/pkg/kafka/client/logger.go @@ -1,6 +1,6 @@ // SPDX-License-Identifier: AGPL-3.0-only -package kafka +package client import ( "github.com/go-kit/log" diff --git a/pkg/kafka/reader_client.go b/pkg/kafka/client/reader_client.go similarity index 51% rename from pkg/kafka/reader_client.go rename to pkg/kafka/client/reader_client.go index 9237686fee60..e8bbb2da8c86 100644 --- a/pkg/kafka/reader_client.go +++ b/pkg/kafka/client/reader_client.go @@ -1,19 +1,25 @@ // SPDX-License-Identifier: AGPL-3.0-only -package kafka +package client import ( + "context" + "fmt" "time" "github.com/go-kit/log" + "github.com/go-kit/log/level" "github.com/pkg/errors" "github.com/prometheus/client_golang/prometheus" + "github.com/twmb/franz-go/pkg/kadm" "github.com/twmb/franz-go/pkg/kgo" "github.com/twmb/franz-go/plugin/kprom" + + "github.com/grafana/loki/v3/pkg/kafka" ) // NewReaderClient returns the kgo.Client that should be used by the Reader. -func NewReaderClient(kafkaCfg Config, metrics *kprom.Metrics, logger log.Logger, opts ...kgo.Opt) (*kgo.Client, error) { +func NewReaderClient(kafkaCfg kafka.Config, metrics *kprom.Metrics, logger log.Logger, opts ...kgo.Opt) (*kgo.Client, error) { const fetchMaxBytes = 100_000_000 opts = append(opts, commonKafkaClientOptions(kafkaCfg, metrics, logger)...) @@ -33,7 +39,7 @@ func NewReaderClient(kafkaCfg Config, metrics *kprom.Metrics, logger log.Logger, return nil, errors.Wrap(err, "creating kafka client") } if kafkaCfg.AutoCreateTopicEnabled { - kafkaCfg.SetDefaultNumberOfPartitionsForAutocreatedTopics(logger) + setDefaultNumberOfPartitionsForAutocreatedTopics(kafkaCfg, client, logger) } return client, nil } @@ -44,3 +50,29 @@ func NewReaderClientMetrics(component string, reg prometheus.Registerer) *kprom. // Do not export the client ID, because we use it to specify options to the backend. kprom.FetchAndProduceDetail(kprom.Batches, kprom.Records, kprom.CompressedBytes, kprom.UncompressedBytes)) } + +// setDefaultNumberOfPartitionsForAutocreatedTopics tries to set num.partitions config option on brokers. +// This is best-effort, if setting the option fails, error is logged, but not returned. 
+func setDefaultNumberOfPartitionsForAutocreatedTopics(cfg kafka.Config, cl *kgo.Client, logger log.Logger) { + if cfg.AutoCreateTopicDefaultPartitions <= 0 { + return + } + + // Note: this client doesn't get closed because it is owned by the caller + adm := kadm.NewClient(cl) + + defaultNumberOfPartitions := fmt.Sprintf("%d", cfg.AutoCreateTopicDefaultPartitions) + _, err := adm.AlterBrokerConfigsState(context.Background(), []kadm.AlterConfig{ + { + Op: kadm.SetConfig, + Name: "num.partitions", + Value: &defaultNumberOfPartitions, + }, + }) + if err != nil { + level.Error(logger).Log("msg", "failed to alter default number of partitions", "err", err) + return + } + + level.Info(logger).Log("msg", "configured Kafka-wide default number of partitions for auto-created topics (num.partitions)", "value", cfg.AutoCreateTopicDefaultPartitions) +} diff --git a/pkg/kafka/client/reader_client_test.go b/pkg/kafka/client/reader_client_test.go new file mode 100644 index 000000000000..90980ad0e912 --- /dev/null +++ b/pkg/kafka/client/reader_client_test.go @@ -0,0 +1,104 @@ +package client + +import ( + "context" + "testing" + + "github.com/go-kit/log" + "github.com/grafana/dskit/flagext" + "github.com/stretchr/testify/require" + "github.com/twmb/franz-go/pkg/kfake" + "github.com/twmb/franz-go/pkg/kgo" + "github.com/twmb/franz-go/pkg/kmsg" + + "github.com/grafana/loki/v3/pkg/kafka" + "github.com/grafana/loki/v3/pkg/kafka/testkafka" +) + +func TestNewReaderClient(t *testing.T) { + _, addr := testkafka.CreateClusterWithoutCustomConsumerGroupsSupport(t, 1, "test", kfake.EnableSASL(), kfake.Superuser("PLAIN", "user", "password")) + + tests := []struct { + name string + config kafka.Config + wantErr bool + }{ + { + name: "valid config", + config: kafka.Config{ + Address: addr, + Topic: "abcd", + SASLUsername: "user", + SASLPassword: flagext.SecretWithValue("password"), + }, + wantErr: false, + }, + { + name: "wrong password", + config: kafka.Config{ + Address: addr, + Topic: "abcd", + SASLUsername: "user", + SASLPassword: flagext.SecretWithValue("wrong wrong wrong"), + }, + wantErr: true, + }, + { + name: "wrong username", + config: kafka.Config{ + Address: addr, + Topic: "abcd", + SASLUsername: "wrong wrong wrong", + SASLPassword: flagext.SecretWithValue("password"), + }, + wantErr: true, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + client, err := NewReaderClient(tt.config, nil, nil) + require.NoError(t, err) + + err = client.Ping(context.Background()) + if tt.wantErr { + require.Error(t, err) + } else { + require.NoError(t, err) + } + }) + } +} + +func TestSetDefaultNumberOfPartitionsForAutocreatedTopics(t *testing.T) { + cluster, err := kfake.NewCluster(kfake.NumBrokers(1)) + require.NoError(t, err) + t.Cleanup(cluster.Close) + + addrs := cluster.ListenAddrs() + require.Len(t, addrs, 1) + + cfg := kafka.Config{ + Address: addrs[0], + AutoCreateTopicDefaultPartitions: 100, + } + + cluster.ControlKey(kmsg.AlterConfigs.Int16(), func(request kmsg.Request) (kmsg.Response, error, bool) { + r := request.(*kmsg.AlterConfigsRequest) + + require.Len(t, r.Resources, 1) + res := r.Resources[0] + require.Equal(t, kmsg.ConfigResourceTypeBroker, res.ResourceType) + require.Len(t, res.Configs, 1) + cfg := res.Configs[0] + require.Equal(t, "num.partitions", cfg.Name) + require.NotNil(t, *cfg.Value) + require.Equal(t, "100", *cfg.Value) + + return &kmsg.AlterConfigsResponse{}, nil, true + }) + + client, err := kgo.NewClient(commonKafkaClientOptions(cfg, nil, log.NewNopLogger())...) 
+ require.NoError(t, err) + + setDefaultNumberOfPartitionsForAutocreatedTopics(cfg, client, log.NewNopLogger()) +} diff --git a/pkg/kafka/writer_client.go b/pkg/kafka/client/writer_client.go similarity index 90% rename from pkg/kafka/writer_client.go rename to pkg/kafka/client/writer_client.go index 59fefda31d19..1493e17f5168 100644 --- a/pkg/kafka/writer_client.go +++ b/pkg/kafka/client/writer_client.go @@ -1,4 +1,4 @@ -package kafka +package client import ( "context" @@ -13,20 +13,30 @@ import ( "github.com/twmb/franz-go/pkg/kerr" "github.com/twmb/franz-go/pkg/kgo" "github.com/twmb/franz-go/pkg/kmsg" + "github.com/twmb/franz-go/pkg/sasl/plain" "github.com/twmb/franz-go/plugin/kotel" "github.com/twmb/franz-go/plugin/kprom" "go.opentelemetry.io/otel/propagation" "go.opentelemetry.io/otel/trace" "go.uber.org/atomic" + "github.com/grafana/loki/v3/pkg/kafka" "github.com/grafana/loki/v3/pkg/util/constants" ) +var ( + // writerRequestTimeoutOverhead is the overhead applied by the Writer to every Kafka timeout. + // You can think about this overhead as an extra time for requests sitting in the client's buffer + // before being sent on the wire and the actual time it takes to send it over the network and + // start being processed by Kafka. + writerRequestTimeoutOverhead = 2 * time.Second +) + // NewWriterClient returns the kgo.Client that should be used by the Writer. // // The input prometheus.Registerer must be wrapped with a prefix (the names of metrics // registered don't have a prefix). -func NewWriterClient(kafkaCfg Config, maxInflightProduceRequests int, logger log.Logger, reg prometheus.Registerer) (*kgo.Client, error) { +func NewWriterClient(kafkaCfg kafka.Config, maxInflightProduceRequests int, logger log.Logger, reg prometheus.Registerer) (*kgo.Client, error) { // Do not export the client ID, because we use it to specify options to the backend. metrics := kprom.NewMetrics( "", // No prefix. We expect the input prometheus.Registered to be wrapped with a prefix. @@ -42,7 +52,7 @@ func NewWriterClient(kafkaCfg Config, maxInflightProduceRequests int, logger log kgo.RecordPartitioner(kgo.ManualPartitioner()), // Set the upper bounds the size of a record batch. - kgo.ProducerBatchMaxBytes(producerBatchMaxBytes), + kgo.ProducerBatchMaxBytes(kafka.ProducerBatchMaxBytes), // By default, the Kafka client allows 1 Produce in-flight request per broker. Disabling write idempotency // (which we don't need), we can increase the max number of in-flight Produce requests per broker. A higher @@ -81,10 +91,14 @@ func NewWriterClient(kafkaCfg Config, maxInflightProduceRequests int, logger log kgo.MaxBufferedRecords(math.MaxInt), // Use a high value to set it as unlimited, because the client doesn't support "0 as unlimited". kgo.MaxBufferedBytes(0), ) + client, err := kgo.NewClient(opts...) + if err != nil { + return nil, err + } if kafkaCfg.AutoCreateTopicEnabled { - kafkaCfg.SetDefaultNumberOfPartitionsForAutocreatedTopics(logger) + setDefaultNumberOfPartitionsForAutocreatedTopics(kafkaCfg, client, logger) } - return kgo.NewClient(opts...) 
+ return client, nil } type onlySampledTraces struct { @@ -99,7 +113,7 @@ func (o onlySampledTraces) Inject(ctx context.Context, carrier propagation.TextM o.TextMapPropagator.Inject(ctx, carrier) } -func commonKafkaClientOptions(cfg Config, metrics *kprom.Metrics, logger log.Logger) []kgo.Opt { +func commonKafkaClientOptions(cfg kafka.Config, metrics *kprom.Metrics, logger log.Logger) []kgo.Opt { opts := []kgo.Opt{ kgo.ClientID(cfg.ClientID), kgo.SeedBrokers(cfg.Address), @@ -139,6 +153,16 @@ func commonKafkaClientOptions(cfg Config, metrics *kprom.Metrics, logger log.Log }), } + // SASL plain auth. + if cfg.SASLUsername != "" && cfg.SASLPassword.String() != "" { + opts = append(opts, kgo.SASL(plain.Plain(func(_ context.Context) (plain.Auth, error) { + return plain.Auth{ + User: cfg.SASLUsername, + Pass: cfg.SASLPassword.String(), + }, nil + }))) + } + if cfg.AutoCreateTopicEnabled { opts = append(opts, kgo.AllowAutoTopicCreation()) } diff --git a/pkg/kafka/client/writer_client_test.go b/pkg/kafka/client/writer_client_test.go new file mode 100644 index 000000000000..4feb782ffe63 --- /dev/null +++ b/pkg/kafka/client/writer_client_test.go @@ -0,0 +1,71 @@ +package client + +import ( + "context" + "testing" + "time" + + "github.com/grafana/dskit/flagext" + "github.com/stretchr/testify/require" + "github.com/twmb/franz-go/pkg/kfake" + + "github.com/grafana/loki/v3/pkg/kafka" + "github.com/grafana/loki/v3/pkg/kafka/testkafka" +) + +func TestNewWriterClient(t *testing.T) { + _, addr := testkafka.CreateClusterWithoutCustomConsumerGroupsSupport(t, 1, "test", kfake.EnableSASL(), kfake.Superuser("PLAIN", "user", "password")) + + tests := []struct { + name string + config kafka.Config + wantErr bool + }{ + { + name: "valid config", + config: kafka.Config{ + Address: addr, + Topic: "abcd", + WriteTimeout: time.Second, + SASLUsername: "user", + SASLPassword: flagext.SecretWithValue("password"), + }, + wantErr: false, + }, + { + name: "wrong password", + config: kafka.Config{ + Address: addr, + Topic: "abcd", + WriteTimeout: time.Second, + SASLUsername: "user", + SASLPassword: flagext.SecretWithValue("wrong wrong wrong"), + }, + wantErr: true, + }, + { + name: "wrong username", + config: kafka.Config{ + Address: addr, + Topic: "abcd", + WriteTimeout: time.Second, + SASLUsername: "wrong wrong wrong", + SASLPassword: flagext.SecretWithValue("password"), + }, + wantErr: true, + }, + } + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + client, err := NewWriterClient(tt.config, 10, nil, nil) + require.NoError(t, err) + + err = client.Ping(context.Background()) + if tt.wantErr { + require.Error(t, err) + } else { + require.NoError(t, err) + } + }) + } +} diff --git a/pkg/kafka/config.go b/pkg/kafka/config.go index 13cfb618cfdb..09008bec9341 100644 --- a/pkg/kafka/config.go +++ b/pkg/kafka/config.go @@ -1,7 +1,6 @@ package kafka import ( - "context" "errors" "flag" "fmt" @@ -9,10 +8,7 @@ import ( "strings" "time" - "github.com/go-kit/log" - "github.com/go-kit/log/level" - "github.com/twmb/franz-go/pkg/kadm" - "github.com/twmb/franz-go/pkg/kgo" + "github.com/grafana/dskit/flagext" ) const ( @@ -21,29 +17,24 @@ const ( consumeFromEnd = "end" consumeFromTimestamp = "timestamp" - // writerRequestTimeoutOverhead is the overhead applied by the Writer to every Kafka timeout. 
- // You can think about this overhead as an extra time for requests sitting in the client's buffer - // before being sent on the wire and the actual time it takes to send it over the network and - // start being processed by Kafka. - writerRequestTimeoutOverhead = 2 * time.Second - - // producerBatchMaxBytes is the max allowed size of a batch of Kafka records. - producerBatchMaxBytes = 16_000_000 + // ProducerBatchMaxBytes is the max allowed size of a batch of Kafka records. + ProducerBatchMaxBytes = 16_000_000 // maxProducerRecordDataBytesLimit is the max allowed size of a single record data. Given we have a limit - // on the max batch size (producerBatchMaxBytes), a Kafka record data can't be bigger than the batch size + // on the max batch size (ProducerBatchMaxBytes), a Kafka record data can't be bigger than the batch size // minus some overhead required to serialise the batch and the record itself. We use 16KB as such overhead // in the worst case scenario, which is expected to be way above the actual one. - maxProducerRecordDataBytesLimit = producerBatchMaxBytes - 16384 + maxProducerRecordDataBytesLimit = ProducerBatchMaxBytes - 16384 minProducerRecordDataBytesLimit = 1024 * 1024 ) var ( - ErrMissingKafkaAddress = errors.New("the Kafka address has not been configured") - ErrMissingKafkaTopic = errors.New("the Kafka topic has not been configured") - ErrInconsistentConsumerLagAtStartup = errors.New("the target and max consumer lag at startup must be either both set to 0 or to a value greater than 0") - ErrInvalidMaxConsumerLagAtStartup = errors.New("the configured max consumer lag at startup must greater or equal than the configured target consumer lag") - ErrInvalidProducerMaxRecordSizeBytes = fmt.Errorf("the configured producer max record size bytes must be a value between %d and %d", minProducerRecordDataBytesLimit, maxProducerRecordDataBytesLimit) + ErrMissingKafkaAddress = errors.New("the Kafka address has not been configured") + ErrMissingKafkaTopic = errors.New("the Kafka topic has not been configured") + ErrInconsistentConsumerLagAtStartup = errors.New("the target and max consumer lag at startup must be either both set to 0 or to a value greater than 0") + ErrInvalidMaxConsumerLagAtStartup = errors.New("the configured max consumer lag at startup must greater or equal than the configured target consumer lag") + ErrInconsistentSASLUsernameAndPassword = errors.New("both sasl username and password must be set") + ErrInvalidProducerMaxRecordSizeBytes = fmt.Errorf("the configured producer max record size bytes must be a value between %d and %d", minProducerRecordDataBytesLimit, maxProducerRecordDataBytesLimit) ) // Config holds the generic config for the Kafka backend. 
@@ -54,6 +45,9 @@ type Config struct { DialTimeout time.Duration `yaml:"dial_timeout"` WriteTimeout time.Duration `yaml:"write_timeout"` + SASLUsername string `yaml:"sasl_username"` + SASLPassword flagext.Secret `yaml:"sasl_password"` + ConsumerGroup string `yaml:"consumer_group"` ConsumerGroupOffsetCommitInterval time.Duration `yaml:"consumer_group_offset_commit_interval"` @@ -80,6 +74,9 @@ func (cfg *Config) RegisterFlagsWithPrefix(prefix string, f *flag.FlagSet) { f.DurationVar(&cfg.DialTimeout, prefix+".dial-timeout", 2*time.Second, "The maximum time allowed to open a connection to a Kafka broker.") f.DurationVar(&cfg.WriteTimeout, prefix+".write-timeout", 10*time.Second, "How long to wait for an incoming write request to be successfully committed to the Kafka backend.") + f.StringVar(&cfg.SASLUsername, prefix+".sasl-username", "", "The SASL username for authentication to Kafka using the PLAIN mechanism. Both username and password must be set.") + f.Var(&cfg.SASLPassword, prefix+".sasl-password", "The SASL password for authentication to Kafka using the PLAIN mechanism. Both username and password must be set.") + f.StringVar(&cfg.ConsumerGroup, prefix+".consumer-group", "", "The consumer group used by the consumer to track the last consumed offset. The consumer group must be different for each ingester. If the configured consumer group contains the '' placeholder, it is replaced with the actual partition ID owned by the ingester. When empty (recommended), Mimir uses the ingester instance ID to guarantee uniqueness.") f.DurationVar(&cfg.ConsumerGroupOffsetCommitInterval, prefix+".consumer-group-offset-commit-interval", time.Second, "How frequently a consumer should commit the consumed offset to Kafka. The last committed offset is used at startup to continue the consumption from where it was left.") @@ -113,6 +110,10 @@ func (cfg *Config) Validate() error { return ErrInvalidMaxConsumerLagAtStartup } + if (cfg.SASLUsername == "") != (cfg.SASLPassword.String() == "") { + return ErrInconsistentSASLUsernameAndPassword + } + return nil } @@ -124,35 +125,3 @@ func (cfg *Config) GetConsumerGroup(instanceID string, partitionID int32) string return strings.ReplaceAll(cfg.ConsumerGroup, "", strconv.Itoa(int(partitionID))) } - -// SetDefaultNumberOfPartitionsForAutocreatedTopics tries to set num.partitions config option on brokers. -// This is best-effort, if setting the option fails, error is logged, but not returned. -func (cfg Config) SetDefaultNumberOfPartitionsForAutocreatedTopics(logger log.Logger) { - if cfg.AutoCreateTopicDefaultPartitions <= 0 { - return - } - - cl, err := kgo.NewClient(commonKafkaClientOptions(cfg, nil, logger)...) 
- if err != nil { - level.Error(logger).Log("msg", "failed to create kafka client", "err", err) - return - } - - adm := kadm.NewClient(cl) - defer adm.Close() - - defaultNumberOfPartitions := fmt.Sprintf("%d", cfg.AutoCreateTopicDefaultPartitions) - _, err = adm.AlterBrokerConfigsState(context.Background(), []kadm.AlterConfig{ - { - Op: kadm.SetConfig, - Name: "num.partitions", - Value: &defaultNumberOfPartitions, - }, - }) - if err != nil { - level.Error(logger).Log("msg", "failed to alter default number of partitions", "err", err) - return - } - - level.Info(logger).Log("msg", "configured Kafka-wide default number of partitions for auto-created topics (num.partitions)", "value", cfg.AutoCreateTopicDefaultPartitions) -} diff --git a/pkg/kafka/config_test.go b/pkg/kafka/config_test.go index 7c21e38fd141..87c456f42adc 100644 --- a/pkg/kafka/config_test.go +++ b/pkg/kafka/config_test.go @@ -3,39 +3,37 @@ package kafka import ( "testing" - "github.com/go-kit/log" + "github.com/grafana/dskit/flagext" "github.com/stretchr/testify/require" - "github.com/twmb/franz-go/pkg/kfake" - "github.com/twmb/franz-go/pkg/kmsg" ) -func TestSetDefaultNumberOfPartitionsForAutocreatedTopics(t *testing.T) { - cluster, err := kfake.NewCluster(kfake.NumBrokers(1)) - require.NoError(t, err) - t.Cleanup(cluster.Close) - - addrs := cluster.ListenAddrs() - require.Len(t, addrs, 1) - +func TestBothSASLParamsMustBeSet(t *testing.T) { cfg := Config{ - Address: addrs[0], - AutoCreateTopicDefaultPartitions: 100, + // Other required params + Address: "abcd", + Topic: "abcd", + ProducerMaxRecordSizeBytes: 1048576, } - cluster.ControlKey(kmsg.AlterConfigs.Int16(), func(request kmsg.Request) (kmsg.Response, error, bool) { - r := request.(*kmsg.AlterConfigsRequest) - - require.Len(t, r.Resources, 1) - res := r.Resources[0] - require.Equal(t, kmsg.ConfigResourceTypeBroker, res.ResourceType) - require.Len(t, res.Configs, 1) - cfg := res.Configs[0] - require.Equal(t, "num.partitions", cfg.Name) - require.NotNil(t, *cfg.Value) - require.Equal(t, "100", *cfg.Value) - - return &kmsg.AlterConfigsResponse{}, nil, true - }) + // No SASL params is valid + err := cfg.Validate() + require.NoError(t, err) - cfg.SetDefaultNumberOfPartitionsForAutocreatedTopics(log.NewNopLogger()) + // Just username is invalid + cfg.SASLUsername = "abcd" + cfg.SASLPassword = flagext.Secret{} + err = cfg.Validate() + require.Error(t, err) + + // Just password is invalid + cfg.SASLUsername = "" + cfg.SASLPassword = flagext.SecretWithValue("abcd") + err = cfg.Validate() + require.Error(t, err) + + // Both username and password is valid + cfg.SASLUsername = "abcd" + cfg.SASLPassword = flagext.SecretWithValue("abcd") + err = cfg.Validate() + require.NoError(t, err) } diff --git a/pkg/kafka/partition/committer_test.go b/pkg/kafka/partition/committer_test.go index 9ef02f910e5d..1739986cd66c 100644 --- a/pkg/kafka/partition/committer_test.go +++ b/pkg/kafka/partition/committer_test.go @@ -14,7 +14,7 @@ import ( "github.com/prometheus/client_golang/prometheus/testutil" - "github.com/grafana/loki/v3/pkg/kafka" + "github.com/grafana/loki/v3/pkg/kafka/client" "github.com/grafana/loki/v3/pkg/kafka/testkafka" ) @@ -24,7 +24,7 @@ func TestPartitionCommitter(t *testing.T) { topicName := "test-topic" _, kafkaCfg := testkafka.CreateCluster(t, numPartitions, topicName) - client, err := kafka.NewReaderClient(kafkaCfg, kprom.NewMetrics("foo"), log.NewNopLogger()) + client, err := client.NewReaderClient(kafkaCfg, kprom.NewMetrics("foo"), log.NewNopLogger()) require.NoError(t, err) 
// Create a Kafka admin client diff --git a/pkg/kafka/partition/reader.go b/pkg/kafka/partition/reader.go index e07a65f8a0f1..e364f3bba748 100644 --- a/pkg/kafka/partition/reader.go +++ b/pkg/kafka/partition/reader.go @@ -22,6 +22,7 @@ import ( "github.com/twmb/franz-go/plugin/kprom" "github.com/grafana/loki/v3/pkg/kafka" + "github.com/grafana/loki/v3/pkg/kafka/client" ) var errWaitTargetLagDeadlineExceeded = errors.New("waiting for target lag deadline exceeded") @@ -94,7 +95,7 @@ func NewReader( // This method is called when the PartitionReader service starts. func (p *Reader) start(ctx context.Context) error { var err error - p.client, err = kafka.NewReaderClient(p.kafkaCfg, p.metrics.kprom, p.logger) + p.client, err = client.NewReaderClient(p.kafkaCfg, p.metrics.kprom, p.logger) if err != nil { return errors.Wrap(err, "creating kafka reader client") } @@ -539,7 +540,7 @@ func newReaderMetrics(reg prometheus.Registerer) readerMetrics { return readerMetrics{ receiveDelayWhenStarting: receiveDelay.WithLabelValues("starting"), receiveDelayWhenRunning: receiveDelay.WithLabelValues("running"), - kprom: kafka.NewReaderClientMetrics("partition-reader", reg), + kprom: client.NewReaderClientMetrics("partition-reader", reg), fetchWaitDuration: promauto.With(reg).NewHistogram(prometheus.HistogramOpts{ Name: "loki_ingest_storage_reader_records_batch_wait_duration_seconds", Help: "How long a consumer spent waiting for a batch of records from the Kafka client. If fetching is faster than processing, then this will be close to 0.", diff --git a/pkg/kafka/partition/reader_test.go b/pkg/kafka/partition/reader_test.go index 8d548c831241..dfd653de78e3 100644 --- a/pkg/kafka/partition/reader_test.go +++ b/pkg/kafka/partition/reader_test.go @@ -17,6 +17,7 @@ import ( "github.com/twmb/franz-go/pkg/kgo" "github.com/grafana/loki/v3/pkg/kafka" + "github.com/grafana/loki/v3/pkg/kafka/client" "github.com/grafana/loki/v3/pkg/kafka/testkafka" "github.com/grafana/loki/v3/pkg/logproto" ) @@ -58,7 +59,7 @@ func (m *mockConsumer) Flush(ctx context.Context) error { } func TestPartitionReader_BasicFunctionality(t *testing.T) { - _, kafkaCfg := testkafka.CreateCluster(t, 1, "test-topic") + _, kafkaCfg := testkafka.CreateCluster(t, 1, "test") consumer := newMockConsumer() consumerFactory := func(_ Committer) (Consumer, error) { @@ -67,7 +68,7 @@ func TestPartitionReader_BasicFunctionality(t *testing.T) { partitionReader, err := NewReader(kafkaCfg, 0, "test-consumer-group", consumerFactory, log.NewNopLogger(), prometheus.NewRegistry()) require.NoError(t, err) - producer, err := kafka.NewWriterClient(kafkaCfg, 100, log.NewNopLogger(), prometheus.NewRegistry()) + producer, err := client.NewWriterClient(kafkaCfg, 100, log.NewNopLogger(), prometheus.NewRegistry()) require.NoError(t, err) err = services.StartAndAwaitRunning(context.Background(), partitionReader) @@ -82,8 +83,8 @@ func TestPartitionReader_BasicFunctionality(t *testing.T) { require.NoError(t, err) require.Len(t, records, 1) - producer.ProduceSync(context.Background(), records...) - producer.ProduceSync(context.Background(), records...) 
+ require.NoError(t, producer.ProduceSync(context.Background(), records...).FirstErr()) + require.NoError(t, producer.ProduceSync(context.Background(), records...).FirstErr()) // Wait for records to be processed assert.Eventually(t, func() bool { @@ -121,7 +122,7 @@ func TestPartitionReader_ProcessCatchUpAtStartup(t *testing.T) { partitionReader, err := NewReader(kafkaCfg, 0, "test-consumer-group", consumerFactory, log.NewNopLogger(), prometheus.NewRegistry()) require.NoError(t, err) - producer, err := kafka.NewWriterClient(kafkaCfg, 100, log.NewNopLogger(), prometheus.NewRegistry()) + producer, err := client.NewWriterClient(kafkaCfg, 100, log.NewNopLogger(), prometheus.NewRegistry()) require.NoError(t, err) stream := logproto.Stream{ @@ -175,11 +176,11 @@ func TestPartitionReader_ProcessCommits(t *testing.T) { partitionID := int32(0) partitionReader, err := NewReader(kafkaCfg, partitionID, "test-consumer-group", consumerFactory, log.NewNopLogger(), prometheus.NewRegistry()) require.NoError(t, err) - producer, err := kafka.NewWriterClient(kafkaCfg, 100, log.NewNopLogger(), prometheus.NewRegistry()) + producer, err := client.NewWriterClient(kafkaCfg, 100, log.NewNopLogger(), prometheus.NewRegistry()) require.NoError(t, err) // Init the client: This usually happens in "start" but we want to manage our own lifecycle for this test. - partitionReader.client, err = kafka.NewReaderClient(kafkaCfg, nil, log.NewNopLogger(), + partitionReader.client, err = client.NewReaderClient(kafkaCfg, nil, log.NewNopLogger(), kgo.ConsumePartitions(map[string]map[int32]kgo.Offset{ kafkaCfg.Topic: {partitionID: kgo.NewOffset().AtStart()}, }), diff --git a/pkg/kafka/testkafka/cluster.go b/pkg/kafka/testkafka/cluster.go index cc5847c2bfd3..c70e3da4a71c 100644 --- a/pkg/kafka/testkafka/cluster.go +++ b/pkg/kafka/testkafka/cluster.go @@ -16,8 +16,8 @@ import ( ) // CreateCluster returns a fake Kafka cluster for unit testing. -func CreateCluster(t testing.TB, numPartitions int32, topicName string) (*kfake.Cluster, kafka.Config) { - cluster, addr := CreateClusterWithoutCustomConsumerGroupsSupport(t, numPartitions, topicName) +func CreateCluster(t testing.TB, numPartitions int32, topicName string, opts ...kfake.Opt) (*kfake.Cluster, kafka.Config) { + cluster, addr := CreateClusterWithoutCustomConsumerGroupsSupport(t, numPartitions, topicName, opts...) addSupportForConsumerGroups(t, cluster, topicName, numPartitions) return cluster, createTestKafkaConfig(addr, topicName) @@ -34,8 +34,16 @@ func createTestKafkaConfig(clusterAddr, topicName string) kafka.Config { return cfg } -func CreateClusterWithoutCustomConsumerGroupsSupport(t testing.TB, numPartitions int32, topicName string) (*kfake.Cluster, string) { - cluster, err := kfake.NewCluster(kfake.NumBrokers(1), kfake.SeedTopics(numPartitions, topicName)) +func CreateClusterWithoutCustomConsumerGroupsSupport(t testing.TB, numPartitions int32, topicName string, opts ...kfake.Opt) (*kfake.Cluster, string) { + cfg := []kfake.Opt{ + kfake.NumBrokers(1), + kfake.SeedTopics(numPartitions, topicName), + } + + // Apply options. + cfg = append(cfg, opts...) + + cluster, err := kfake.NewCluster(cfg...) 
require.NoError(t, err) t.Cleanup(cluster.Close) diff --git a/pkg/logql/log/fmt.go b/pkg/logql/log/fmt.go index c69aa3d40bb0..74eb0262d39d 100644 --- a/pkg/logql/log/fmt.go +++ b/pkg/logql/log/fmt.go @@ -391,9 +391,9 @@ func (lf *LabelsFormatter) Process(ts int64, l []byte, lbs *LabelsBuilder) ([]by defer smp.Put(m) for _, f := range lf.formats { if f.Rename { - v, category, ok := lbs.GetWithCategory(f.Value) + v, _, ok := lbs.GetWithCategory(f.Value) if ok { - lbs.Set(category, f.Name, v) + lbs.Set(ParsedLabel, f.Name, v) lbs.Del(f.Value) } continue diff --git a/pkg/logql/log/fmt_test.go b/pkg/logql/log/fmt_test.go index 2028d2e00bf8..9cb449a7bfdd 100644 --- a/pkg/logql/log/fmt_test.go +++ b/pkg/logql/log/fmt_test.go @@ -515,6 +515,22 @@ func Test_labelsFormatter_Format(t *testing.T) { in labels.Labels want labels.Labels }{ + { + "rename label", + mustNewLabelsFormatter([]LabelFmt{ + NewRenameLabelFmt("baz", "foo"), + }), + labels.FromStrings("foo", "blip", "bar", "blop"), + labels.FromStrings("bar", "blop", "baz", "blip"), + }, + { + "rename and overwrite existing label", + mustNewLabelsFormatter([]LabelFmt{ + NewRenameLabelFmt("bar", "foo"), + }), + labels.FromStrings("foo", "blip", "bar", "blop"), + labels.FromStrings("bar", "blip"), + }, { "combined with template", mustNewLabelsFormatter([]LabelFmt{NewTemplateLabelFmt("foo", "{{.foo}} and {{.bar}}")}), diff --git a/pkg/logql/log/parser.go b/pkg/logql/log/parser.go index c8e65061ba41..50d973eb8b7d 100644 --- a/pkg/logql/log/parser.go +++ b/pkg/logql/log/parser.go @@ -625,6 +625,8 @@ func (j *JSONExpressionParser) Process(_ int64, line []byte, lbs *LabelsBuilder) switch typ { case jsonparser.Null: lbs.Set(ParsedLabel, key, "") + case jsonparser.Object: + lbs.Set(ParsedLabel, key, string(data)) default: lbs.Set(ParsedLabel, key, unescapeJSONString(data)) } diff --git a/pkg/logql/log/parser_test.go b/pkg/logql/log/parser_test.go index 5ac57b9ef054..5ac3a8750363 100644 --- a/pkg/logql/log/parser_test.go +++ b/pkg/logql/log/parser_test.go @@ -542,13 +542,35 @@ func TestJSONExpressionParser(t *testing.T) { ), NoParserHints(), }, + { + "nested object with escaped value", + []byte(`{"app":{"name":"great \"loki\""}`), + []LabelExtractionExpr{ + NewLabelExtractionExpr("app", `app`), + }, + labels.FromStrings("foo", "bar"), + labels.FromStrings("foo", "bar", + "app", `{"name":"great \"loki\""}`, + ), + NoParserHints(), + }, + { + "field with escaped value inside the json string", + []byte(`{"app":"{\"name\":\"great \\\"loki\\\"\"}"}`), + []LabelExtractionExpr{ + NewLabelExtractionExpr("app", `app`), + }, + labels.FromStrings("foo", "bar"), + labels.FromStrings("foo", "bar", + "app", `{"name":"great \"loki\""}`, + ), + NoParserHints(), + }, } for _, tt := range tests { - j, err := NewJSONExpressionParser(tt.expressions) - if err != nil { - t.Fatalf("cannot create JSON expression parser: %s", err.Error()) - } t.Run(tt.name, func(t *testing.T) { + j, err := NewJSONExpressionParser(tt.expressions) + require.NoError(t, err, "cannot create JSON expression parser") b := NewBaseLabelsBuilderWithGrouping(nil, tt.hints, false, false).ForLabels(tt.lbs, tt.lbs.Hash()) b.Reset() _, _ = j.Process(0, tt.line, b) diff --git a/pkg/validation/limits.go b/pkg/validation/limits.go index 5da6bc9cfc61..59d4a3f99e4e 100644 --- a/pkg/validation/limits.go +++ b/pkg/validation/limits.go @@ -208,6 +208,7 @@ type Limits struct { BloomBuilderResponseTimeout time.Duration `yaml:"bloom_build_builder_response_timeout" json:"bloom_build_builder_response_timeout" 
category:"experimental"` BloomCreationEnabled bool `yaml:"bloom_creation_enabled" json:"bloom_creation_enabled" category:"experimental"` + BloomPlanningStrategy string `yaml:"bloom_planning_strategy" json:"bloom_planning_strategy" category:"experimental"` BloomSplitSeriesKeyspaceBy int `yaml:"bloom_split_series_keyspace_by" json:"bloom_split_series_keyspace_by" category:"experimental"` BloomBlockEncoding string `yaml:"bloom_block_encoding" json:"bloom_block_encoding" category:"experimental"` @@ -389,7 +390,8 @@ func (l *Limits) RegisterFlags(f *flag.FlagSet) { ) f.BoolVar(&l.BloomCreationEnabled, "bloom-build.enable", false, "Experimental. Whether to create blooms for the tenant.") - f.IntVar(&l.BloomSplitSeriesKeyspaceBy, "bloom-build.split-keyspace-by", 256, "Experimental. Number of splits to create for the series keyspace when building blooms. The series keyspace is split into this many parts to parallelize bloom creation.") + f.StringVar(&l.BloomPlanningStrategy, "bloom-build.planning-strategy", "split_keyspace_by_factor", "Experimental. Bloom planning strategy to use in bloom creation. Can be one of: 'split_keyspace_by_factor'") + f.IntVar(&l.BloomSplitSeriesKeyspaceBy, "bloom-build.split-keyspace-by", 256, "Experimental. Only if `bloom-build.planning-strategy` is 'split'. Number of splits to create for the series keyspace when building blooms. The series keyspace is split into this many parts to parallelize bloom creation.") f.IntVar(&l.BloomBuildMaxBuilders, "bloom-build.max-builders", 0, "Experimental. Maximum number of builders to use when building blooms. 0 allows unlimited builders.") f.DurationVar(&l.BloomBuilderResponseTimeout, "bloom-build.builder-response-timeout", 0, "Experimental. Timeout for a builder to finish a task. If a builder does not respond within this time, it is considered failed and the task will be requeued. 0 disables the timeout.") f.IntVar(&l.BloomBuildTaskMaxRetries, "bloom-build.task-max-retries", 3, "Experimental. Maximum number of retries for a failed task. If a task fails more than this number of times, it is considered failed and will not be retried. A value of 0 disables this limit.") @@ -996,6 +998,10 @@ func (o *Overrides) BloomCreationEnabled(userID string) bool { return o.getOverridesForUser(userID).BloomCreationEnabled } +func (o *Overrides) BloomPlanningStrategy(userID string) string { + return o.getOverridesForUser(userID).BloomPlanningStrategy +} + func (o *Overrides) BloomSplitSeriesKeyspaceBy(userID string) int { return o.getOverridesForUser(userID).BloomSplitSeriesKeyspaceBy } diff --git a/production/helm/loki/CHANGELOG.md b/production/helm/loki/CHANGELOG.md index 55ed8d2ed684..28e0d8ca9797 100644 --- a/production/helm/loki/CHANGELOG.md +++ b/production/helm/loki/CHANGELOG.md @@ -13,6 +13,8 @@ Entries should include a reference to the pull request that introduced the chang [//]: # ( : do not remove this line. This locator is used by the CI pipeline to automatically create a changelog entry for each new Loki release. Add other chart versions and respective changelog entries bellow this line.) +## 6.18.0 + ## 6.17.1 - [BUGFIX] Added missing `loki.storage.azure.chunkDelimiter` parameter to Helm chart. diff --git a/production/helm/loki/Chart.yaml b/production/helm/loki/Chart.yaml index 2381bac04810..273bdbbb7db1 100644 --- a/production/helm/loki/Chart.yaml +++ b/production/helm/loki/Chart.yaml @@ -3,7 +3,7 @@ name: loki description: Helm chart for Grafana Loki and Grafana Enterprise Logs supporting both simple, scalable and distributed modes. 
type: application appVersion: 3.2.0 -version: 6.17.1 +version: 6.18.0 home: https://grafana.github.io/helm-charts sources: - https://github.com/grafana/loki diff --git a/production/helm/loki/README.md b/production/helm/loki/README.md index e152718c170f..94f70c56bf94 100644 --- a/production/helm/loki/README.md +++ b/production/helm/loki/README.md @@ -1,6 +1,6 @@ # loki -![Version: 6.17.1](https://img.shields.io/badge/Version-6.17.1-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: 3.2.0](https://img.shields.io/badge/AppVersion-3.2.0-informational?style=flat-square) +![Version: 6.18.0](https://img.shields.io/badge/Version-6.18.0-informational?style=flat-square) ![Type: application](https://img.shields.io/badge/Type-application-informational?style=flat-square) ![AppVersion: 3.2.0](https://img.shields.io/badge/AppVersion-3.2.0-informational?style=flat-square) Helm chart for Grafana Loki and Grafana Enterprise Logs supporting both simple, scalable and distributed modes. diff --git a/production/helm/loki/templates/networkpolicy.yaml b/production/helm/loki/templates/networkpolicy.yaml index 5052e81162b3..9286edb74eff 100644 --- a/production/helm/loki/templates/networkpolicy.yaml +++ b/production/helm/loki/templates/networkpolicy.yaml @@ -66,7 +66,7 @@ spec: {{- include "loki.selectorLabels" . | nindent 6 }} ingress: - ports: - - port: http + - port: http-metrics protocol: TCP {{- if .Values.networkPolicy.ingress.namespaceSelector }} from: diff --git a/tools/doc-generator/parse/root_blocks.go b/tools/doc-generator/parse/root_blocks.go index 1bfcc57bc896..43e8f800dbba 100644 --- a/tools/doc-generator/parse/root_blocks.go +++ b/tools/doc-generator/parse/root_blocks.go @@ -280,12 +280,14 @@ When a memberlist config with atleast 1 join_members is defined, kvstore of type Desc: `Configures additional object stores for a given storage provider. Supported stores: aws, azure, bos, filesystem, gcs, swift. Example: -storage_config: - named_stores: - aws: - store-1: - endpoint: s3://foo-bucket - region: us-west1 +` + "```yaml" + ` + storage_config: + named_stores: + aws: + store-1: + endpoint: s3://foo-bucket + region: us-west1 +` + "```" + ` Named store from this example can be used by setting object_store to store-1 in period_config.`, }, { diff --git a/vendor/github.com/twmb/franz-go/pkg/sasl/plain/plain.go b/vendor/github.com/twmb/franz-go/pkg/sasl/plain/plain.go new file mode 100644 index 000000000000..97a9369d1372 --- /dev/null +++ b/vendor/github.com/twmb/franz-go/pkg/sasl/plain/plain.go @@ -0,0 +1,60 @@ +// Package plain provides PLAIN sasl authentication as specified in RFC4616. +package plain + +import ( + "context" + "errors" + + "github.com/twmb/franz-go/pkg/sasl" +) + +// Auth contains information for authentication. +type Auth struct { + // Zid is an optional authorization ID to use in authenticating. + Zid string + + // User is username to use for authentication. + User string + + // Pass is the password to use for authentication. + Pass string + + _ struct{} // require explicit field initialization +} + +// AsMechanism returns a sasl mechanism that will use 'a' as credentials for +// all sasl sessions. +// +// This is a shortcut for using the Plain function and is useful when you do +// not need to live-rotate credentials. 
+func (a Auth) AsMechanism() sasl.Mechanism { + return Plain(func(context.Context) (Auth, error) { + return a, nil + }) +} + +// Plain returns a sasl mechanism that will call authFn whenever sasl +// authentication is needed. The returned Auth is used for a single session. +func Plain(authFn func(context.Context) (Auth, error)) sasl.Mechanism { + return plain(authFn) +} + +type plain func(context.Context) (Auth, error) + +func (plain) Name() string { return "PLAIN" } +func (fn plain) Authenticate(ctx context.Context, _ string) (sasl.Session, []byte, error) { + auth, err := fn(ctx) + if err != nil { + return nil, nil, err + } + if auth.User == "" || auth.Pass == "" { + return nil, nil, errors.New("PLAIN user and pass must be non-empty") + } + return session{}, []byte(auth.Zid + "\x00" + auth.User + "\x00" + auth.Pass), nil +} + +type session struct{} + +func (session) Challenge([]byte) (bool, []byte, error) { + return true, nil, nil +} diff --git a/vendor/modules.txt b/vendor/modules.txt index 49e7bb611899..8e8e074487f9 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -1602,6 +1602,7 @@ github.com/twmb/franz-go/pkg/kgo github.com/twmb/franz-go/pkg/kgo/internal/sticky github.com/twmb/franz-go/pkg/kversion github.com/twmb/franz-go/pkg/sasl +github.com/twmb/franz-go/pkg/sasl/plain # github.com/twmb/franz-go/pkg/kadm v1.13.0 ## explicit; go 1.21 github.com/twmb/franz-go/pkg/kadm
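// --- Illustrative aside (not part of this patch) ---------------------------
// The Kafka changes above move the client constructors into pkg/kafka/client
// and add SASL/PLAIN credentials (sasl_username / sasl_password) to
// kafka.Config, validated so that both must be set together. The sketch below
// shows how a caller might wire this up; the broker address, topic and
// credentials are placeholders, and ProducerMaxRecordSizeBytes mirrors the
// value used in the new config_test.go so that Validate() passes.
package main

import (
	"context"
	"time"

	"github.com/go-kit/log"
	"github.com/grafana/dskit/flagext"
	"github.com/prometheus/client_golang/prometheus"

	"github.com/grafana/loki/v3/pkg/kafka"
	"github.com/grafana/loki/v3/pkg/kafka/client"
)

func main() {
	cfg := kafka.Config{
		Address:                    "broker-1:9092", // placeholder
		Topic:                      "loki.push",     // placeholder
		WriteTimeout:               10 * time.Second,
		ProducerMaxRecordSizeBytes: 1048576,
		// Both SASL fields must be set together; Validate() enforces this.
		SASLUsername: "loki",
		SASLPassword: flagext.SecretWithValue("placeholder-password"),
	}
	if err := cfg.Validate(); err != nil {
		panic(err)
	}

	// 20 in-flight produce requests, matching the distributor's usage above.
	writer, err := client.NewWriterClient(cfg, 20, log.NewNopLogger(), prometheus.NewRegistry())
	if err != nil {
		panic(err)
	}
	defer writer.Close()

	// Ping verifies connectivity (and the SASL credentials) against the brokers.
	_ = writer.Ping(context.Background())
}
// ---------------------------------------------------------------------------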