From 5c11b8060cd61ef6230520dd56b635932434395c Mon Sep 17 00:00:00 2001 From: kpango Date: Mon, 5 Aug 2024 10:13:22 +0900 Subject: [PATCH] [BUGFIX] index correction process Signed-off-by: kpango --- Makefile | 5 +- Makefile.d/dependencies.mk | 4 +- Makefile.d/k8s.mk | 14 +- Makefile.d/proto.mk | 10 +- Makefile.d/test.mk | 16 +- .../vald-helm-operator/crds/valdrelease.yaml | 175 ++++ charts/vald/README.md | 1 + .../index/job/correction/configmap.yaml | 21 +- charts/vald/values.schema.json | 314 +++++++ charts/vald/values.yaml | 3 + go.mod | 20 +- go.sum | 34 +- internal/config/corrector.go | 6 + internal/db/kvs/pogreb/pogreb.go | 6 +- internal/db/kvs/pogreb/pogreb_test.go | 48 +- k8s/index/job/correction/configmap.yaml | 450 +++++++++ k8s/index/job/correction/cronjob.yaml | 144 +++ k8s/index/job/creation/configmap.yaml | 370 ++++++++ k8s/index/job/creation/cronjob.yaml | 144 +++ k8s/index/job/save/configmap.yaml | 370 ++++++++ k8s/index/job/save/cronjob.yaml | 144 +++ k8s/index/operator/configmap.yaml | 28 + k8s/index/operator/deployment.yaml | 173 ++++ k8s/index/operator/priorityclass.yaml | 30 + k8s/operator/helm/crds/valdrelease.yaml | 175 ++++ pkg/index/job/correction/service/corrector.go | 861 +++++++----------- pkg/index/job/correction/service/options.go | 34 +- pkg/index/job/correction/usecase/corrector.go | 21 +- rust/Cargo.lock | 27 +- versions/CMAKE_VERSION | 2 +- versions/PROMETHEUS_STACK_VERSION | 2 +- versions/actions/ACTIONS_UPLOAD_ARTIFACT | 2 +- 32 files changed, 2997 insertions(+), 657 deletions(-) diff --git a/Makefile b/Makefile index 6ed06ad318..6d2b0ed842 100644 --- a/Makefile +++ b/Makefile @@ -17,11 +17,12 @@ SHELL = bash ORG ?= vdaas NAME = vald -GOPKG = github.com/$(ORG)/$(NAME) +REPO = $(ORG)/$(NAME) +GOPKG = github.com/$(REPO) DATETIME = $(eval DATETIME := $(shell date -u +%Y/%m/%d_%H:%M:%S%z))$(DATETIME) TAG ?= latest CRORG ?= $(ORG) -GHCRORG = ghcr.io/$(ORG)/$(NAME) +GHCRORG = ghcr.io/$(REPO) AGENT_NGT_IMAGE = $(NAME)-agent-ngt 
AGENT_FAISS_IMAGE = $(NAME)-agent-faiss AGENT_SIDECAR_IMAGE = $(NAME)-agent-sidecar diff --git a/Makefile.d/dependencies.mk b/Makefile.d/dependencies.mk index 95b2895d74..928ed8c9d2 100644 --- a/Makefile.d/dependencies.mk +++ b/Makefile.d/dependencies.mk @@ -225,12 +225,12 @@ update/hdf5: .PHONY: update/vald ## update vald it's self version update/vald: - curl -fsSL https://api.github.com/repos/vdaas/vald/releases/latest | grep -Po '"tag_name": "\K.*?(?=")' > $(ROOTDIR)/versions/VALD_VERSION + curl -fsSL https://api.github.com/repos/$(REPO)/releases/latest | grep -Po '"tag_name": "\K.*?(?=")' > $(ROOTDIR)/versions/VALD_VERSION .PHONY: update/valdcli ## update vald client library made by clojure self version update/valdcli: - curl -fsSL https://api.github.com/repos/vdaas/vald-client-clj/releases/latest | grep -Po '"tag_name": "\K.*?(?=")' > $(ROOTDIR)/versions/VALDCLI_VERSION + curl -fsSL https://api.github.com/repos/$(REPO)-client-clj/releases/latest | grep -Po '"tag_name": "\K.*?(?=")' > $(ROOTDIR)/versions/VALDCLI_VERSION .PHONY: update/template ## update PULL_REQUEST_TEMPLATE and ISSUE_TEMPLATE diff --git a/Makefile.d/k8s.mk b/Makefile.d/k8s.mk index 23c6d9f77a..628fac13b5 100644 --- a/Makefile.d/k8s.mk +++ b/Makefile.d/k8s.mk @@ -37,6 +37,10 @@ k8s/manifest/update: \ helm template \ --values $(HELM_VALUES) \ $(HELM_EXTRA_OPTIONS) \ + --set manager.index.operator.enabled=true \ + --set manager.index.saver.enabled=true \ + --set manager.index.creator.enabled=true \ + --set manager.index.corrector.enabled=true \ --output-dir $(TEMP_DIR) \ charts/vald mkdir -p k8s/gateway @@ -536,27 +540,27 @@ $(BINDIR)/telepresence: .PHONY: telepresence/swap/agent-ngt ## swap agent-ngt deployment using telepresence telepresence/swap/agent-ngt: - $(call telepresence,vald-agent-ngt,vdaas/vald-agent-ngt) + $(call telepresence,vald-agent-ngt,$(REPO)-agent-ngt) .PHONY: telepresence/swap/agent-faiss ## swap agent-faiss deployment using telepresence telepresence/swap/agent-faiss: - 
$(call telepresence,vald-agent-faiss,vdaas/vald-agent-faiss) + $(call telepresence,vald-agent-faiss,$(REPO)-agent-faiss) .PHONY: telepresence/swap/discoverer ## swap discoverer deployment using telepresence telepresence/swap/discoverer: - $(call telepresence,vald-discoverer,vdaas/vald-discoverer-k8s) + $(call telepresence,vald-discoverer,$(REPO)-discoverer-k8s) .PHONY: telepresence/swap/manager-index ## swap manager-index deployment using telepresence telepresence/swap/manager-index: - $(call telepresence,vald-manager-index,vdaas/vald-manager-index) + $(call telepresence,vald-manager-index,$(REPO)-manager-index) .PHONY: telepresence/swap/lb-gateway ## swap lb-gateway deployment using telepresence telepresence/swap/lb-gateway: - $(call telepresence,vald-lb-gateway,vdaas/vald-lb-gateway) + $(call telepresence,vald-lb-gateway,$(REPO)-lb-gateway) .PHONY: kubelinter/install ## install kubelinter diff --git a/Makefile.d/proto.mk b/Makefile.d/proto.mk index 50669a111f..2eef443f91 100644 --- a/Makefile.d/proto.mk +++ b/Makefile.d/proto.mk @@ -47,7 +47,7 @@ $(GOBIN)/buf: $(ROOTDIR)/apis/proto/v1/rpc/errdetails/error_details.proto: curl -fsSL https://raw.githubusercontent.com/googleapis/googleapis/master/google/rpc/error_details.proto -o $(ROOTDIR)/apis/proto/v1/rpc/errdetails/error_details.proto sed -i -e "s/package google.rpc/package rpc.v1/" $(ROOTDIR)/apis/proto/v1/rpc/errdetails/error_details.proto - sed -i -e "s%google.golang.org/genproto/googleapis/rpc/errdetails;errdetails%github.com/vdaas/vald/apis/grpc/v1/rpc/errdetails%" $(ROOTDIR)/apis/proto/v1/rpc/errdetails/error_details.proto + sed -i -e "s%google.golang.org/genproto/googleapis/rpc/errdetails;errdetails%$(GOPKG)/apis/grpc/v1/rpc/errdetails%" $(ROOTDIR)/apis/proto/v1/rpc/errdetails/error_details.proto sed -i -e "s/com.google.rpc/org.vdaas.vald.api.v1.rpc/" $(ROOTDIR)/apis/proto/v1/rpc/errdetails/error_details.proto proto/gen: \ @@ -59,9 +59,9 @@ proto/gen: \ make proto/replace proto/replace: - find 
$(ROOTDIR)/apis/grpc/* -name '*.go' | xargs -P$(CORES) sed -i -E "s%google.golang.org/grpc/codes%github.com/vdaas/vald/internal/net/grpc/codes%g" - find $(ROOTDIR)/apis/grpc/* -name '*.go' | xargs -P$(CORES) sed -i -E "s%google.golang.org/grpc/status%github.com/vdaas/vald/internal/net/grpc/status%g" - find $(ROOTDIR)/apis/grpc/* -name '*.go' | xargs -P$(CORES) sed -i -E "s%\"io\"%\"github.com/vdaas/vald/internal/io\"%g" - find $(ROOTDIR)/apis/grpc/* -name '*.go' | xargs -P$(CORES) sed -i -E "s%\"sync\"%\"github.com/vdaas/vald/internal/sync\"%g" + find $(ROOTDIR)/apis/grpc/* -name '*.go' | xargs -P$(CORES) sed -i -E "s%google.golang.org/grpc/codes%$(GOPKG)/internal/net/grpc/codes%g" + find $(ROOTDIR)/apis/grpc/* -name '*.go' | xargs -P$(CORES) sed -i -E "s%google.golang.org/grpc/status%$(GOPKG)/internal/net/grpc/status%g" + find $(ROOTDIR)/apis/grpc/* -name '*.go' | xargs -P$(CORES) sed -i -E "s%\"io\"%\"$(GOPKG)/internal/io\"%g" + find $(ROOTDIR)/apis/grpc/* -name '*.go' | xargs -P$(CORES) sed -i -E "s%\"sync\"%\"$(GOPKG)/internal/sync\"%g" find $(ROOTDIR)/apis/grpc/* -name '*.go' | xargs -P$(CORES) sed -i -E "s%interface\{\}%any%g" find $(ROOTDIR)/apis/grpc/* -name '*.go' | xargs -P$(CORES) sed -i -E "s%For_%For%g" diff --git a/Makefile.d/test.mk b/Makefile.d/test.mk index 2ecf16a87b..511c7e0fa5 100644 --- a/Makefile.d/test.mk +++ b/Makefile.d/test.mk @@ -344,15 +344,15 @@ gotests/gen-test: ## apply patches to generated go test files gotests/patch: @$(call green, "apply patches to go test files...") - find $(ROOTDIR)/internal/k8s/* -name '*_test.go' | xargs -P$(CORES) sed -i -E "s%k8s.io/apimachinery/pkg/api/errors%github.com/vdaas/vald/internal/errors%g" - find $(ROOTDIR)/* -name '*_test.go' | xargs -P$(CORES) sed -i -E "s%cockroachdb/errors%vdaas/vald/internal/errors%g" - find $(ROOTDIR)/* -name '*_test.go' | xargs -P$(CORES) sed -i -E "s%golang.org/x/sync/errgroup%github.com/vdaas/vald/internal/sync/errgroup%g" - find $(ROOTDIR)/* -name '*_test.go' | xargs 
-P$(CORES) sed -i -E "s%pkg/errors%vdaas/vald/internal/errors%g" - find $(ROOTDIR)/* -name '*_test.go' | xargs -P$(CORES) sed -i -E "s%go-errors/errors%vdaas/vald/internal/errors%g" - find $(ROOTDIR)/* -name '*_test.go' | xargs -P$(CORES) sed -i -E "s%go.uber.org/goleak%github.com/vdaas/vald/internal/test/goleak%g" - find $(ROOTDIR)/internal/errors -name '*_test.go' | xargs -P$(CORES) sed -i -E "s%\"github.com/vdaas/vald/internal/errors\"%%g" + find $(ROOTDIR)/internal/k8s/* -name '*_test.go' | xargs -P$(CORES) sed -i -E "s%k8s.io/apimachinery/pkg/api/errors%$(GOPKG)/internal/errors%g" + find $(ROOTDIR)/* -name '*_test.go' | xargs -P$(CORES) sed -i -E "s%cockroachdb/errors%$(REPO)/internal/errors%g" + find $(ROOTDIR)/* -name '*_test.go' | xargs -P$(CORES) sed -i -E "s%golang.org/x/sync/errgroup%$(GOPKG)/internal/sync/errgroup%g" + find $(ROOTDIR)/* -name '*_test.go' | xargs -P$(CORES) sed -i -E "s%pkg/errors%$(REPO)/internal/errors%g" + find $(ROOTDIR)/* -name '*_test.go' | xargs -P$(CORES) sed -i -E "s%go-errors/errors%$(REPO)/internal/errors%g" + find $(ROOTDIR)/* -name '*_test.go' | xargs -P$(CORES) sed -i -E "s%go.uber.org/goleak%$(GOPKG)/internal/test/goleak%g" + find $(ROOTDIR)/internal/errors -name '*_test.go' | xargs -P$(CORES) sed -i -E "s%\"$(GOPKG)/internal/errors\"%%g" find $(ROOTDIR)/internal/errors -name '*_test.go' -not -name '*_benchmark_test.go' | xargs -P$(CORES) sed -i -E "s/errors\.//g" - find $(ROOTDIR)/internal/test/goleak -name '*_test.go' | xargs -P$(CORES) sed -i -E "s%\"github.com/vdaas/vald/internal/test/goleak\"%%g" + find $(ROOTDIR)/internal/test/goleak -name '*_test.go' | xargs -P$(CORES) sed -i -E "s%\"$(GOPKG)/internal/test/goleak\"%%g" find $(ROOTDIR)/internal/test/goleak -name '*_test.go' | xargs -P$(CORES) sed -i -E "s/goleak\.//g" .PHONY: test/patch-placeholder diff --git a/charts/vald-helm-operator/crds/valdrelease.yaml b/charts/vald-helm-operator/crds/valdrelease.yaml index 88657d76e0..99c4cc0ec6 100644 --- 
a/charts/vald-helm-operator/crds/valdrelease.yaml +++ b/charts/vald-helm-operator/crds/valdrelease.yaml @@ -7888,6 +7888,181 @@ spec: items: type: object x-kubernetes-preserve-unknown-fields: true + gateway: + type: object + properties: + addrs: + type: array + items: + type: string + backoff: + type: object + properties: + backoff_factor: + type: number + backoff_time_limit: + type: string + enable_error_log: + type: boolean + initial_duration: + type: string + jitter_limit: + type: string + maximum_duration: + type: string + retry_count: + type: integer + call_option: + type: object + x-kubernetes-preserve-unknown-fields: true + circuit_breaker: + type: object + properties: + closed_error_rate: + type: number + closed_refresh_timeout: + type: string + half_open_error_rate: + type: number + min_samples: + type: integer + open_timeout: + type: string + connection_pool: + type: object + properties: + enable_dns_resolver: + type: boolean + enable_rebalance: + type: boolean + old_conn_close_duration: + type: string + rebalance_duration: + type: string + size: + type: integer + dial_option: + type: object + properties: + backoff_base_delay: + type: string + backoff_jitter: + type: number + backoff_max_delay: + type: string + backoff_multiplier: + type: number + enable_backoff: + type: boolean + initial_connection_window_size: + type: integer + initial_window_size: + type: integer + insecure: + type: boolean + interceptors: + type: array + items: + type: string + enum: + - TraceInterceptor + keepalive: + type: object + properties: + permit_without_stream: + type: boolean + time: + type: string + timeout: + type: string + max_msg_size: + type: integer + min_connection_timeout: + type: string + net: + type: object + properties: + dialer: + type: object + properties: + dual_stack_enabled: + type: boolean + keepalive: + type: string + timeout: + type: string + dns: + type: object + properties: + cache_enabled: + type: boolean + cache_expiration: + type: string + 
refresh_duration: + type: string + socket_option: + type: object + properties: + ip_recover_destination_addr: + type: boolean + ip_transparent: + type: boolean + reuse_addr: + type: boolean + reuse_port: + type: boolean + tcp_cork: + type: boolean + tcp_defer_accept: + type: boolean + tcp_fast_open: + type: boolean + tcp_no_delay: + type: boolean + tcp_quick_ack: + type: boolean + tls: + type: object + properties: + ca: + type: string + cert: + type: string + enabled: + type: boolean + insecure_skip_verify: + type: boolean + key: + type: string + read_buffer_size: + type: integer + timeout: + type: string + write_buffer_size: + type: integer + health_check_duration: + type: string + max_recv_msg_size: + type: integer + max_retry_rpc_buffer_size: + type: integer + max_send_msg_size: + type: integer + tls: + type: object + properties: + ca: + type: string + cert: + type: string + enabled: + type: boolean + insecure_skip_verify: + type: boolean + key: + type: string + wait_for_ready: + type: boolean image: type: object properties: diff --git a/charts/vald/README.md b/charts/vald/README.md index 31f0ef1494..c887075fdd 100644 --- a/charts/vald/README.md +++ b/charts/vald/README.md @@ -855,6 +855,7 @@ Run the following command to install the chart, | manager.index.corrector.discoverer.duration | string | `"500ms"` | refresh duration to discover | | manager.index.corrector.enabled | bool | `false` | enable index correction CronJob | | manager.index.corrector.env | list | `[{"name":"MY_NODE_NAME","valueFrom":{"fieldRef":{"fieldPath":"spec.nodeName"}}},{"name":"MY_POD_NAME","valueFrom":{"fieldRef":{"fieldPath":"metadata.name"}}},{"name":"MY_POD_NAMESPACE","valueFrom":{"fieldRef":{"fieldPath":"metadata.namespace"}}}]` | environment variables | +| manager.index.corrector.gateway | object | `{}` | gRPC client for gateway (overrides defaults.grpc.client) | | manager.index.corrector.image.pullPolicy | string | `"Always"` | | | manager.index.corrector.image.repository | string | 
`"vdaas/vald-index-correction"` | image repository | | manager.index.corrector.image.tag | string | `""` | image tag (overrides defaults.image.tag) | diff --git a/charts/vald/templates/index/job/correction/configmap.yaml b/charts/vald/templates/index/job/correction/configmap.yaml index 789c7ed037..86207f71fa 100644 --- a/charts/vald/templates/index/job/correction/configmap.yaml +++ b/charts/vald/templates/index/job/correction/configmap.yaml @@ -53,10 +53,29 @@ data: stream_list_concurrency: {{ $corrector.stream_list_concurrency }} kvs_async_write_concurrency: {{ $corrector.kvs_async_write_concurrency }} index_replica: {{ $gateway.gateway_config.index_replica }} + gateway: + {{- $nextGatewayClient := $corrector.gateway}} + {{- if $gateway.enabled -}} + {{- $lbServerPort := $gateway.server_config.servers.grpc.port }} + {{- $defaultLBHost := printf "%s.%s.svc.cluster.local" $gateway.name .Release.Namespace }} + {{- $defaultLBPort := default .Values.defaults.server_config.servers.grpc.port $lbServerPort }} + {{- $defaultLBAddr := (list (printf "%s:%d" $defaultLBHost (int64 $defaultLBPort))) }} + {{- $lbAddrs := dict "Values" $nextGatewayClient.addrs "default" $defaultLBAddr }} + {{- include "vald.grpc.client.addrs" $lbAddrs | nindent 8 }} + {{- else -}} + {{- $agentServerPort := $agent.server_config.servers.grpc.port }} + {{- $defaultAgentHost := printf "%s.%s.svc.cluster.local" $agent.name .Release.Namespace }} + {{- $defaultAgentPort := default .Values.defaults.server_config.servers.grpc.port $agentServerPort }} + {{- $defaultAgentAddr := (list (printf "%s:%d" $defaultAgentHost (int64 $defaultAgentPort))) }} + {{- $agentAddrs := dict "Values" $nextGatewayClient.addrs "default" $defaultAgentAddr }} + {{- include "vald.grpc.client.addrs" $agentAddrs | nindent 8 }} + {{- end -}} + {{- $nextGRPCClient := dict "Values" $nextGatewayClient "default" .Values.defaults.grpc.client }} + {{- include "vald.grpc.client" $nextGRPCClient | nindent 8 }} discoverer: duration: {{ 
$corrector.discoverer.duration }} client: - {{- $discovererClient := $index.corrector.discoverer.client }} + {{- $discovererClient := $corrector.discoverer.client }} {{- $discovererServerPort := $discoverer.server_config.servers.grpc.port }} {{- $defaultDiscovererHost := printf "%s.%s.svc.cluster.local" $discoverer.name .Release.Namespace }} {{- $defaultDiscovererPort := default .Values.defaults.server_config.servers.grpc.port $discovererServerPort }} diff --git a/charts/vald/values.schema.json b/charts/vald/values.schema.json index 772d20c328..04c4c1ce2b 100644 --- a/charts/vald/values.schema.json +++ b/charts/vald/values.schema.json @@ -13109,6 +13109,320 @@ "description": "environment variables", "items": { "type": "object" } }, + "gateway": { + "type": "object", + "properties": { + "addrs": { + "type": "array", + "description": "gRPC client addresses", + "items": { "type": "string" } + }, + "backoff": { + "type": "object", + "properties": { + "backoff_factor": { + "type": "number", + "description": "gRPC client backoff factor" + }, + "backoff_time_limit": { + "type": "string", + "description": "gRPC client backoff time limit" + }, + "enable_error_log": { + "type": "boolean", + "description": "gRPC client backoff log enabled" + }, + "initial_duration": { + "type": "string", + "description": "gRPC client backoff initial duration" + }, + "jitter_limit": { + "type": "string", + "description": "gRPC client backoff jitter limit" + }, + "maximum_duration": { + "type": "string", + "description": "gRPC client backoff maximum duration" + }, + "retry_count": { + "type": "integer", + "description": "gRPC client backoff retry count" + } + } + }, + "call_option": { "type": "object" }, + "circuit_breaker": { + "type": "object", + "properties": { + "closed_error_rate": { + "type": "number", + "description": "gRPC client circuitbreaker closed error rate" + }, + "closed_refresh_timeout": { + "type": "string", + "description": "gRPC client circuitbreaker closed refresh timeout" + 
}, + "half_open_error_rate": { + "type": "number", + "description": "gRPC client circuitbreaker half-open error rate" + }, + "min_samples": { + "type": "integer", + "description": "gRPC client circuitbreaker minimum sampling count" + }, + "open_timeout": { + "type": "string", + "description": "gRPC client circuitbreaker open timeout" + } + } + }, + "connection_pool": { + "type": "object", + "properties": { + "enable_dns_resolver": { + "type": "boolean", + "description": "enables gRPC client connection pool dns resolver, when enabled vald uses ip handshake exclude dns discovery which improves network performance" + }, + "enable_rebalance": { + "type": "boolean", + "description": "enables gRPC client connection pool rebalance" + }, + "old_conn_close_duration": { + "type": "string", + "description": "makes delay before gRPC client connection closing during connection pool rebalance" + }, + "rebalance_duration": { + "type": "string", + "description": "gRPC client connection pool rebalance duration" + }, + "size": { + "type": "integer", + "description": "gRPC client connection pool size" + } + } + }, + "dial_option": { + "type": "object", + "properties": { + "backoff_base_delay": { + "type": "string", + "description": "gRPC client dial option base backoff delay" + }, + "backoff_jitter": { + "type": "number", + "description": "gRPC client dial option base backoff delay" + }, + "backoff_max_delay": { + "type": "string", + "description": "gRPC client dial option max backoff delay" + }, + "backoff_multiplier": { + "type": "number", + "description": "gRPC client dial option base backoff delay" + }, + "enable_backoff": { + "type": "boolean", + "description": "gRPC client dial option backoff enabled" + }, + "initial_connection_window_size": { + "type": "integer", + "description": "gRPC client dial option initial connection window size" + }, + "initial_window_size": { + "type": "integer", + "description": "gRPC client dial option initial window size" + }, + "insecure": { + 
"type": "boolean", + "description": "gRPC client dial option insecure enabled" + }, + "interceptors": { + "type": "array", + "description": "gRPC client interceptors", + "items": { + "type": "string", + "enum": ["TraceInterceptor"] + } + }, + "keepalive": { + "type": "object", + "properties": { + "permit_without_stream": { + "type": "boolean", + "description": "gRPC client keep alive permit without stream" + }, + "time": { + "type": "string", + "description": "gRPC client keep alive time" + }, + "timeout": { + "type": "string", + "description": "gRPC client keep alive timeout" + } + } + }, + "max_msg_size": { + "type": "integer", + "description": "gRPC client dial option max message size" + }, + "min_connection_timeout": { + "type": "string", + "description": "gRPC client dial option minimum connection timeout" + }, + "net": { + "type": "object", + "properties": { + "dialer": { + "type": "object", + "properties": { + "dual_stack_enabled": { + "type": "boolean", + "description": "gRPC client TCP dialer dual stack enabled" + }, + "keepalive": { + "type": "string", + "description": "gRPC client TCP dialer keep alive" + }, + "timeout": { + "type": "string", + "description": "gRPC client TCP dialer timeout" + } + } + }, + "dns": { + "type": "object", + "properties": { + "cache_enabled": { + "type": "boolean", + "description": "gRPC client TCP DNS cache enabled" + }, + "cache_expiration": { + "type": "string", + "description": "gRPC client TCP DNS cache expiration" + }, + "refresh_duration": { + "type": "string", + "description": "gRPC client TCP DNS cache refresh duration" + } + } + }, + "socket_option": { + "type": "object", + "properties": { + "ip_recover_destination_addr": { + "type": "boolean", + "description": "server listen socket option for ip_recover_destination_addr functionality" + }, + "ip_transparent": { + "type": "boolean", + "description": "server listen socket option for ip_transparent functionality" + }, + "reuse_addr": { + "type": "boolean", + 
"description": "server listen socket option for reuse_addr functionality" + }, + "reuse_port": { + "type": "boolean", + "description": "server listen socket option for reuse_port functionality" + }, + "tcp_cork": { + "type": "boolean", + "description": "server listen socket option for tcp_cork functionality" + }, + "tcp_defer_accept": { + "type": "boolean", + "description": "server listen socket option for tcp_defer_accept functionality" + }, + "tcp_fast_open": { + "type": "boolean", + "description": "server listen socket option for tcp_fast_open functionality" + }, + "tcp_no_delay": { + "type": "boolean", + "description": "server listen socket option for tcp_no_delay functionality" + }, + "tcp_quick_ack": { + "type": "boolean", + "description": "server listen socket option for tcp_quick_ack functionality" + } + } + }, + "tls": { + "type": "object", + "properties": { + "ca": { + "type": "string", + "description": "TLS ca path" + }, + "cert": { + "type": "string", + "description": "TLS cert path" + }, + "enabled": { + "type": "boolean", + "description": "TLS enabled" + }, + "insecure_skip_verify": { + "type": "boolean", + "description": "enable/disable skip SSL certificate verification" + }, + "key": { + "type": "string", + "description": "TLS key path" + } + } + } + } + }, + "read_buffer_size": { + "type": "integer", + "description": "gRPC client dial option read buffer size" + }, + "timeout": { + "type": "string", + "description": "gRPC client dial option timeout" + }, + "write_buffer_size": { + "type": "integer", + "description": "gRPC client dial option write buffer size" + } + } + }, + "health_check_duration": { + "type": "string", + "description": "gRPC client health check duration" + }, + "max_recv_msg_size": { "type": "integer" }, + "max_retry_rpc_buffer_size": { "type": "integer" }, + "max_send_msg_size": { "type": "integer" }, + "tls": { + "type": "object", + "properties": { + "ca": { + "type": "string", + "description": "TLS ca path" + }, + "cert": { + 
"type": "string", + "description": "TLS cert path" + }, + "enabled": { + "type": "boolean", + "description": "TLS enabled" + }, + "insecure_skip_verify": { + "type": "boolean", + "description": "enable/disable skip SSL certificate verification" + }, + "key": { + "type": "string", + "description": "TLS key path" + } + } + }, + "wait_for_ready": { "type": "boolean" } + } + }, "image": { "type": "object", "properties": { diff --git a/charts/vald/values.yaml b/charts/vald/values.yaml index 4d7c2a052b..86d34a0e06 100644 --- a/charts/vald/values.yaml +++ b/charts/vald/values.yaml @@ -3257,6 +3257,9 @@ manager: # @schema {"name": "manager.index.corrector.node_name", "type": "string"} # manager.index.corrector.node_name -- node name node_name: "" # _MY_NODE_NAME_ + # @schema {"name": "manager.index.corrector.gateway", "alias": "grpc.client"} + # manager.index.corrector.gateway -- gRPC client for gateway (overrides defaults.grpc.client) + gateway: {} # @schema {"name": "manager.index.corrector.discoverer", "type": "object"} discoverer: # @schema {"name": "manager.index.corrector.discoverer.duration", "type": "string"} diff --git a/go.mod b/go.mod index 46cf20cff6..2e841150b1 100644 --- a/go.mod +++ b/go.mod @@ -11,11 +11,11 @@ replace ( cloud.google.com/go/iam => cloud.google.com/go/iam v1.1.12 cloud.google.com/go/kms => cloud.google.com/go/kms v1.18.4 cloud.google.com/go/monitoring => cloud.google.com/go/monitoring v1.20.3 - cloud.google.com/go/pubsub => cloud.google.com/go/pubsub v1.40.0 + cloud.google.com/go/pubsub => cloud.google.com/go/pubsub v1.41.0 cloud.google.com/go/secretmanager => cloud.google.com/go/secretmanager v1.13.5 cloud.google.com/go/storage => cloud.google.com/go/storage v1.43.0 cloud.google.com/go/trace => cloud.google.com/go/trace v1.10.11 - code.cloudfoundry.org/bytefmt => code.cloudfoundry.org/bytefmt v0.0.0-20240730181512-d61d30bca0a4 + code.cloudfoundry.org/bytefmt => code.cloudfoundry.org/bytefmt v0.0.0-20240804182054-0a63f33a903d 
contrib.go.opencensus.io/exporter/aws => contrib.go.opencensus.io/exporter/aws v0.0.0-20230502192102-15967c811cec contrib.go.opencensus.io/exporter/prometheus => contrib.go.opencensus.io/exporter/prometheus v0.4.2 contrib.go.opencensus.io/integrations/ocsql => contrib.go.opencensus.io/integrations/ocsql v0.1.7 @@ -50,7 +50,7 @@ replace ( github.com/aws/aws-sdk-go-v2/config => github.com/aws/aws-sdk-go-v2/config v1.27.27 github.com/aws/aws-sdk-go-v2/credentials => github.com/aws/aws-sdk-go-v2/credentials v1.17.27 github.com/aws/aws-sdk-go-v2/feature/ec2/imds => github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.16.11 - github.com/aws/aws-sdk-go-v2/feature/s3/manager => github.com/aws/aws-sdk-go-v2/feature/s3/manager v1.17.9 + github.com/aws/aws-sdk-go-v2/feature/s3/manager => github.com/aws/aws-sdk-go-v2/feature/s3/manager v1.17.10 github.com/aws/aws-sdk-go-v2/internal/configsources => github.com/aws/aws-sdk-go-v2/internal/configsources v1.3.15 github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 => github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.6.15 github.com/aws/aws-sdk-go-v2/internal/ini => github.com/aws/aws-sdk-go-v2/internal/ini v1.8.0 @@ -59,7 +59,7 @@ replace ( github.com/aws/aws-sdk-go-v2/service/internal/presigned-url => github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.11.17 github.com/aws/aws-sdk-go-v2/service/internal/s3shared => github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.17.15 github.com/aws/aws-sdk-go-v2/service/kms => github.com/aws/aws-sdk-go-v2/service/kms v1.35.3 - github.com/aws/aws-sdk-go-v2/service/s3 => github.com/aws/aws-sdk-go-v2/service/s3 v1.58.2 + github.com/aws/aws-sdk-go-v2/service/s3 => github.com/aws/aws-sdk-go-v2/service/s3 v1.58.3 github.com/aws/aws-sdk-go-v2/service/secretsmanager => github.com/aws/aws-sdk-go-v2/service/secretsmanager v1.32.4 github.com/aws/aws-sdk-go-v2/service/sns => github.com/aws/aws-sdk-go-v2/service/sns v1.31.3 github.com/aws/aws-sdk-go-v2/service/sqs => 
github.com/aws/aws-sdk-go-v2/service/sqs v1.34.3 @@ -300,14 +300,14 @@ replace ( golang.org/x/image => golang.org/x/image v0.18.0 golang.org/x/lint => golang.org/x/lint v0.0.0-20210508222113-6edffad5e616 golang.org/x/mobile => golang.org/x/mobile v0.0.0-20240716161057-1ad2df20a8b6 - golang.org/x/mod => golang.org/x/mod v0.19.0 + golang.org/x/mod => golang.org/x/mod v0.20.0 golang.org/x/net => golang.org/x/net v0.27.0 - golang.org/x/oauth2 => golang.org/x/oauth2 v0.21.0 - golang.org/x/sync => golang.org/x/sync v0.7.0 - golang.org/x/sys => golang.org/x/sys v0.22.0 + golang.org/x/oauth2 => golang.org/x/oauth2 v0.22.0 + golang.org/x/sync => golang.org/x/sync v0.8.0 + golang.org/x/sys => golang.org/x/sys v0.23.0 golang.org/x/term => golang.org/x/term v0.22.0 golang.org/x/text => golang.org/x/text v0.16.0 - golang.org/x/time => golang.org/x/time v0.5.0 + golang.org/x/time => golang.org/x/time v0.6.0 golang.org/x/tools => golang.org/x/tools v0.23.0 golang.org/x/xerrors => golang.org/x/xerrors v0.0.0-20240716161551-93cc26a95ae9 gomodules.xyz/jsonpatch/v2 => gomodules.xyz/jsonpatch/v2 v2.4.0 @@ -399,7 +399,7 @@ require ( golang.org/x/net v0.27.0 golang.org/x/oauth2 v0.21.0 golang.org/x/sync v0.7.0 - golang.org/x/sys v0.22.0 + golang.org/x/sys v0.23.0 golang.org/x/text v0.16.0 golang.org/x/time v0.5.0 golang.org/x/tools v0.23.0 diff --git a/go.sum b/go.sum index 891645dbde..c55259a657 100644 --- a/go.sum +++ b/go.sum @@ -123,7 +123,7 @@ cloud.google.com/go/oslogin v1.13.7/go.mod h1:xq027cL0fojpcEcpEQdWayiDn8tIx3WEFY cloud.google.com/go/phishingprotection v0.8.11/go.mod h1:Mge0cylqVFs+D0EyxlsTOJ1Guf3qDgrztHzxZqkhRQM= cloud.google.com/go/policytroubleshooter v1.10.9/go.mod h1:X8HEPVBWz8E+qwI/QXnhBLahEHdcuPO3M9YvSj0LDek= cloud.google.com/go/privatecatalog v0.9.11/go.mod h1:awEF2a8M6UgoqVJcF/MthkF8SSo6OoWQ7TtPNxUlljY= -cloud.google.com/go/pubsub v1.40.0/go.mod h1:BVJI4sI2FyXp36KFKvFwcfDRDfR8MiLT8mMhmIhdAeA= +cloud.google.com/go/pubsub v1.41.0/go.mod 
h1:g+YzC6w/3N91tzG66e2BZtp7WrpBBMXVa3Y9zVoOGpk= cloud.google.com/go/pubsublite v1.8.2/go.mod h1:4r8GSa9NznExjuLPEJlF1VjOPOpgf3IT6k8x/YgaOPI= cloud.google.com/go/recaptchaenterprise/v2 v2.14.2/go.mod h1:MwPgdgvBkE46aWuuXeBTCB8hQJ88p+CpXInROZYCTkc= cloud.google.com/go/recommendationengine v0.8.11/go.mod h1:cEkU4tCXAF88a4boMFZym7U7uyxvVwcQtKzS85IbQio= @@ -159,8 +159,8 @@ cloud.google.com/go/vpcaccess v1.7.11/go.mod h1:a2cuAiSCI4TVK0Dt6/dRjf22qQvfY+po cloud.google.com/go/webrisk v1.9.11/go.mod h1:mK6M8KEO0ZI7VkrjCq3Tjzw4vYq+3c4DzlMUDVaiswE= cloud.google.com/go/websecurityscanner v1.6.11/go.mod h1:vhAZjksELSg58EZfUQ1BMExD+hxqpn0G0DuyCZQjiTg= cloud.google.com/go/workflows v1.12.10/go.mod h1:RcKqCiOmKs8wFUEf3EwWZPH5eHc7Oq0kamIyOUCk0IE= -code.cloudfoundry.org/bytefmt v0.0.0-20240730181512-d61d30bca0a4 h1:0TdG8CEvxn4T76pDdDRdkvUrXvhHnRH5dV/Ldc8rmPs= -code.cloudfoundry.org/bytefmt v0.0.0-20240730181512-d61d30bca0a4/go.mod h1:ujeJbPDuEVVnMj8TgwGJEZ2BCcYOaIa7EVRXu995UKY= +code.cloudfoundry.org/bytefmt v0.0.0-20240804182054-0a63f33a903d h1:ZLm6EHoj1ci9UabYcqoReI2NMXXkKKqPU0tFN2r2kqE= +code.cloudfoundry.org/bytefmt v0.0.0-20240804182054-0a63f33a903d/go.mod h1:0uEmgxxcOTCe4IeYr8HTbVll002oKimIOGp7vuHVK/o= dmitri.shuralyov.com/gpu/mtl v0.0.0-20201218220906-28db891af037/go.mod h1:H6x//7gZCb22OMCxBHrMx7a5I7Hp++hsVxbQ4BYO7hU= dmitri.shuralyov.com/gpu/mtl v0.0.0-20221208032759-85de2813cf6b/go.mod h1:H6x//7gZCb22OMCxBHrMx7a5I7Hp++hsVxbQ4BYO7hU= eliasnaur.com/font v0.0.0-20230308162249-dd43949cb42d/go.mod h1:OYVuxibdk9OSLX8vAqydtRPP87PyTFcT9uH3MlEGBQA= @@ -225,8 +225,8 @@ github.com/aws/aws-sdk-go-v2/credentials v1.17.27 h1:2raNba6gr2IfA0eqqiP2XiQ0UVO github.com/aws/aws-sdk-go-v2/credentials v1.17.27/go.mod h1:gniiwbGahQByxan6YjQUMcW4Aov6bLC3m+evgcoN4r4= github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.16.11 h1:KreluoV8FZDEtI6Co2xuNk/UqI9iwMrOx/87PBNIKqw= github.com/aws/aws-sdk-go-v2/feature/ec2/imds v1.16.11/go.mod h1:SeSUYBLsMYFoRvHE0Tjvn7kbxaUhl75CJi1sbfhMxkU= 
-github.com/aws/aws-sdk-go-v2/feature/s3/manager v1.17.9 h1:TC2vjvaAv1VNl9A0rm+SeuBjrzXnrlwk6Yop+gKRi38= -github.com/aws/aws-sdk-go-v2/feature/s3/manager v1.17.9/go.mod h1:WPv2FRnkIOoDv/8j2gSUsI4qDc7392w5anFB/I89GZ8= +github.com/aws/aws-sdk-go-v2/feature/s3/manager v1.17.10 h1:zeN9UtUlA6FTx0vFSayxSX32HDw73Yb6Hh2izDSFxXY= +github.com/aws/aws-sdk-go-v2/feature/s3/manager v1.17.10/go.mod h1:3HKuexPDcwLWPaqpW2UR/9n8N/u/3CKcGAzSs8p8u8g= github.com/aws/aws-sdk-go-v2/internal/configsources v1.3.15 h1:SoNJ4RlFEQEbtDcCEt+QG56MY4fm4W8rYirAmq+/DdU= github.com/aws/aws-sdk-go-v2/internal/configsources v1.3.15/go.mod h1:U9ke74k1n2bf+RIgoX1SXFed1HLs51OgUSs+Ph0KJP8= github.com/aws/aws-sdk-go-v2/internal/endpoints/v2 v2.6.15 h1:C6WHdGnTDIYETAm5iErQUiVNsclNx9qbJVPIt03B6bI= @@ -243,8 +243,8 @@ github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.11.17 h1:HGErhhrx github.com/aws/aws-sdk-go-v2/service/internal/presigned-url v1.11.17/go.mod h1:RkZEx4l0EHYDJpWppMJ3nD9wZJAa8/0lq9aVC+r2UII= github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.17.15 h1:246A4lSTXWJw/rmlQI+TT2OcqeDMKBdyjEQrafMaQdA= github.com/aws/aws-sdk-go-v2/service/internal/s3shared v1.17.15/go.mod h1:haVfg3761/WF7YPuJOER2MP0k4UAXyHaLclKXB6usDg= -github.com/aws/aws-sdk-go-v2/service/s3 v1.58.2 h1:sZXIzO38GZOU+O0C+INqbH7C2yALwfMWpd64tONS/NE= -github.com/aws/aws-sdk-go-v2/service/s3 v1.58.2/go.mod h1:Lcxzg5rojyVPU/0eFwLtcyTaek/6Mtic5B1gJo7e/zE= +github.com/aws/aws-sdk-go-v2/service/s3 v1.58.3 h1:hT8ZAZRIfqBqHbzKTII+CIiY8G2oC9OpLedkZ51DWl8= +github.com/aws/aws-sdk-go-v2/service/s3 v1.58.3/go.mod h1:Lcxzg5rojyVPU/0eFwLtcyTaek/6Mtic5B1gJo7e/zE= github.com/aws/aws-sdk-go-v2/service/sso v1.22.4 h1:BXx0ZIxvrJdSgSvKTZ+yRBeSqqgPM89VPlulEcl37tM= github.com/aws/aws-sdk-go-v2/service/sso v1.22.4/go.mod h1:ooyCOXjvJEsUw7x+ZDHeISPMhtwI3ZCB7ggFMcFfWLU= github.com/aws/aws-sdk-go-v2/service/ssooidc v1.23.4 h1:Jux+gDDyi1Lruk+KHF91tK2KCuY61kzoCpvtvJJBtOE= @@ -696,23 +696,23 @@ golang.org/x/image v0.18.0 
h1:jGzIakQa/ZXI1I0Fxvaa9W7yP25TqT6cHIHn+6CqvSQ= golang.org/x/image v0.18.0/go.mod h1:4yyo5vMFQjVjUcVk4jEQcU9MGy/rulF5WvUILseCM2E= golang.org/x/lint v0.0.0-20210508222113-6edffad5e616/go.mod h1:3xt1FjdF8hUf6vQPIChWIBhFzV8gjjsPE/fR3IyQdNY= golang.org/x/mobile v0.0.0-20240716161057-1ad2df20a8b6/go.mod h1:TCsc78+c4cqb8IKEosz2LwJ6YRNkIjMuAYeHYjchGDE= -golang.org/x/mod v0.19.0 h1:fEdghXQSo20giMthA7cd28ZC+jts4amQ3YMXiP5oMQ8= -golang.org/x/mod v0.19.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= +golang.org/x/mod v0.20.0 h1:utOm6MM3R3dnawAiJgn0y+xvuYRsm1RKM/4giyfDgV0= +golang.org/x/mod v0.20.0/go.mod h1:hTbmBsO62+eylJbnUtE2MGJUyE7QWk4xUqPFrRgJ+7c= golang.org/x/net v0.27.0 h1:5K3Njcw06/l2y9vpGCSdcxWOYHOUk3dVNGDXN+FvAys= golang.org/x/net v0.27.0/go.mod h1:dDi0PyhWNoiUOrAS8uXv/vnScO4wnHQO4mj9fn/RytE= -golang.org/x/oauth2 v0.21.0 h1:tsimM75w1tF/uws5rbeHzIWxEqElMehnc+iW793zsZs= -golang.org/x/oauth2 v0.21.0/go.mod h1:XYTD2NtWslqkgxebSiOHnXEap4TF09sJSc7H1sXbhtI= -golang.org/x/sync v0.7.0 h1:YsImfSBoP9QPYL0xyKJPq0gcaJdG3rInoqxTWbfQu9M= -golang.org/x/sync v0.7.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= -golang.org/x/sys v0.22.0 h1:RI27ohtqKCnwULzJLqkv897zojh5/DwS/ENaMzUOaWI= -golang.org/x/sys v0.22.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= +golang.org/x/oauth2 v0.22.0 h1:BzDx2FehcG7jJwgWLELCdmLuxk2i+x9UDpSiss2u0ZA= +golang.org/x/oauth2 v0.22.0/go.mod h1:XYTD2NtWslqkgxebSiOHnXEap4TF09sJSc7H1sXbhtI= +golang.org/x/sync v0.8.0 h1:3NFvSEYkUoMifnESzZl15y791HH1qU2xm6eCJU5ZPXQ= +golang.org/x/sync v0.8.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= +golang.org/x/sys v0.23.0 h1:YfKFowiIMvtgl1UERQoTPPToxltDeZfbj4H7dVUCwmM= +golang.org/x/sys v0.23.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= golang.org/x/telemetry v0.0.0-20240521205824-bda55230c457/go.mod h1:pRgIJT+bRLFKnoM1ldnzKoxTIn14Yxz928LQRYYgIN0= golang.org/x/term v0.22.0 h1:BbsgPEJULsl2fV/AT3v15Mjva5yXKQDyKf+TbDz7QJk= golang.org/x/term v0.22.0/go.mod 
h1:F3qCibpT5AMpCRfhfT53vVJwhLtIVHhB9XDjfFvnMI4= golang.org/x/text v0.16.0 h1:a94ExnEXNtEwYLGJSIUxnWoxoRz/ZcCsV63ROupILh4= golang.org/x/text v0.16.0/go.mod h1:GhwF1Be+LQoKShO3cGOHzqOgRrGaYc9AvblQOmPVHnI= -golang.org/x/time v0.5.0 h1:o7cqy6amK/52YcAKIPlM3a+Fpj35zvRj2TP+e1xFSfk= -golang.org/x/time v0.5.0/go.mod h1:3BpzKBy/shNhVucY/MWOyx10tF3SFh9QdLuxbVysPQM= +golang.org/x/time v0.6.0 h1:eTDhh4ZXt5Qf0augr54TN6suAUudPcawVZeIAPU7D4U= +golang.org/x/time v0.6.0/go.mod h1:3BpzKBy/shNhVucY/MWOyx10tF3SFh9QdLuxbVysPQM= golang.org/x/tools v0.23.0 h1:SGsXPZ+2l4JsgaCKkx+FQ9YZ5XEtA1GZYuoDjenLjvg= golang.org/x/tools v0.23.0/go.mod h1:pnu6ufv6vQkll6szChhK3C3L/ruaIv5eBeztNG8wtsI= golang.org/x/xerrors v0.0.0-20240716161551-93cc26a95ae9 h1:LLhsEBxRTBLuKlQxFBYUOU8xyFgXv6cOTp2HASDlsDk= diff --git a/internal/config/corrector.go b/internal/config/corrector.go index 5911fb53d4..86f5deb178 100644 --- a/internal/config/corrector.go +++ b/internal/config/corrector.go @@ -46,6 +46,9 @@ type Corrector struct { // Discoverer represent agent discoverer service configuration Discoverer *DiscovererClient `json:"discoverer" yaml:"discoverer"` + + // Gateway represent gateway service configuration + Gateway *GRPCClient `json:"gateway" yaml:"gateway"` } // Bind binds the actual data from the Indexer receiver field. @@ -58,5 +61,8 @@ func (c *Corrector) Bind() *Corrector { if c.Discoverer != nil { c.Discoverer = c.Discoverer.Bind() } + if c.Gateway != nil { + c.Gateway = c.Gateway.Bind() + } return c } diff --git a/internal/db/kvs/pogreb/pogreb.go b/internal/db/kvs/pogreb/pogreb.go index 26e7cc1da0..4cbdd80b7e 100644 --- a/internal/db/kvs/pogreb/pogreb.go +++ b/internal/db/kvs/pogreb/pogreb.go @@ -24,8 +24,8 @@ import ( "github.com/vdaas/vald/internal/log" ) -// Pogreb represents an interface for operating the pogreb database. -type Pogreb interface { +// DB represents an interface for operating the pogreb database. 
+type DB interface { Set(key string, val []byte) error Get(key string) ([]byte, bool, error) Delete(key string) error @@ -43,7 +43,7 @@ type db struct { // New returns a new pogreb instance. // If the directory path does not exist, it creates a directory for database. // If opts is nil, it uses default options. -func New(opts ...Option) (_ Pogreb, err error) { +func New(opts ...Option) (_ DB, err error) { db := new(db) for _, opt := range append(deafultOpts, opts...) { if err := opt(db); err != nil { diff --git a/internal/db/kvs/pogreb/pogreb_test.go b/internal/db/kvs/pogreb/pogreb_test.go index 14b14c5650..c241512533 100644 --- a/internal/db/kvs/pogreb/pogreb_test.go +++ b/internal/db/kvs/pogreb/pogreb_test.go @@ -122,8 +122,8 @@ func Test_db_Get(t *testing.T) { args args want want checkFunc func(want, []byte, bool, error) error - beforeFunc func(*testing.T, Pogreb, args) - afterFunc func(*testing.T, Pogreb, args) + beforeFunc func(*testing.T, DB, args) + afterFunc func(*testing.T, DB, args) } defaultCheckFunc := func(w want, got []byte, got1 bool, err error) error { if !errors.Is(err, w.err) { @@ -156,13 +156,13 @@ func Test_db_Get(t *testing.T) { want: val, want1: true, }, - beforeFunc: func(t *testing.T, d Pogreb, args args) { + beforeFunc: func(t *testing.T, d DB, args args) { t.Helper() if err := d.Set(key, val); err != nil { t.Fatal(err) } }, - afterFunc: func(t *testing.T, d Pogreb, args args) { + afterFunc: func(t *testing.T, d DB, args args) { t.Helper() if err := d.Close(true); err != nil { t.Fatal(err) @@ -187,13 +187,13 @@ func Test_db_Get(t *testing.T) { want: want{ want1: false, }, - beforeFunc: func(t *testing.T, d Pogreb, args args) { + beforeFunc: func(t *testing.T, d DB, args args) { t.Helper() if err := d.Set(key, val); err != nil { t.Fatal(err) } }, - afterFunc: func(t *testing.T, d Pogreb, args args) { + afterFunc: func(t *testing.T, d DB, args args) { t.Helper() if err := d.Close(true); err != nil { t.Fatal(err) @@ -247,11 +247,11 @@ func 
Test_db_Delete(t *testing.T) { name string args args want want - checkFunc func(want, Pogreb, error) error - beforeFunc func(*testing.T, Pogreb, args) - afterFunc func(*testing.T, Pogreb, args) + checkFunc func(want, DB, error) error + beforeFunc func(*testing.T, DB, args) + afterFunc func(*testing.T, DB, args) } - defaultCheckFunc := func(w want, _ Pogreb, err error) error { + defaultCheckFunc := func(w want, _ DB, err error) error { if !errors.Is(err, w.err) { return errors.Errorf("got_error: \"%#v\",\n\t\t\t\twant: \"%#v\"", err, w.err) } @@ -272,7 +272,7 @@ func Test_db_Delete(t *testing.T) { }, key: key, }, - checkFunc: func(w want, d Pogreb, err error) error { + checkFunc: func(w want, d DB, err error) error { if err := defaultCheckFunc(w, d, err); err != nil { return err } @@ -285,13 +285,13 @@ func Test_db_Delete(t *testing.T) { } return nil }, - beforeFunc: func(t *testing.T, d Pogreb, args args) { + beforeFunc: func(t *testing.T, d DB, args args) { t.Helper() if err := d.Set(key, val); err != nil { t.Fatal(err) } }, - afterFunc: func(t *testing.T, d Pogreb, args args) { + afterFunc: func(t *testing.T, d DB, args args) { t.Helper() if err := d.Close(true); err != nil { t.Fatal(err) @@ -346,8 +346,8 @@ func Test_db_Range(t *testing.T) { args args want want checkFunc func(want, error) error - beforeFunc func(*testing.T, Pogreb, args) - afterFunc func(*testing.T, Pogreb, args) + beforeFunc func(*testing.T, DB, args) + afterFunc func(*testing.T, DB, args) } defaultCheckFunc := func(w want, err error) error { if !errors.Is(err, w.err) { @@ -384,7 +384,7 @@ func Test_db_Range(t *testing.T) { } return nil }, - beforeFunc: func(t *testing.T, d Pogreb, args args) { + beforeFunc: func(t *testing.T, d DB, args args) { t.Helper() for key, val := range data { if err := d.Set(key, val); err != nil { @@ -392,7 +392,7 @@ func Test_db_Range(t *testing.T) { } } }, - afterFunc: func(t *testing.T, d Pogreb, args args) { + afterFunc: func(t *testing.T, d DB, args args) { 
t.Helper() if err := d.Close(true); err != nil { t.Fatal(err) @@ -426,7 +426,7 @@ func Test_db_Range(t *testing.T) { } return nil }, - beforeFunc: func(t *testing.T, d Pogreb, args args) { + beforeFunc: func(t *testing.T, d DB, args args) { t.Helper() data := map[string][]byte{ "key-1": []byte("val-1"), @@ -438,7 +438,7 @@ func Test_db_Range(t *testing.T) { } } }, - afterFunc: func(t *testing.T, d Pogreb, args args) { + afterFunc: func(t *testing.T, d DB, args args) { t.Helper() if err := d.Close(true); err != nil { t.Fatal(err) @@ -490,8 +490,8 @@ func Test_db_Len(t *testing.T) { args args want want checkFunc func(want, uint32) error - beforeFunc func(*testing.T, Pogreb, args) - afterFunc func(*testing.T, Pogreb, args) + beforeFunc func(*testing.T, DB, args) + afterFunc func(*testing.T, DB, args) } defaultCheckFunc := func(w want, got uint32) error { if !reflect.DeepEqual(got, w.want) { @@ -516,7 +516,7 @@ func Test_db_Len(t *testing.T) { want: want{ want: uint32(len(data)), }, - beforeFunc: func(t *testing.T, d Pogreb, args args) { + beforeFunc: func(t *testing.T, d DB, args args) { t.Helper() for key, val := range data { if err := d.Set(key, val); err != nil { @@ -524,7 +524,7 @@ func Test_db_Len(t *testing.T) { } } }, - afterFunc: func(t *testing.T, d Pogreb, args args) { + afterFunc: func(t *testing.T, d DB, args args) { t.Helper() if err := d.Close(true); err != nil { t.Fatal(err) @@ -541,7 +541,7 @@ func Test_db_Len(t *testing.T) { WithBackgroundSyncInterval("0s"), }, }, - afterFunc: func(t *testing.T, d Pogreb, args args) { + afterFunc: func(t *testing.T, d DB, args args) { t.Helper() if err := d.Close(true); err != nil { t.Fatal(err) diff --git a/k8s/index/job/correction/configmap.yaml b/k8s/index/job/correction/configmap.yaml index e69de29bb2..820be5af49 100644 --- a/k8s/index/job/correction/configmap.yaml +++ b/k8s/index/job/correction/configmap.yaml @@ -0,0 +1,450 @@ +# +# Copyright (C) 2019-2024 vdaas.org vald team +# +# Licensed under the Apache 
License, Version 2.0 (the "License"); +# You may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +apiVersion: v1 +kind: ConfigMap +metadata: + name: vald-index-correction-config + labels: + app.kubernetes.io/name: vald + helm.sh/chart: vald-v1.7.12 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: v1.7.12 + app.kubernetes.io/component: vald-index-correction +data: + config.yaml: | + --- + version: v0.0.0 + time_zone: UTC + logging: + format: raw + level: debug + logger: glg + server_config: + servers: + - name: grpc + host: 0.0.0.0 + port: 8081 + grpc: + bidirectional_stream_concurrency: 20 + connection_timeout: "" + enable_admin: true + enable_reflection: true + header_table_size: 0 + initial_conn_window_size: 2097152 + initial_window_size: 1048576 + interceptors: + - RecoverInterceptor + keepalive: + max_conn_age: "" + max_conn_age_grace: "" + max_conn_idle: "" + min_time: 10m + permit_without_stream: false + time: 3h + timeout: 60s + max_header_list_size: 0 + max_receive_message_size: 0 + max_send_message_size: 0 + read_buffer_size: 0 + write_buffer_size: 0 + mode: GRPC + network: tcp + probe_wait_time: 3s + restart: true + socket_option: + ip_recover_destination_addr: false + ip_transparent: false + reuse_addr: true + reuse_port: true + tcp_cork: false + tcp_defer_accept: false + tcp_fast_open: false + tcp_no_delay: false + tcp_quick_ack: false + socket_path: "" + health_check_servers: + - name: liveness + host: 0.0.0.0 + port: 3000 + http: + handler_timeout: "" + idle_timeout: "" + 
read_header_timeout: "" + read_timeout: "" + shutdown_duration: 5s + write_timeout: "" + mode: REST + network: tcp + probe_wait_time: 3s + restart: true + socket_option: + ip_recover_destination_addr: false + ip_transparent: false + reuse_addr: true + reuse_port: true + tcp_cork: false + tcp_defer_accept: false + tcp_fast_open: true + tcp_no_delay: true + tcp_quick_ack: true + socket_path: "" + - name: readiness + host: 0.0.0.0 + port: 3001 + http: + handler_timeout: "" + idle_timeout: "" + read_header_timeout: "" + read_timeout: "" + shutdown_duration: 0s + write_timeout: "" + mode: REST + network: tcp + probe_wait_time: 3s + restart: true + socket_option: + ip_recover_destination_addr: false + ip_transparent: false + reuse_addr: true + reuse_port: true + tcp_cork: false + tcp_defer_accept: false + tcp_fast_open: true + tcp_no_delay: true + tcp_quick_ack: true + socket_path: "" + metrics_servers: + - name: pprof + host: 0.0.0.0 + port: 6060 + http: + handler_timeout: 5s + idle_timeout: 2s + read_header_timeout: 1s + read_timeout: 1s + shutdown_duration: 5s + write_timeout: 1m + mode: REST + network: tcp + probe_wait_time: 3s + restart: true + socket_option: + ip_recover_destination_addr: false + ip_transparent: false + reuse_addr: true + reuse_port: true + tcp_cork: true + tcp_defer_accept: false + tcp_fast_open: false + tcp_no_delay: false + tcp_quick_ack: false + socket_path: "" + startup_strategy: + - liveness + - pprof + - grpc + - readiness + shutdown_strategy: + - readiness + - grpc + - pprof + - liveness + full_shutdown_duration: 600s + tls: + ca: /path/to/ca + cert: /path/to/cert + enabled: false + insecure_skip_verify: false + key: /path/to/key + observability: + enabled: false + otlp: + collector_endpoint: "" + trace_batch_timeout: "1s" + trace_export_timeout: "1m" + trace_max_export_batch_size: 1024 + trace_max_queue_size: 256 + metrics_export_interval: "1s" + metrics_export_timeout: "1m" + attribute: + namespace: "_MY_POD_NAMESPACE_" + pod_name: 
"_MY_POD_NAME_" + node_name: "_MY_NODE_NAME_" + service_name: "vald-index-correction" + metrics: + enable_cgo: true + enable_goroutine: true + enable_memory: true + enable_version_info: true + version_info_labels: + - vald_version + - server_name + - git_commit + - build_time + - go_version + - go_os + - go_arch + - algorithm_info + trace: + enabled: false + corrector: + agent_port: 8081 + agent_name: "vald-agent" + agent_dns: vald-agent.default.svc.cluster.local + agent_namespace: "_MY_POD_NAMESPACE_" + node_name: "" + stream_list_concurrency: 200 + kvs_async_write_concurrency: 2048 + index_replica: 3 + gateway: + addrs: + - vald-lb-gateway.default.svc.cluster.local:8081 + health_check_duration: "1s" + connection_pool: + enable_dns_resolver: true + enable_rebalance: true + old_conn_close_duration: 2m + rebalance_duration: 30m + size: 3 + backoff: + backoff_factor: 1.1 + backoff_time_limit: 5s + enable_error_log: true + initial_duration: 5ms + jitter_limit: 100ms + maximum_duration: 5s + retry_count: 100 + circuit_breaker: + closed_error_rate: 0.7 + closed_refresh_timeout: 10s + half_open_error_rate: 0.5 + min_samples: 1000 + open_timeout: 1s + call_option: + max_recv_msg_size: 0 + max_retry_rpc_buffer_size: 0 + max_send_msg_size: 0 + wait_for_ready: true + dial_option: + backoff_base_delay: 1s + backoff_jitter: 0.2 + backoff_max_delay: 120s + backoff_multiplier: 1.6 + enable_backoff: false + initial_connection_window_size: 2097152 + initial_window_size: 1048576 + insecure: true + interceptors: [] + keepalive: + permit_without_stream: false + time: "" + timeout: 30s + max_msg_size: 0 + min_connection_timeout: 20s + net: + dialer: + dual_stack_enabled: true + keepalive: "" + timeout: "" + dns: + cache_enabled: true + cache_expiration: 1h + refresh_duration: 30m + socket_option: + ip_recover_destination_addr: false + ip_transparent: false + reuse_addr: true + reuse_port: true + tcp_cork: false + tcp_defer_accept: false + tcp_fast_open: false + tcp_no_delay: false + 
tcp_quick_ack: false + tls: + ca: /path/to/ca + cert: /path/to/cert + enabled: false + insecure_skip_verify: false + key: /path/to/key + read_buffer_size: 0 + timeout: "" + write_buffer_size: 0 + tls: + ca: /path/to/ca + cert: /path/to/cert + enabled: false + insecure_skip_verify: false + key: /path/to/key + discoverer: + duration: 500ms + client: + addrs: + - vald-discoverer.default.svc.cluster.local:8081 + health_check_duration: "1s" + connection_pool: + enable_dns_resolver: true + enable_rebalance: true + old_conn_close_duration: 2m + rebalance_duration: 30m + size: 3 + backoff: + backoff_factor: 1.1 + backoff_time_limit: 5s + enable_error_log: true + initial_duration: 5ms + jitter_limit: 100ms + maximum_duration: 5s + retry_count: 100 + circuit_breaker: + closed_error_rate: 0.7 + closed_refresh_timeout: 10s + half_open_error_rate: 0.5 + min_samples: 1000 + open_timeout: 1s + call_option: + max_recv_msg_size: 0 + max_retry_rpc_buffer_size: 0 + max_send_msg_size: 0 + wait_for_ready: true + dial_option: + backoff_base_delay: 1s + backoff_jitter: 0.2 + backoff_max_delay: 120s + backoff_multiplier: 1.6 + enable_backoff: false + initial_connection_window_size: 2097152 + initial_window_size: 1048576 + insecure: true + interceptors: [] + keepalive: + permit_without_stream: false + time: "" + timeout: 30s + max_msg_size: 0 + min_connection_timeout: 20s + net: + dialer: + dual_stack_enabled: true + keepalive: "" + timeout: "" + dns: + cache_enabled: true + cache_expiration: 1h + refresh_duration: 30m + socket_option: + ip_recover_destination_addr: false + ip_transparent: false + reuse_addr: true + reuse_port: true + tcp_cork: false + tcp_defer_accept: false + tcp_fast_open: false + tcp_no_delay: false + tcp_quick_ack: false + tls: + ca: /path/to/ca + cert: /path/to/cert + enabled: false + insecure_skip_verify: false + key: /path/to/key + read_buffer_size: 0 + timeout: "" + write_buffer_size: 0 + tls: + ca: /path/to/ca + cert: /path/to/cert + enabled: false + 
insecure_skip_verify: false + key: /path/to/key + agent_client_options: + addrs: [] + health_check_duration: "1s" + connection_pool: + enable_dns_resolver: true + enable_rebalance: true + old_conn_close_duration: 2m + rebalance_duration: 30m + size: 3 + backoff: + backoff_factor: 1.1 + backoff_time_limit: 5s + enable_error_log: true + initial_duration: 5ms + jitter_limit: 100ms + maximum_duration: 5s + retry_count: 100 + circuit_breaker: + closed_error_rate: 0.7 + closed_refresh_timeout: 10s + half_open_error_rate: 0.5 + min_samples: 1000 + open_timeout: 1s + call_option: + max_recv_msg_size: 0 + max_retry_rpc_buffer_size: 0 + max_send_msg_size: 0 + wait_for_ready: true + dial_option: + write_buffer_size: 0 + read_buffer_size: 0 + initial_window_size: 1.048576e+06 + initial_connection_window_size: 2.097152e+06 + max_msg_size: 0 + backoff_max_delay: "120s" + backoff_base_delay: "1s" + backoff_multiplier: 1.6 + backoff_jitter: 0.2 + min_connection_timeout: "20s" + enable_backoff: false + insecure: true + timeout: "" + interceptors: [] + net: + dns: + cache_enabled: true + cache_expiration: 1h + refresh_duration: 30m + dialer: + timeout: "" + keepalive: "15m" + dual_stack_enabled: true + tls: + ca: /path/to/ca + cert: /path/to/cert + enabled: false + insecure_skip_verify: false + key: /path/to/key + socket_option: + ip_recover_destination_addr: false + ip_transparent: false + reuse_addr: true + reuse_port: true + tcp_cork: false + tcp_defer_accept: false + tcp_fast_open: false + tcp_no_delay: false + tcp_quick_ack: false + keepalive: + permit_without_stream: false + time: "" + timeout: 30s + tls: + ca: /path/to/ca + cert: /path/to/cert + enabled: false + insecure_skip_verify: false + key: /path/to/key diff --git a/k8s/index/job/correction/cronjob.yaml b/k8s/index/job/correction/cronjob.yaml index e69de29bb2..4bd8edc567 100644 --- a/k8s/index/job/correction/cronjob.yaml +++ b/k8s/index/job/correction/cronjob.yaml @@ -0,0 +1,144 @@ +# +# Copyright (C) 2019-2024 
vdaas.org vald team +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# You may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +apiVersion: batch/v1 +kind: CronJob +metadata: + name: vald-index-correction + labels: + app: vald-index-correction + app.kubernetes.io/name: vald + helm.sh/chart: vald-v1.7.12 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/instance: release-name + app.kubernetes.io/component: vald-index-correction + app.kubernetes.io/version: v1.7.12 +spec: + schedule: "6 3 * * *" + concurrencyPolicy: Forbid + suspend: false + startingDeadlineSeconds: 86400 + jobTemplate: + spec: + ttlSecondsAfterFinished: 86400 + template: + metadata: + labels: + app: vald-index-correction + app.kubernetes.io/name: vald + helm.sh/chart: vald-v1.7.12 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/instance: release-name + app.kubernetes.io/component: vald-index-correction + app.kubernetes.io/version: v1.7.12 + annotations: + pyroscope.io/scrape: "true" + pyroscope.io/application-name: vald-index-correction + pyroscope.io/profile-cpu-enabled: "true" + pyroscope.io/profile-mem-enabled: "true" + pyroscope.io/port: "6060" + spec: + initContainers: + - name: wait-for-agent + image: busybox:stable + command: + - /bin/sh + - -e + - -c + - | + until [ "$(wget --server-response --spider --quiet http://vald-agent.default.svc.cluster.local:3001/readiness 2>&1 | awk 'NR==1{print $2}')" == "200" ]; do + echo "waiting for agent to be ready..." 
+ sleep 2; + done + - name: wait-for-discoverer + image: busybox:stable + command: + - /bin/sh + - -e + - -c + - | + until [ "$(wget --server-response --spider --quiet http://vald-discoverer.default.svc.cluster.local:3001/readiness 2>&1 | awk 'NR==1{print $2}')" == "200" ]; do + echo "waiting for discoverer to be ready..." + sleep 2; + done + containers: + - name: vald-index-correction + image: "vdaas/vald-index-correction:nightly" + imagePullPolicy: Always + volumeMounts: + - name: vald-index-correction-config + mountPath: /etc/server/ + livenessProbe: + failureThreshold: 2 + httpGet: + path: /liveness + port: liveness + scheme: HTTP + initialDelaySeconds: 5 + periodSeconds: 3 + successThreshold: 1 + timeoutSeconds: 2 + readinessProbe: + failureThreshold: 2 + httpGet: + path: /readiness + port: readiness + scheme: HTTP + initialDelaySeconds: 10 + periodSeconds: 3 + successThreshold: 1 + timeoutSeconds: 2 + startupProbe: + failureThreshold: 30 + httpGet: + path: /liveness + port: liveness + scheme: HTTP + initialDelaySeconds: 5 + periodSeconds: 5 + successThreshold: 1 + timeoutSeconds: 2 + ports: + - name: liveness + protocol: TCP + containerPort: 3000 + - name: readiness + protocol: TCP + containerPort: 3001 + - name: grpc + protocol: TCP + containerPort: 8081 + - name: pprof + protocol: TCP + containerPort: 6060 + env: + - name: MY_NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + - name: MY_POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: MY_POD_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + restartPolicy: OnFailure + volumes: + - name: vald-index-correction-config + configMap: + defaultMode: 420 + name: vald-index-correction-config diff --git a/k8s/index/job/creation/configmap.yaml b/k8s/index/job/creation/configmap.yaml index e69de29bb2..da4ab3c7b2 100644 --- a/k8s/index/job/creation/configmap.yaml +++ b/k8s/index/job/creation/configmap.yaml @@ -0,0 +1,370 @@ +# +# Copyright (C) 2019-2024 vdaas.org vald 
team +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# You may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +apiVersion: v1 +kind: ConfigMap +metadata: + name: vald-index-creation-config + labels: + app.kubernetes.io/name: vald + helm.sh/chart: vald-v1.7.12 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: v1.7.12 + app.kubernetes.io/component: vald-index-creation +data: + config.yaml: | + --- + version: v0.0.0 + time_zone: UTC + logging: + format: raw + level: debug + logger: glg + server_config: + servers: + - name: grpc + host: 0.0.0.0 + port: 8081 + grpc: + bidirectional_stream_concurrency: 20 + connection_timeout: "" + enable_admin: true + enable_reflection: true + header_table_size: 0 + initial_conn_window_size: 2097152 + initial_window_size: 1048576 + interceptors: + - RecoverInterceptor + keepalive: + max_conn_age: "" + max_conn_age_grace: "" + max_conn_idle: "" + min_time: 10m + permit_without_stream: false + time: 3h + timeout: 60s + max_header_list_size: 0 + max_receive_message_size: 0 + max_send_message_size: 0 + read_buffer_size: 0 + write_buffer_size: 0 + mode: GRPC + network: tcp + probe_wait_time: 3s + restart: true + socket_option: + ip_recover_destination_addr: false + ip_transparent: false + reuse_addr: true + reuse_port: true + tcp_cork: false + tcp_defer_accept: false + tcp_fast_open: false + tcp_no_delay: false + tcp_quick_ack: false + socket_path: "" + health_check_servers: + - name: liveness + host: 0.0.0.0 + port: 3000 + http: + 
handler_timeout: "" + idle_timeout: "" + read_header_timeout: "" + read_timeout: "" + shutdown_duration: 5s + write_timeout: "" + mode: REST + network: tcp + probe_wait_time: 3s + restart: true + socket_option: + ip_recover_destination_addr: false + ip_transparent: false + reuse_addr: true + reuse_port: true + tcp_cork: false + tcp_defer_accept: false + tcp_fast_open: true + tcp_no_delay: true + tcp_quick_ack: true + socket_path: "" + - name: readiness + host: 0.0.0.0 + port: 3001 + http: + handler_timeout: "" + idle_timeout: "" + read_header_timeout: "" + read_timeout: "" + shutdown_duration: 0s + write_timeout: "" + mode: REST + network: tcp + probe_wait_time: 3s + restart: true + socket_option: + ip_recover_destination_addr: false + ip_transparent: false + reuse_addr: true + reuse_port: true + tcp_cork: false + tcp_defer_accept: false + tcp_fast_open: true + tcp_no_delay: true + tcp_quick_ack: true + socket_path: "" + metrics_servers: + - name: pprof + host: 0.0.0.0 + port: 6060 + http: + handler_timeout: 5s + idle_timeout: 2s + read_header_timeout: 1s + read_timeout: 1s + shutdown_duration: 5s + write_timeout: 1m + mode: REST + network: tcp + probe_wait_time: 3s + restart: true + socket_option: + ip_recover_destination_addr: false + ip_transparent: false + reuse_addr: true + reuse_port: true + tcp_cork: true + tcp_defer_accept: false + tcp_fast_open: false + tcp_no_delay: false + tcp_quick_ack: false + socket_path: "" + startup_strategy: + - liveness + - pprof + - grpc + - readiness + shutdown_strategy: + - readiness + - grpc + - pprof + - liveness + full_shutdown_duration: 600s + tls: + ca: /path/to/ca + cert: /path/to/cert + enabled: false + insecure_skip_verify: false + key: /path/to/key + observability: + enabled: false + otlp: + collector_endpoint: "" + trace_batch_timeout: "1s" + trace_export_timeout: "1m" + trace_max_export_batch_size: 1024 + trace_max_queue_size: 256 + metrics_export_interval: "1s" + metrics_export_timeout: "1m" + attribute: + 
namespace: "_MY_POD_NAMESPACE_" + pod_name: "_MY_POD_NAME_" + node_name: "_MY_NODE_NAME_" + service_name: "vald-index-creation" + metrics: + enable_cgo: true + enable_goroutine: true + enable_memory: true + enable_version_info: true + version_info_labels: + - vald_version + - server_name + - git_commit + - build_time + - go_version + - go_os + - go_arch + - algorithm_info + trace: + enabled: false + creator: + agent_port: 8081 + agent_name: "vald-agent" + agent_dns: vald-agent.default.svc.cluster.local + agent_namespace: "_MY_POD_NAMESPACE_" + node_name: "" + concurrency: 1 + target_addrs: [] + discoverer: + duration: 500ms + client: + addrs: + - vald-discoverer.default.svc.cluster.local:8081 + health_check_duration: "1s" + connection_pool: + enable_dns_resolver: true + enable_rebalance: true + old_conn_close_duration: 2m + rebalance_duration: 30m + size: 3 + backoff: + backoff_factor: 1.1 + backoff_time_limit: 5s + enable_error_log: true + initial_duration: 5ms + jitter_limit: 100ms + maximum_duration: 5s + retry_count: 100 + circuit_breaker: + closed_error_rate: 0.7 + closed_refresh_timeout: 10s + half_open_error_rate: 0.5 + min_samples: 1000 + open_timeout: 1s + call_option: + max_recv_msg_size: 0 + max_retry_rpc_buffer_size: 0 + max_send_msg_size: 0 + wait_for_ready: true + dial_option: + backoff_base_delay: 1s + backoff_jitter: 0.2 + backoff_max_delay: 120s + backoff_multiplier: 1.6 + enable_backoff: false + initial_connection_window_size: 2097152 + initial_window_size: 1048576 + insecure: true + interceptors: [] + keepalive: + permit_without_stream: false + time: "" + timeout: 30s + max_msg_size: 0 + min_connection_timeout: 20s + net: + dialer: + dual_stack_enabled: true + keepalive: "" + timeout: "" + dns: + cache_enabled: true + cache_expiration: 1h + refresh_duration: 30m + socket_option: + ip_recover_destination_addr: false + ip_transparent: false + reuse_addr: true + reuse_port: true + tcp_cork: false + tcp_defer_accept: false + tcp_fast_open: false + 
tcp_no_delay: false + tcp_quick_ack: false + tls: + ca: /path/to/ca + cert: /path/to/cert + enabled: false + insecure_skip_verify: false + key: /path/to/key + read_buffer_size: 0 + timeout: "" + write_buffer_size: 0 + tls: + ca: /path/to/ca + cert: /path/to/cert + enabled: false + insecure_skip_verify: false + key: /path/to/key + agent_client_options: + addrs: [] + health_check_duration: "1s" + connection_pool: + enable_dns_resolver: true + enable_rebalance: true + old_conn_close_duration: 2m + rebalance_duration: 30m + size: 3 + backoff: + backoff_factor: 1.1 + backoff_time_limit: 5s + enable_error_log: true + initial_duration: 5ms + jitter_limit: 100ms + maximum_duration: 5s + retry_count: 100 + circuit_breaker: + closed_error_rate: 0.7 + closed_refresh_timeout: 10s + half_open_error_rate: 0.5 + min_samples: 1000 + open_timeout: 1s + call_option: + max_recv_msg_size: 0 + max_retry_rpc_buffer_size: 0 + max_send_msg_size: 0 + wait_for_ready: true + dial_option: + write_buffer_size: 0 + read_buffer_size: 0 + initial_window_size: 1.048576e+06 + initial_connection_window_size: 2.097152e+06 + max_msg_size: 0 + backoff_max_delay: "120s" + backoff_base_delay: "1s" + backoff_multiplier: 1.6 + backoff_jitter: 0.2 + min_connection_timeout: "20s" + enable_backoff: false + insecure: true + timeout: "" + interceptors: [] + net: + dns: + cache_enabled: true + cache_expiration: 1h + refresh_duration: 30m + dialer: + timeout: "" + keepalive: "15m" + dual_stack_enabled: true + tls: + ca: /path/to/ca + cert: /path/to/cert + enabled: false + insecure_skip_verify: false + key: /path/to/key + socket_option: + ip_recover_destination_addr: false + ip_transparent: false + reuse_addr: true + reuse_port: true + tcp_cork: false + tcp_defer_accept: false + tcp_fast_open: false + tcp_no_delay: false + tcp_quick_ack: false + keepalive: + permit_without_stream: false + time: "" + timeout: 30s + tls: + ca: /path/to/ca + cert: /path/to/cert + enabled: false + insecure_skip_verify: false + key: 
/path/to/key diff --git a/k8s/index/job/creation/cronjob.yaml b/k8s/index/job/creation/cronjob.yaml index e69de29bb2..26d4128ba2 100644 --- a/k8s/index/job/creation/cronjob.yaml +++ b/k8s/index/job/creation/cronjob.yaml @@ -0,0 +1,144 @@ +# +# Copyright (C) 2019-2024 vdaas.org vald team +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# You may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +apiVersion: batch/v1 +kind: CronJob +metadata: + name: vald-index-creation + labels: + app: vald-index-creation + app.kubernetes.io/name: vald + helm.sh/chart: vald-v1.7.12 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/instance: release-name + app.kubernetes.io/component: vald-index-creation + app.kubernetes.io/version: v1.7.12 +spec: + schedule: "* * * * *" + concurrencyPolicy: Forbid + suspend: false + startingDeadlineSeconds: 43200 + jobTemplate: + spec: + ttlSecondsAfterFinished: 86400 + template: + metadata: + labels: + app: vald-index-creation + app.kubernetes.io/name: vald + helm.sh/chart: vald-v1.7.12 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/instance: release-name + app.kubernetes.io/component: vald-index-creation + app.kubernetes.io/version: v1.7.12 + annotations: + pyroscope.io/scrape: "true" + pyroscope.io/application-name: vald-index-creation + pyroscope.io/profile-cpu-enabled: "true" + pyroscope.io/profile-mem-enabled: "true" + pyroscope.io/port: "6060" + spec: + initContainers: + - name: wait-for-agent + image: busybox:stable + command: + - /bin/sh + - -e + - -c + - | + until [ "$(wget 
--server-response --spider --quiet http://vald-agent.default.svc.cluster.local:3001/readiness 2>&1 | awk 'NR==1{print $2}')" == "200" ]; do + echo "waiting for agent to be ready..." + sleep 2; + done + - name: wait-for-discoverer + image: busybox:stable + command: + - /bin/sh + - -e + - -c + - | + until [ "$(wget --server-response --spider --quiet http://vald-discoverer.default.svc.cluster.local:3001/readiness 2>&1 | awk 'NR==1{print $2}')" == "200" ]; do + echo "waiting for discoverer to be ready..." + sleep 2; + done + containers: + - name: vald-index-creation + image: "vdaas/vald-index-creation:nightly" + imagePullPolicy: Always + volumeMounts: + - name: vald-index-creation-config + mountPath: /etc/server/ + livenessProbe: + failureThreshold: 2 + httpGet: + path: /liveness + port: liveness + scheme: HTTP + initialDelaySeconds: 5 + periodSeconds: 3 + successThreshold: 1 + timeoutSeconds: 2 + readinessProbe: + failureThreshold: 2 + httpGet: + path: /readiness + port: readiness + scheme: HTTP + initialDelaySeconds: 10 + periodSeconds: 3 + successThreshold: 1 + timeoutSeconds: 2 + startupProbe: + failureThreshold: 30 + httpGet: + path: /liveness + port: liveness + scheme: HTTP + initialDelaySeconds: 5 + periodSeconds: 5 + successThreshold: 1 + timeoutSeconds: 2 + ports: + - name: liveness + protocol: TCP + containerPort: 3000 + - name: readiness + protocol: TCP + containerPort: 3001 + - name: grpc + protocol: TCP + containerPort: 8081 + - name: pprof + protocol: TCP + containerPort: 6060 + env: + - name: MY_NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + - name: MY_POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: MY_POD_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + restartPolicy: OnFailure + volumes: + - name: vald-index-creation-config + configMap: + defaultMode: 420 + name: vald-index-creation-config diff --git a/k8s/index/job/save/configmap.yaml b/k8s/index/job/save/configmap.yaml index 
e69de29bb2..c51c293466 100644 --- a/k8s/index/job/save/configmap.yaml +++ b/k8s/index/job/save/configmap.yaml @@ -0,0 +1,370 @@ +# +# Copyright (C) 2019-2024 vdaas.org vald team +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# You may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +apiVersion: v1 +kind: ConfigMap +metadata: + name: vald-index-save-config + labels: + app.kubernetes.io/name: vald + helm.sh/chart: vald-v1.7.12 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: v1.7.12 + app.kubernetes.io/component: vald-index-save +data: + config.yaml: | + --- + version: v0.0.0 + time_zone: UTC + logging: + format: raw + level: debug + logger: glg + server_config: + servers: + - name: grpc + host: 0.0.0.0 + port: 8081 + grpc: + bidirectional_stream_concurrency: 20 + connection_timeout: "" + enable_admin: true + enable_reflection: true + header_table_size: 0 + initial_conn_window_size: 2097152 + initial_window_size: 1048576 + interceptors: + - RecoverInterceptor + keepalive: + max_conn_age: "" + max_conn_age_grace: "" + max_conn_idle: "" + min_time: 10m + permit_without_stream: false + time: 3h + timeout: 60s + max_header_list_size: 0 + max_receive_message_size: 0 + max_send_message_size: 0 + read_buffer_size: 0 + write_buffer_size: 0 + mode: GRPC + network: tcp + probe_wait_time: 3s + restart: true + socket_option: + ip_recover_destination_addr: false + ip_transparent: false + reuse_addr: true + reuse_port: true + tcp_cork: false + tcp_defer_accept: false + 
tcp_fast_open: false + tcp_no_delay: false + tcp_quick_ack: false + socket_path: "" + health_check_servers: + - name: liveness + host: 0.0.0.0 + port: 3000 + http: + handler_timeout: "" + idle_timeout: "" + read_header_timeout: "" + read_timeout: "" + shutdown_duration: 5s + write_timeout: "" + mode: REST + network: tcp + probe_wait_time: 3s + restart: true + socket_option: + ip_recover_destination_addr: false + ip_transparent: false + reuse_addr: true + reuse_port: true + tcp_cork: false + tcp_defer_accept: false + tcp_fast_open: true + tcp_no_delay: true + tcp_quick_ack: true + socket_path: "" + - name: readiness + host: 0.0.0.0 + port: 3001 + http: + handler_timeout: "" + idle_timeout: "" + read_header_timeout: "" + read_timeout: "" + shutdown_duration: 0s + write_timeout: "" + mode: REST + network: tcp + probe_wait_time: 3s + restart: true + socket_option: + ip_recover_destination_addr: false + ip_transparent: false + reuse_addr: true + reuse_port: true + tcp_cork: false + tcp_defer_accept: false + tcp_fast_open: true + tcp_no_delay: true + tcp_quick_ack: true + socket_path: "" + metrics_servers: + - name: pprof + host: 0.0.0.0 + port: 6060 + http: + handler_timeout: 5s + idle_timeout: 2s + read_header_timeout: 1s + read_timeout: 1s + shutdown_duration: 5s + write_timeout: 1m + mode: REST + network: tcp + probe_wait_time: 3s + restart: true + socket_option: + ip_recover_destination_addr: false + ip_transparent: false + reuse_addr: true + reuse_port: true + tcp_cork: true + tcp_defer_accept: false + tcp_fast_open: false + tcp_no_delay: false + tcp_quick_ack: false + socket_path: "" + startup_strategy: + - liveness + - pprof + - grpc + - readiness + shutdown_strategy: + - readiness + - grpc + - pprof + - liveness + full_shutdown_duration: 600s + tls: + ca: /path/to/ca + cert: /path/to/cert + enabled: false + insecure_skip_verify: false + key: /path/to/key + observability: + enabled: false + otlp: + collector_endpoint: "" + trace_batch_timeout: "1s" + 
trace_export_timeout: "1m" + trace_max_export_batch_size: 1024 + trace_max_queue_size: 256 + metrics_export_interval: "1s" + metrics_export_timeout: "1m" + attribute: + namespace: "_MY_POD_NAMESPACE_" + pod_name: "_MY_POD_NAME_" + node_name: "_MY_NODE_NAME_" + service_name: "vald-index-save" + metrics: + enable_cgo: true + enable_goroutine: true + enable_memory: true + enable_version_info: true + version_info_labels: + - vald_version + - server_name + - git_commit + - build_time + - go_version + - go_os + - go_arch + - algorithm_info + trace: + enabled: false + saver: + agent_port: 8081 + agent_name: "vald-agent" + agent_dns: vald-agent.default.svc.cluster.local + agent_namespace: "_MY_POD_NAMESPACE_" + node_name: "" + concurrency: 1 + target_addrs: [] + discoverer: + duration: 500ms + client: + addrs: + - vald-discoverer.default.svc.cluster.local:8081 + health_check_duration: "1s" + connection_pool: + enable_dns_resolver: true + enable_rebalance: true + old_conn_close_duration: 2m + rebalance_duration: 30m + size: 3 + backoff: + backoff_factor: 1.1 + backoff_time_limit: 5s + enable_error_log: true + initial_duration: 5ms + jitter_limit: 100ms + maximum_duration: 5s + retry_count: 100 + circuit_breaker: + closed_error_rate: 0.7 + closed_refresh_timeout: 10s + half_open_error_rate: 0.5 + min_samples: 1000 + open_timeout: 1s + call_option: + max_recv_msg_size: 0 + max_retry_rpc_buffer_size: 0 + max_send_msg_size: 0 + wait_for_ready: true + dial_option: + backoff_base_delay: 1s + backoff_jitter: 0.2 + backoff_max_delay: 120s + backoff_multiplier: 1.6 + enable_backoff: false + initial_connection_window_size: 2097152 + initial_window_size: 1048576 + insecure: true + interceptors: [] + keepalive: + permit_without_stream: false + time: "" + timeout: 30s + max_msg_size: 0 + min_connection_timeout: 20s + net: + dialer: + dual_stack_enabled: true + keepalive: "" + timeout: "" + dns: + cache_enabled: true + cache_expiration: 1h + refresh_duration: 30m + socket_option: + 
ip_recover_destination_addr: false + ip_transparent: false + reuse_addr: true + reuse_port: true + tcp_cork: false + tcp_defer_accept: false + tcp_fast_open: false + tcp_no_delay: false + tcp_quick_ack: false + tls: + ca: /path/to/ca + cert: /path/to/cert + enabled: false + insecure_skip_verify: false + key: /path/to/key + read_buffer_size: 0 + timeout: "" + write_buffer_size: 0 + tls: + ca: /path/to/ca + cert: /path/to/cert + enabled: false + insecure_skip_verify: false + key: /path/to/key + agent_client_options: + addrs: [] + health_check_duration: "1s" + connection_pool: + enable_dns_resolver: true + enable_rebalance: true + old_conn_close_duration: 2m + rebalance_duration: 30m + size: 3 + backoff: + backoff_factor: 1.1 + backoff_time_limit: 5s + enable_error_log: true + initial_duration: 5ms + jitter_limit: 100ms + maximum_duration: 5s + retry_count: 100 + circuit_breaker: + closed_error_rate: 0.7 + closed_refresh_timeout: 10s + half_open_error_rate: 0.5 + min_samples: 1000 + open_timeout: 1s + call_option: + max_recv_msg_size: 0 + max_retry_rpc_buffer_size: 0 + max_send_msg_size: 0 + wait_for_ready: true + dial_option: + write_buffer_size: 0 + read_buffer_size: 0 + initial_window_size: 1.048576e+06 + initial_connection_window_size: 2.097152e+06 + max_msg_size: 0 + backoff_max_delay: "120s" + backoff_base_delay: "1s" + backoff_multiplier: 1.6 + backoff_jitter: 0.2 + min_connection_timeout: "20s" + enable_backoff: false + insecure: true + timeout: "" + interceptors: [] + net: + dns: + cache_enabled: true + cache_expiration: 1h + refresh_duration: 30m + dialer: + timeout: "" + keepalive: "15m" + dual_stack_enabled: true + tls: + ca: /path/to/ca + cert: /path/to/cert + enabled: false + insecure_skip_verify: false + key: /path/to/key + socket_option: + ip_recover_destination_addr: false + ip_transparent: false + reuse_addr: true + reuse_port: true + tcp_cork: false + tcp_defer_accept: false + tcp_fast_open: false + tcp_no_delay: false + tcp_quick_ack: false + 
keepalive: + permit_without_stream: false + time: "" + timeout: 30s + tls: + ca: /path/to/ca + cert: /path/to/cert + enabled: false + insecure_skip_verify: false + key: /path/to/key diff --git a/k8s/index/job/save/cronjob.yaml b/k8s/index/job/save/cronjob.yaml index e69de29bb2..cc7fb2a648 100644 --- a/k8s/index/job/save/cronjob.yaml +++ b/k8s/index/job/save/cronjob.yaml @@ -0,0 +1,144 @@ +# +# Copyright (C) 2019-2024 vdaas.org vald team +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# You may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +apiVersion: batch/v1 +kind: CronJob +metadata: + name: vald-index-save + labels: + app: vald-index-save + app.kubernetes.io/name: vald + helm.sh/chart: vald-v1.7.12 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/instance: release-name + app.kubernetes.io/component: vald-index-save + app.kubernetes.io/version: v1.7.12 +spec: + schedule: "0 */3 * * *" + concurrencyPolicy: Forbid + suspend: false + startingDeadlineSeconds: 43200 + jobTemplate: + spec: + ttlSecondsAfterFinished: 86400 + template: + metadata: + labels: + app: vald-index-save + app.kubernetes.io/name: vald + helm.sh/chart: vald-v1.7.12 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/instance: release-name + app.kubernetes.io/component: vald-index-save + app.kubernetes.io/version: v1.7.12 + annotations: + pyroscope.io/scrape: "true" + pyroscope.io/application-name: vald-index-save + pyroscope.io/profile-cpu-enabled: "true" + pyroscope.io/profile-mem-enabled: "true" + pyroscope.io/port: "6060" + spec: + 
initContainers: + - name: wait-for-agent + image: busybox:stable + command: + - /bin/sh + - -e + - -c + - | + until [ "$(wget --server-response --spider --quiet http://vald-agent.default.svc.cluster.local:3001/readiness 2>&1 | awk 'NR==1{print $2}')" == "200" ]; do + echo "waiting for agent to be ready..." + sleep 2; + done + - name: wait-for-discoverer + image: busybox:stable + command: + - /bin/sh + - -e + - -c + - | + until [ "$(wget --server-response --spider --quiet http://vald-discoverer.default.svc.cluster.local:3001/readiness 2>&1 | awk 'NR==1{print $2}')" == "200" ]; do + echo "waiting for discoverer to be ready..." + sleep 2; + done + containers: + - name: vald-index-save + image: "vdaas/vald-index-save:nightly" + imagePullPolicy: Always + volumeMounts: + - name: vald-index-save-config + mountPath: /etc/server/ + livenessProbe: + failureThreshold: 2 + httpGet: + path: /liveness + port: liveness + scheme: HTTP + initialDelaySeconds: 5 + periodSeconds: 3 + successThreshold: 1 + timeoutSeconds: 2 + readinessProbe: + failureThreshold: 2 + httpGet: + path: /readiness + port: readiness + scheme: HTTP + initialDelaySeconds: 10 + periodSeconds: 3 + successThreshold: 1 + timeoutSeconds: 2 + startupProbe: + failureThreshold: 30 + httpGet: + path: /liveness + port: liveness + scheme: HTTP + initialDelaySeconds: 5 + periodSeconds: 5 + successThreshold: 1 + timeoutSeconds: 2 + ports: + - name: liveness + protocol: TCP + containerPort: 3000 + - name: readiness + protocol: TCP + containerPort: 3001 + - name: grpc + protocol: TCP + containerPort: 8081 + - name: pprof + protocol: TCP + containerPort: 6060 + env: + - name: MY_NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + - name: MY_POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: MY_POD_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + restartPolicy: OnFailure + volumes: + - name: vald-index-save-config + configMap: + defaultMode: 420 + name: vald-index-save-config 
diff --git a/k8s/index/operator/configmap.yaml b/k8s/index/operator/configmap.yaml index e69de29bb2..16f8ea69b8 100644 --- a/k8s/index/operator/configmap.yaml +++ b/k8s/index/operator/configmap.yaml @@ -0,0 +1,28 @@ +# +# Copyright (C) 2019-2024 vdaas.org vald team +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# You may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +apiVersion: v1 +kind: ConfigMap +metadata: + name: vald-index-operator-config + labels: + app.kubernetes.io/name: vald + helm.sh/chart: vald-v1.7.12 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: v1.7.12 + app.kubernetes.io/component: index-operator +data: + config.yaml: "---\nversion: v0.0.0\ntime_zone: UTC\nlogging:\n format: raw\n level: debug\n logger: glg\nserver_config:\n servers:\n - name: grpc\n host: 0.0.0.0\n port: 8081\n grpc:\n bidirectional_stream_concurrency: 20\n connection_timeout: \"\"\n enable_admin: true\n enable_reflection: true\n header_table_size: 0\n initial_conn_window_size: 2097152\n initial_window_size: 1048576\n interceptors:\n - RecoverInterceptor\n keepalive:\n max_conn_age: \"\"\n max_conn_age_grace: \"\"\n max_conn_idle: \"\"\n min_time: 10m\n permit_without_stream: false\n time: 3h\n timeout: 60s\n max_header_list_size: 0\n max_receive_message_size: 0\n max_send_message_size: 0\n read_buffer_size: 0\n write_buffer_size: 0\n mode: GRPC\n network: tcp\n probe_wait_time: 3s\n restart: true\n socket_option:\n ip_recover_destination_addr: false\n ip_transparent: 
false\n reuse_addr: true\n reuse_port: true\n tcp_cork: false\n tcp_defer_accept: false\n tcp_fast_open: false\n tcp_no_delay: false\n tcp_quick_ack: false\n socket_path: \"\"\n health_check_servers:\n - name: liveness\n host: 0.0.0.0\n port: 3000\n http:\n handler_timeout: \"\"\n idle_timeout: \"\"\n read_header_timeout: \"\"\n read_timeout: \"\"\n shutdown_duration: 5s\n write_timeout: \"\"\n mode: REST\n network: tcp\n probe_wait_time: 3s\n restart: true\n socket_option:\n ip_recover_destination_addr: false\n ip_transparent: false\n reuse_addr: true\n reuse_port: true\n tcp_cork: false\n tcp_defer_accept: false\n tcp_fast_open: true\n tcp_no_delay: true\n tcp_quick_ack: true\n socket_path: \"\"\n - name: readiness\n host: 0.0.0.0\n port: 3001\n http:\n handler_timeout: \"\"\n idle_timeout: \"\"\n read_header_timeout: \"\"\n read_timeout: \"\"\n shutdown_duration: 0s\n write_timeout: \"\"\n mode: REST\n network: tcp\n probe_wait_time: 3s\n restart: true\n socket_option:\n ip_recover_destination_addr: false\n ip_transparent: false\n reuse_addr: true\n reuse_port: true\n tcp_cork: false\n tcp_defer_accept: false\n tcp_fast_open: true\n tcp_no_delay: true\n tcp_quick_ack: true\n socket_path: \"\"\n metrics_servers:\n - name: pprof\n host: 0.0.0.0\n port: 6060\n http:\n handler_timeout: 5s\n idle_timeout: 2s\n read_header_timeout: 1s\n read_timeout: 1s\n shutdown_duration: 5s\n write_timeout: 1m\n mode: REST\n network: tcp\n probe_wait_time: 3s\n restart: true\n socket_option:\n ip_recover_destination_addr: false\n ip_transparent: false\n reuse_addr: true\n reuse_port: true\n tcp_cork: true\n tcp_defer_accept: false\n tcp_fast_open: false\n tcp_no_delay: false\n tcp_quick_ack: false\n socket_path: \"\"\n startup_strategy:\n - liveness\n - pprof\n - grpc\n - readiness\n shutdown_strategy:\n - readiness\n - grpc\n - pprof\n - liveness\n full_shutdown_duration: 600s\n tls:\n ca: /path/to/ca\n cert: /path/to/cert\n enabled: false\n insecure_skip_verify: false\n key: 
/path/to/key\nobservability:\n enabled: false\n otlp:\n collector_endpoint: \"\"\n trace_batch_timeout: \"1s\"\n trace_export_timeout: \"1m\"\n trace_max_export_batch_size: 1024\n trace_max_queue_size: 256\n metrics_export_interval: \"1s\"\n metrics_export_timeout: \"1m\"\n attribute:\n namespace: \"_MY_POD_NAMESPACE_\"\n pod_name: \"_MY_POD_NAME_\"\n node_name: \"_MY_NODE_NAME_\"\n service_name: \"vald-index-operator\"\n metrics:\n enable_cgo: true\n enable_goroutine: true\n enable_memory: true\n enable_version_info: true\n version_info_labels:\n - vald_version\n - server_name\n - git_commit\n - build_time\n - go_version\n - go_os\n - go_arch\n - algorithm_info\n trace:\n enabled: false\noperator:\n namespace: _MY_POD_NAMESPACE_\n agent_name: vald-agent\n agent_namespace: \n rotator_name: vald-readreplica-rotate\n target_read_replica_id_annotations_key: vald.vdaas.org/target-read-replica-id\n rotation_job_concurrency: 2\n read_replica_enabled: false\n read_replica_label_key: vald-readreplica-id\n job_templates:\n rotate:\n apiVersion: batch/v1\n kind: Job\n metadata:\n name: vald-readreplica-rotate\n labels:\n app: vald-readreplica-rotate\n app.kubernetes.io/name: vald\n helm.sh/chart: vald-v1.7.12\n app.kubernetes.io/managed-by: Helm\n app.kubernetes.io/instance: release-name\n app.kubernetes.io/component: vald-readreplica-rotate\n app.kubernetes.io/version: v1.7.12\n spec:\n ttlSecondsAfterFinished: 86400\n template:\n metadata:\n labels:\n app: vald-readreplica-rotate\n app.kubernetes.io/name: vald\n helm.sh/chart: vald-v1.7.12\n app.kubernetes.io/managed-by: Helm\n app.kubernetes.io/instance: release-name\n app.kubernetes.io/component: vald-readreplica-rotate\n app.kubernetes.io/version: v1.7.12\n annotations:\n pyroscope.io/scrape: \"true\"\n pyroscope.io/application-name: vald-readreplica-rotate\n pyroscope.io/profile-cpu-enabled: \"true\"\n pyroscope.io/profile-mem-enabled: \"true\"\n pyroscope.io/port: \"6060\"\n spec:\n containers:\n - name: 
vald-readreplica-rotate\n image: \"vdaas/vald-readreplica-rotate:nightly\"\n imagePullPolicy: Always\n volumeMounts:\n - name: vald-readreplica-rotate-config\n mountPath: /etc/server/\n livenessProbe:\n failureThreshold: 2\n httpGet:\n path: /liveness\n port: liveness\n scheme: HTTP\n initialDelaySeconds: 5\n periodSeconds: 3\n successThreshold: 1\n timeoutSeconds: 2\n readinessProbe:\n failureThreshold: 2\n httpGet:\n path: /readiness\n port: readiness\n scheme: HTTP\n initialDelaySeconds: 10\n periodSeconds: 3\n successThreshold: 1\n timeoutSeconds: 2\n startupProbe:\n failureThreshold: 30\n httpGet:\n path: /liveness\n port: liveness\n scheme: HTTP\n initialDelaySeconds: 5\n periodSeconds: 5\n successThreshold: 1\n timeoutSeconds: 2\n ports:\n - name: liveness\n protocol: TCP\n containerPort: 3000\n - name: readiness\n protocol: TCP\n containerPort: 3001\n - name: grpc\n protocol: TCP\n containerPort: 8081\n - name: pprof\n protocol: TCP\n containerPort: 6060\n securityContext:\n allowPrivilegeEscalation: false\n capabilities:\n drop:\n - ALL\n privileged: false\n readOnlyRootFilesystem: true\n runAsGroup: 65532\n runAsNonRoot: true\n runAsUser: 65532\n env:\n - name: MY_NODE_NAME\n valueFrom:\n fieldRef:\n fieldPath: spec.nodeName\n - name: MY_POD_NAME\n valueFrom:\n fieldRef:\n fieldPath: metadata.name\n - name: MY_POD_NAMESPACE\n valueFrom:\n fieldRef:\n fieldPath: metadata.namespace\n - name: TARGET_READREPLICA_ID_RELEASE_NAME_DEFAULT_VALD\n valueFrom:\n fieldRef:\n fieldPath: metadata.annotations['vald.vdaas.org/target-read-replica-id']\n securityContext:\n fsGroup: 65532\n fsGroupChangePolicy: OnRootMismatch\n runAsGroup: 65532\n runAsNonRoot: true\n runAsUser: 65532\n restartPolicy: OnFailure\n volumes:\n - name: vald-readreplica-rotate-config\n configMap:\n defaultMode: 420\n name: vald-readreplica-rotate-config\n serviceAccountName: vald-readreplica-rotate\n creation:\n apiVersion: batch/v1\n kind: Job\n metadata:\n name: vald-index-creation\n labels:\n 
app: vald-index-creation\n app.kubernetes.io/name: vald\n helm.sh/chart: vald-v1.7.12\n app.kubernetes.io/managed-by: Helm\n app.kubernetes.io/instance: release-name\n app.kubernetes.io/component: vald-index-creation\n app.kubernetes.io/version: v1.7.12\n spec:\n ttlSecondsAfterFinished: 86400\n template:\n metadata:\n labels:\n app: vald-index-creation\n app.kubernetes.io/name: vald\n helm.sh/chart: vald-v1.7.12\n app.kubernetes.io/managed-by: Helm\n app.kubernetes.io/instance: release-name\n app.kubernetes.io/component: vald-index-creation\n app.kubernetes.io/version: v1.7.12\n annotations:\n pyroscope.io/scrape: \"true\"\n pyroscope.io/application-name: vald-index-creation\n pyroscope.io/profile-cpu-enabled: \"true\"\n pyroscope.io/profile-mem-enabled: \"true\"\n pyroscope.io/port: \"6060\"\n spec:\n initContainers:\n - name: wait-for-agent\n image: busybox:stable\n command:\n - /bin/sh\n - -e\n - -c\n - |\n until [ \"$(wget --server-response --spider --quiet http://vald-agent.default.svc.cluster.local:3001/readiness 2>&1 | awk 'NR==1{print $2}')\" == \"200\" ]; do\n echo \"waiting for agent to be ready...\"\n sleep 2;\n done\n - name: wait-for-discoverer\n image: busybox:stable\n command:\n - /bin/sh\n - -e\n - -c\n - |\n until [ \"$(wget --server-response --spider --quiet http://vald-discoverer.default.svc.cluster.local:3001/readiness 2>&1 | awk 'NR==1{print $2}')\" == \"200\" ]; do\n echo \"waiting for discoverer to be ready...\"\n sleep 2;\n done\n containers:\n - name: vald-index-creation\n image: \"vdaas/vald-index-creation:nightly\"\n imagePullPolicy: Always\n volumeMounts:\n - name: vald-index-creation-config\n mountPath: /etc/server/\n livenessProbe:\n failureThreshold: 2\n httpGet:\n path: /liveness\n port: liveness\n scheme: HTTP\n initialDelaySeconds: 5\n periodSeconds: 3\n successThreshold: 1\n timeoutSeconds: 2\n readinessProbe:\n failureThreshold: 2\n httpGet:\n path: /readiness\n port: readiness\n scheme: HTTP\n initialDelaySeconds: 10\n 
periodSeconds: 3\n successThreshold: 1\n timeoutSeconds: 2\n startupProbe:\n failureThreshold: 30\n httpGet:\n path: /liveness\n port: liveness\n scheme: HTTP\n initialDelaySeconds: 5\n periodSeconds: 5\n successThreshold: 1\n timeoutSeconds: 2\n ports:\n - name: liveness\n protocol: TCP\n containerPort: 3000\n - name: readiness\n protocol: TCP\n containerPort: 3001\n - name: grpc\n protocol: TCP\n containerPort: 8081\n - name: pprof\n protocol: TCP\n containerPort: 6060\n env:\n - name: MY_NODE_NAME\n valueFrom:\n fieldRef:\n fieldPath: spec.nodeName\n - name: MY_POD_NAME\n valueFrom:\n fieldRef:\n fieldPath: metadata.name\n - name: MY_POD_NAMESPACE\n valueFrom:\n fieldRef:\n fieldPath: metadata.namespace\n restartPolicy: OnFailure\n volumes:\n - name: vald-index-creation-config\n configMap:\n defaultMode: 420\n name: vald-index-creation-config\n save:\n apiVersion: batch/v1\n kind: Job\n metadata:\n name: vald-index-save\n labels:\n app: vald-index-save\n app.kubernetes.io/name: vald\n helm.sh/chart: vald-v1.7.12\n app.kubernetes.io/managed-by: Helm\n app.kubernetes.io/instance: release-name\n app.kubernetes.io/component: vald-index-save\n app.kubernetes.io/version: v1.7.12\n spec:\n ttlSecondsAfterFinished: 86400\n template:\n metadata:\n labels:\n app: vald-index-save\n app.kubernetes.io/name: vald\n helm.sh/chart: vald-v1.7.12\n app.kubernetes.io/managed-by: Helm\n app.kubernetes.io/instance: release-name\n app.kubernetes.io/component: vald-index-save\n app.kubernetes.io/version: v1.7.12\n annotations:\n pyroscope.io/scrape: \"true\"\n pyroscope.io/application-name: vald-index-save\n pyroscope.io/profile-cpu-enabled: \"true\"\n pyroscope.io/profile-mem-enabled: \"true\"\n pyroscope.io/port: \"6060\"\n spec:\n initContainers:\n - name: wait-for-agent\n image: busybox:stable\n command:\n - /bin/sh\n - -e\n - -c\n - |\n until [ \"$(wget --server-response --spider --quiet http://vald-agent.default.svc.cluster.local:3001/readiness 2>&1 | awk 'NR==1{print $2}')\" == 
\"200\" ]; do\n echo \"waiting for agent to be ready...\"\n sleep 2;\n done\n - name: wait-for-discoverer\n image: busybox:stable\n command:\n - /bin/sh\n - -e\n - -c\n - |\n until [ \"$(wget --server-response --spider --quiet http://vald-discoverer.default.svc.cluster.local:3001/readiness 2>&1 | awk 'NR==1{print $2}')\" == \"200\" ]; do\n echo \"waiting for discoverer to be ready...\"\n sleep 2;\n done\n containers:\n - name: vald-index-save\n image: \"vdaas/vald-index-save:nightly\"\n imagePullPolicy: Always\n volumeMounts:\n - name: vald-index-save-config\n mountPath: /etc/server/\n livenessProbe:\n failureThreshold: 2\n httpGet:\n path: /liveness\n port: liveness\n scheme: HTTP\n initialDelaySeconds: 5\n periodSeconds: 3\n successThreshold: 1\n timeoutSeconds: 2\n readinessProbe:\n failureThreshold: 2\n httpGet:\n path: /readiness\n port: readiness\n scheme: HTTP\n initialDelaySeconds: 10\n periodSeconds: 3\n successThreshold: 1\n timeoutSeconds: 2\n startupProbe:\n failureThreshold: 30\n httpGet:\n path: /liveness\n port: liveness\n scheme: HTTP\n initialDelaySeconds: 5\n periodSeconds: 5\n successThreshold: 1\n timeoutSeconds: 2\n ports:\n - name: liveness\n protocol: TCP\n containerPort: 3000\n - name: readiness\n protocol: TCP\n containerPort: 3001\n - name: grpc\n protocol: TCP\n containerPort: 8081\n - name: pprof\n protocol: TCP\n containerPort: 6060\n env:\n - name: MY_NODE_NAME\n valueFrom:\n fieldRef:\n fieldPath: spec.nodeName\n - name: MY_POD_NAME\n valueFrom:\n fieldRef:\n fieldPath: metadata.name\n - name: MY_POD_NAMESPACE\n valueFrom:\n fieldRef:\n fieldPath: metadata.namespace\n restartPolicy: OnFailure\n volumes:\n - name: vald-index-save-config\n configMap:\n defaultMode: 420\n name: vald-index-save-config\n correction:\n apiVersion: batch/v1\n kind: Job\n metadata:\n name: vald-index-correction\n labels:\n app: vald-index-correction\n app.kubernetes.io/name: vald\n helm.sh/chart: vald-v1.7.12\n app.kubernetes.io/managed-by: Helm\n 
app.kubernetes.io/instance: release-name\n app.kubernetes.io/component: vald-index-correction\n app.kubernetes.io/version: v1.7.12\n spec:\n ttlSecondsAfterFinished: 86400\n template:\n metadata:\n labels:\n app: vald-index-correction\n app.kubernetes.io/name: vald\n helm.sh/chart: vald-v1.7.12\n app.kubernetes.io/managed-by: Helm\n app.kubernetes.io/instance: release-name\n app.kubernetes.io/component: vald-index-correction\n app.kubernetes.io/version: v1.7.12\n annotations:\n pyroscope.io/scrape: \"true\"\n pyroscope.io/application-name: vald-index-correction\n pyroscope.io/profile-cpu-enabled: \"true\"\n pyroscope.io/profile-mem-enabled: \"true\"\n pyroscope.io/port: \"6060\"\n spec:\n initContainers:\n - name: wait-for-agent\n image: busybox:stable\n command:\n - /bin/sh\n - -e\n - -c\n - |\n until [ \"$(wget --server-response --spider --quiet http://vald-agent.default.svc.cluster.local:3001/readiness 2>&1 | awk 'NR==1{print $2}')\" == \"200\" ]; do\n echo \"waiting for agent to be ready...\"\n sleep 2;\n done\n - name: wait-for-discoverer\n image: busybox:stable\n command:\n - /bin/sh\n - -e\n - -c\n - |\n until [ \"$(wget --server-response --spider --quiet http://vald-discoverer.default.svc.cluster.local:3001/readiness 2>&1 | awk 'NR==1{print $2}')\" == \"200\" ]; do\n echo \"waiting for discoverer to be ready...\"\n sleep 2;\n done\n containers:\n - name: vald-index-correction\n image: \"vdaas/vald-index-correction:nightly\"\n imagePullPolicy: Always\n volumeMounts:\n - name: vald-index-correction-config\n mountPath: /etc/server/\n livenessProbe:\n failureThreshold: 2\n httpGet:\n path: /liveness\n port: liveness\n scheme: HTTP\n initialDelaySeconds: 5\n periodSeconds: 3\n successThreshold: 1\n timeoutSeconds: 2\n readinessProbe:\n failureThreshold: 2\n httpGet:\n path: /readiness\n port: readiness\n scheme: HTTP\n initialDelaySeconds: 10\n periodSeconds: 3\n successThreshold: 1\n timeoutSeconds: 2\n startupProbe:\n failureThreshold: 30\n httpGet:\n path: 
/liveness\n port: liveness\n scheme: HTTP\n initialDelaySeconds: 5\n periodSeconds: 5\n successThreshold: 1\n timeoutSeconds: 2\n ports:\n - name: liveness\n protocol: TCP\n containerPort: 3000\n - name: readiness\n protocol: TCP\n containerPort: 3001\n - name: grpc\n protocol: TCP\n containerPort: 8081\n - name: pprof\n protocol: TCP\n containerPort: 6060\n env:\n - name: MY_NODE_NAME\n valueFrom:\n fieldRef:\n fieldPath: spec.nodeName\n - name: MY_POD_NAME\n valueFrom:\n fieldRef:\n fieldPath: metadata.name\n - name: MY_POD_NAMESPACE\n valueFrom:\n fieldRef:\n fieldPath: metadata.namespace\n restartPolicy: OnFailure\n volumes:\n - name: vald-index-correction-config\n configMap:\n defaultMode: 420\n name: vald-index-correction-config\n" diff --git a/k8s/index/operator/deployment.yaml b/k8s/index/operator/deployment.yaml index e69de29bb2..31d8ff67e3 100644 --- a/k8s/index/operator/deployment.yaml +++ b/k8s/index/operator/deployment.yaml @@ -0,0 +1,173 @@ +# +# Copyright (C) 2019-2024 vdaas.org vald team +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# You may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# +apiVersion: apps/v1 +kind: Deployment +metadata: + name: vald-index-operator + labels: + app: vald-index-operator + app.kubernetes.io/name: vald + helm.sh/chart: vald-v1.7.12 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: v1.7.12 + app.kubernetes.io/component: index-operator +spec: + progressDeadlineSeconds: 600 + replicas: 1 + revisionHistoryLimit: 2 + selector: + matchLabels: + app: vald-index-operator + strategy: + type: RollingUpdate + rollingUpdate: + maxSurge: 25% + maxUnavailable: 25% + template: + metadata: + creationTimestamp: null + labels: + app: vald-index-operator + app.kubernetes.io/name: vald + app.kubernetes.io/instance: release-name + app.kubernetes.io/component: operator + annotations: + checksum/configmap: 8f9bc477a217614027e1ee6f904fe1cd192d6d171b5d52746f56e703347c500e + pyroscope.io/scrape: "true" + pyroscope.io/application-name: vald-index-operator + pyroscope.io/profile-cpu-enabled: "true" + pyroscope.io/profile-mem-enabled: "true" + pyroscope.io/port: "6060" + spec: + affinity: + nodeAffinity: + preferredDuringSchedulingIgnoredDuringExecution: [] + podAffinity: + preferredDuringSchedulingIgnoredDuringExecution: [] + requiredDuringSchedulingIgnoredDuringExecution: [] + podAntiAffinity: + preferredDuringSchedulingIgnoredDuringExecution: + - podAffinityTerm: + labelSelector: + matchExpressions: + - key: app + operator: In + values: + - vald-index-operator + topologyKey: kubernetes.io/hostname + weight: 100 + requiredDuringSchedulingIgnoredDuringExecution: [] + containers: + - name: vald-index-operator + image: "vdaas/vald-index-operator:nightly" + imagePullPolicy: Always + livenessProbe: + failureThreshold: 2 + httpGet: + path: /liveness + port: liveness + scheme: HTTP + initialDelaySeconds: 5 + periodSeconds: 3 + successThreshold: 1 + timeoutSeconds: 2 + readinessProbe: + failureThreshold: 2 + httpGet: + path: /readiness + port: readiness + scheme: HTTP + initialDelaySeconds: 10 
+ periodSeconds: 3 + successThreshold: 1 + timeoutSeconds: 2 + startupProbe: + failureThreshold: 30 + httpGet: + path: /liveness + port: liveness + scheme: HTTP + initialDelaySeconds: 5 + periodSeconds: 5 + successThreshold: 1 + timeoutSeconds: 2 + ports: + - name: liveness + protocol: TCP + containerPort: 3000 + - name: readiness + protocol: TCP + containerPort: 3001 + - name: grpc + protocol: TCP + containerPort: 8081 + - name: pprof + protocol: TCP + containerPort: 6060 + resources: + limits: + cpu: 600m + memory: 200Mi + requests: + cpu: 200m + memory: 65Mi + terminationMessagePath: /dev/termination-log + terminationMessagePolicy: File + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + privileged: false + readOnlyRootFilesystem: true + runAsGroup: 65532 + runAsNonRoot: true + runAsUser: 65532 + env: + - name: MY_NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + - name: MY_POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: MY_POD_NAMESPACE + valueFrom: + fieldRef: + fieldPath: metadata.namespace + volumeMounts: + - name: vald-index-operator-config + mountPath: /etc/server/ + dnsPolicy: ClusterFirst + restartPolicy: Always + schedulerName: default-scheduler + serviceAccountName: vald + securityContext: + fsGroup: 65532 + fsGroupChangePolicy: OnRootMismatch + runAsGroup: 65532 + runAsNonRoot: true + runAsUser: 65532 + terminationGracePeriodSeconds: 30 + volumes: + - name: vald-index-operator-config + configMap: + defaultMode: 420 + name: vald-index-operator-config + priorityClassName: default-vald-index-operator-priority +status: diff --git a/k8s/index/operator/priorityclass.yaml b/k8s/index/operator/priorityclass.yaml index e69de29bb2..cb16e6acca 100644 --- a/k8s/index/operator/priorityclass.yaml +++ b/k8s/index/operator/priorityclass.yaml @@ -0,0 +1,30 @@ +# +# Copyright (C) 2019-2024 vdaas.org vald team +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# You may not use 
this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +apiVersion: scheduling.k8s.io/v1 +kind: PriorityClass +metadata: + name: default-vald-index-operator-priority + labels: + app.kubernetes.io/name: vald + helm.sh/chart: vald-v1.7.12 + app.kubernetes.io/managed-by: Helm + app.kubernetes.io/instance: release-name + app.kubernetes.io/version: v1.7.12 + app.kubernetes.io/component: index-operator +value: 1e+06 +preemptionPolicy: Never +globalDefault: false +description: "A priority class for Vald index operator." diff --git a/k8s/operator/helm/crds/valdrelease.yaml b/k8s/operator/helm/crds/valdrelease.yaml index 88657d76e0..99c4cc0ec6 100644 --- a/k8s/operator/helm/crds/valdrelease.yaml +++ b/k8s/operator/helm/crds/valdrelease.yaml @@ -7888,6 +7888,181 @@ spec: items: type: object x-kubernetes-preserve-unknown-fields: true + gateway: + type: object + properties: + addrs: + type: array + items: + type: string + backoff: + type: object + properties: + backoff_factor: + type: number + backoff_time_limit: + type: string + enable_error_log: + type: boolean + initial_duration: + type: string + jitter_limit: + type: string + maximum_duration: + type: string + retry_count: + type: integer + call_option: + type: object + x-kubernetes-preserve-unknown-fields: true + circuit_breaker: + type: object + properties: + closed_error_rate: + type: number + closed_refresh_timeout: + type: string + half_open_error_rate: + type: number + min_samples: + type: integer + open_timeout: + type: string + connection_pool: + type: object + properties: + enable_dns_resolver: + type: 
boolean + enable_rebalance: + type: boolean + old_conn_close_duration: + type: string + rebalance_duration: + type: string + size: + type: integer + dial_option: + type: object + properties: + backoff_base_delay: + type: string + backoff_jitter: + type: number + backoff_max_delay: + type: string + backoff_multiplier: + type: number + enable_backoff: + type: boolean + initial_connection_window_size: + type: integer + initial_window_size: + type: integer + insecure: + type: boolean + interceptors: + type: array + items: + type: string + enum: + - TraceInterceptor + keepalive: + type: object + properties: + permit_without_stream: + type: boolean + time: + type: string + timeout: + type: string + max_msg_size: + type: integer + min_connection_timeout: + type: string + net: + type: object + properties: + dialer: + type: object + properties: + dual_stack_enabled: + type: boolean + keepalive: + type: string + timeout: + type: string + dns: + type: object + properties: + cache_enabled: + type: boolean + cache_expiration: + type: string + refresh_duration: + type: string + socket_option: + type: object + properties: + ip_recover_destination_addr: + type: boolean + ip_transparent: + type: boolean + reuse_addr: + type: boolean + reuse_port: + type: boolean + tcp_cork: + type: boolean + tcp_defer_accept: + type: boolean + tcp_fast_open: + type: boolean + tcp_no_delay: + type: boolean + tcp_quick_ack: + type: boolean + tls: + type: object + properties: + ca: + type: string + cert: + type: string + enabled: + type: boolean + insecure_skip_verify: + type: boolean + key: + type: string + read_buffer_size: + type: integer + timeout: + type: string + write_buffer_size: + type: integer + health_check_duration: + type: string + max_recv_msg_size: + type: integer + max_retry_rpc_buffer_size: + type: integer + max_send_msg_size: + type: integer + tls: + type: object + properties: + ca: + type: string + cert: + type: string + enabled: + type: boolean + insecure_skip_verify: + type: 
boolean + key: + type: string + wait_for_ready: + type: boolean image: type: object properties: diff --git a/pkg/index/job/correction/service/corrector.go b/pkg/index/job/correction/service/corrector.go index 073bf39e02..a68723d969 100644 --- a/pkg/index/job/correction/service/corrector.go +++ b/pkg/index/job/correction/service/corrector.go @@ -18,7 +18,6 @@ import ( "context" "fmt" "io" - "os" "reflect" "slices" "sync/atomic" @@ -27,9 +26,9 @@ import ( "github.com/vdaas/vald/apis/grpc/v1/payload" "github.com/vdaas/vald/apis/grpc/v1/vald" "github.com/vdaas/vald/internal/client/v1/client/discoverer" - "github.com/vdaas/vald/internal/db/kvs/bbolt" + vc "github.com/vdaas/vald/internal/client/v1/client/vald" + "github.com/vdaas/vald/internal/db/kvs/pogreb" "github.com/vdaas/vald/internal/errors" - "github.com/vdaas/vald/internal/file" "github.com/vdaas/vald/internal/log" "github.com/vdaas/vald/internal/net/grpc" "github.com/vdaas/vald/internal/net/grpc/codes" @@ -39,19 +38,11 @@ import ( "github.com/vdaas/vald/internal/sync/errgroup" ) -type contextTimeKey string - -const ( - insertMethod = "core.v1.Vald/Insert" - updateMethod = "core.v1.Vald/Update" - deleteMethod = "core.v1.Vald/Delete" - correctionStartTimeKey contextTimeKey = "correctionStartTimeKey" -) - type Corrector interface { Start(ctx context.Context) error StartClient(ctx context.Context) (<-chan error, error) PreStop(ctx context.Context) error + // For metrics NumberOfCheckedIndex() uint64 NumberOfCorrectedOldIndex() uint64 @@ -59,19 +50,17 @@ type Corrector interface { } type correct struct { - discoverer discoverer.Client - agentAddrs []string - sortedByIndexCntAddrs []string - uuidsCount uint32 - uncommittedUUIDsCount uint32 - checkedID bbolt.Bbolt + eg errgroup.Group + discoverer discoverer.Client + gateway vc.Client + checkedList pogreb.DB + checkedIndexCount atomic.Uint64 correctedOldIndexCount atomic.Uint64 correctedReplicationCount atomic.Uint64 - indexReplica int - streamListConcurrency int - 
bboltAsyncWriteConcurrency int + indexReplica int + streamListConcurrency int } const filemode = 0o600 @@ -89,596 +78,364 @@ func New(opts ...Option) (_ Corrector, err error) { log.Warn(oerr) } } - if err := c.bboltInit(); err != nil { - return nil, err - } return c, nil } -func (c *correct) bboltInit() error { - dpath := file.Join(os.TempDir(), "bbolt") - err := file.MkdirAll(dpath, os.ModePerm) +func (c *correct) StartClient(ctx context.Context) (_ <-chan error, err error) { + ech := make(chan error, 2) + gch, err := c.gateway.Start(ctx) if err != nil { - return err + return nil, err } - - dbfile := file.Join(dpath, "checkedid.db") - c.checkedID, err = bbolt.New(dbfile, "", os.FileMode(filemode)) + dch, err := c.discoverer.Start(ctx) if err != nil { - return err + return nil, err } - return nil -} - -func (c *correct) StartClient(ctx context.Context) (<-chan error, error) { - return c.discoverer.Start(ctx) + c.eg.Go(safety.RecoverFunc(func() (err error) { + defer close(ech) + for { + select { + case <-ctx.Done(): + return ctx.Err() + case err = <-dch: + case err = <-gch: + } + if err != nil { + select { + case <-ctx.Done(): + return ctx.Err() + case ech <- err: + } + } + } + })) + return ech, nil } -func (c *correct) Start(ctx context.Context) error { - // set current time to context - ctx = embedTime(ctx) - - // addrs is sorted by the memory usage of each agent(descending order) - // this is decending because it's supposed to be used for index manager to decide - // which pod to make a create index rpc(higher memory, first to commit) - c.agentAddrs = c.discoverer.GetAddrs(ctx) - if len(c.agentAddrs) <= 1 { - log.Warnf("target agent (%v) found, but there must be more than two agents for correction to happen", c.agentAddrs) - return errors.ErrAgentReplicaOne - } - log.Debugf("target agent addrs: %v", c.agentAddrs) - - if err := c.loadAgentIndexInfo(ctx); err != nil { +func (c *correct) Start(ctx context.Context) (err error) { + detail, err := 
c.gateway.IndexDetail(ctx, new(payload.Empty)) + if err != nil { return err } - - log.Info("starting correction with bbolt disk cache...") - if err := c.correct(ctx); err != nil { - return err + counts := detail.GetCounts() + agents := make([]string, 0, detail.GetLiveAgents()) + for agent, count := range counts { + log.Infof("index info: addr(%s), stored(%d), uncommitted(%d), indexing=%t, saving=%t", agent, count.GetStored(), count.GetUncommitted(), count.GetIndexing(), count.GetSaving()) + agents = append(agents, agent) } - log.Info("correction finished successfully") - - return nil -} - -func (c *correct) PreStop(_ context.Context) error { - log.Info("removing persistent cache files...") - return c.checkedID.Close(true) -} - -func (c *correct) NumberOfCheckedIndex() uint64 { - return c.checkedIndexCount.Load() -} - -func (c *correct) NumberOfCorrectedOldIndex() uint64 { - return c.correctedOldIndexCount.Load() -} - -func (c *correct) NumberOfCorrectedReplication() uint64 { - return c.correctedReplicationCount.Load() -} + slices.SortFunc(agents, func(left string, right string) int { + return cmp.Compare(counts[left].GetStored(), counts[right].GetStored()) + }) -// skipcq: GO-R1005 -func (c *correct) correct(ctx context.Context) (err error) { - // Vector with time after this should not be processed - correctionStartTime, err := correctionStartTime(ctx) - if err != nil { - log.Errorf("cannot determine correction start time: %w", err) - return err - } + errs := make([]error, 0, len(agents)) - curTargetAgent := 0 - jobErrs := make([]error, 0, c.streamListConcurrency) - if err := c.discoverer.GetClient().OrderedRange(ctx, c.sortedByIndexCntAddrs, - func(ctx context.Context, addr string, conn *grpc.ClientConn, copts ...grpc.CallOption) (err error) { - defer func() { - if err != nil { - // catch the err that happened in this scope using named return err - jobErrs = append(jobErrs, err) - } - curTargetAgent++ - }() + emptyReq := new(payload.Object_List_Request) - // 
context and errgroup for stream.Recv and correction - sctx, scancel := context.WithCancel(ctx) - defer scancel() - seg, sctx := errgroup.WithContext(sctx) - seg.SetLimit(c.streamListConcurrency) + start := time.Now() - // errgroup for bbolt AsyncSet - bolteg, ctx := errgroup.WithContext(ctx) - bolteg.SetLimit(c.bboltAsyncWriteConcurrency) + emptyByte := []byte("vald") - log.Infof("starting correction for agent %s, stream concurrency: %d, bbolt concurrency: %d", addr, c.streamListConcurrency, c.bboltAsyncWriteConcurrency) + corrected := 0 - vc := vald.NewValdClient(conn) - stream, err := vc.StreamListObject(ctx, &payload.Object_List_Request{}) + log.Infof("processing order of agents: %v", agents) + if err := c.discoverer.GetClient().OrderedRange(ctx, agents, func(ctx context.Context, + addr string, + conn *grpc.ClientConn, + copts ...grpc.CallOption, + ) (err error) { + defer func() { if err != nil { - return err + // catch the err that happened in this scope using named return err + errs = append(errs, err) } - - var mu sync.Mutex - // The number of items to be received in advance is not known in advance. - // This is because there is a possibility of new items being inserted during processing. 
- for { - select { - case <-sctx.Done(): - if !errors.Is(sctx.Err(), context.Canceled) { - log.Errorf("context done unexpectedly: %v", sctx.Err()) - } - - // Finalize - err = seg.Wait() - if err != nil { - log.Errorf("err group returned error: %v", err) - } - - berr := bolteg.Wait() - if berr != nil { - log.Errorf("bbolt err group returned error: %v", err) - err = errors.Join(err, berr) + }() + ctx, cancel := context.WithCancelCause(ctx) + eg, egctx := errgroup.WithContext(ctx) + eg.SetLimit(c.streamListConcurrency) + stream, err := vald.NewValdClient(conn).StreamListObject(ctx, emptyReq) + if err != nil { + return err + } + log.Infof("starting correction for agent %s, stream concurrency: %d", addr, c.streamListConcurrency) + // The number of items to receive is not known in advance because new items may be inserted during processing. + for { + select { + case <-ctx.Done(): + if !errors.Is(ctx.Err(), context.Canceled) { + log.Errorf("context done unexpectedly: %v", ctx.Err()) + } + if context.Cause(ctx) != io.EOF { + log.Errorf("context canceled due to: %v", context.Cause(ctx)) + } + err = eg.Wait() + if err != nil { + log.Errorf("err group returned error: %v", err) + } + corrected++ + log.Infof("correction finished for agent %s, processed %d/%d", addr, corrected, len(agents)) + return err // leave the receive loop once the stream context is done; the deferred func records err + default: + res, err := stream.Recv() + if err != nil { + if errors.Is(err, io.EOF) { + cancel(io.EOF) } else { - log.Info("bbolt all batch finished") + cancel(errors.ErrStreamListObjectStreamFinishedUnexpectedly(err)) } - - log.Infof("correction finished for agent %s", addr) - return err - - default: - seg.Go(safety.RecoverFunc(func() error { - mu.Lock() - // As long as we don't stream.Recv() from the stream, we do not consume the memory of the message.
- // So by limiting the number of this errgroup.Go instances, we can limit the memory usage - // https://github.com/grpc/grpc-go/blob/33f9fa2e6e5bcf4cf8fe45133e23779ae6e43f6c/rpc_util.go#L795 - res, err := stream.Recv() - mu.Unlock() - - if err != nil { - if errors.Is(err, io.EOF) { - scancel() - return nil - } - return errors.ErrStreamListObjectStreamFinishedUnexpectedly(err) - } - + } else { + eg.Go(safety.RecoverFunc(func() (err error) { vec := res.GetVector() if vec == nil { st := res.GetStatus() - log.Error(st.GetCode(), st.GetMessage(), st.GetDetails()) + if st != nil { + // TODO: do more detailed error handling + log.Error(st.GetCode(), st.GetMessage(), st.GetDetails()) + } return errors.ErrFailedToReceiveVectorFromStream } // skip if the vector is inserted after correction start - if vec.GetTimestamp() > correctionStartTime.UnixNano() { - log.Debugf("timestamp of vector(id: %s, timestamp: %v) is newer than correction start time(%v). skipping...", - vec.GetId(), - vec.GetTimestamp(), - correctionStartTime.UnixNano(), - ) + if vec.GetTimestamp() > start.UnixNano() { + // TODO: do debug logging here and describe why process are skipped return nil } // check if the index is already checked id := vec.GetId() - _, ok, err := c.checkedID.Get([]byte(id)) + if id == "" { + // TODO: do some error handling here + return nil + } + _, ok, err := c.checkedList.Get(id) if err != nil { - log.Errorf("failed to perform Get from bbolt but still try to finish processing without cache: %v", err) + log.Errorf("failed to perform Get from check list but still try to finish processing without cache: %v", err) } if ok { // already checked index return nil } - - if err := c.checkConsistency( - ctx, - &vectorReplica{ - addr: addr, - vec: vec, + defer func() { + c.checkedList.Set(id, emptyByte) + c.checkedIndexCount.Add(1) + }() + + var ( + latest int64 + otherAgents []string + mu sync.Mutex + found = make(map[string]*payload.Object_Timestamp, len(agents)) + latestAgent = addr + ) +
selfIdx := slices.Index(agents, addr) + if selfIdx != -1 { + otherAgents = slices.Delete(slices.Clone(agents), selfIdx, selfIdx+1) // clone first: slices.Delete edits in place and agents is shared by every goroutine + } else { + otherAgents = agents + } + if err := c.discoverer.GetClient().OrderedRangeConcurrent(egctx, otherAgents, len(otherAgents), + func(ctx context.Context, addr string, conn *grpc.ClientConn, copts ...grpc.CallOption) error { + ots, err := vald.NewValdClient(conn).GetTimestamp(ctx, &payload.Object_TimestampRequest{ + Id: &payload.Object_ID{ + Id: id, + }, + }) + if err != nil { + if st, ok := status.FromError(err); !ok { + log.Errorf("gRPC call returned not a gRPC status error: %v", err) + return err + } else if st.Code() == codes.NotFound { + // when replica of agent > index replica, this happens + return nil + } else { + log.Errorf("failed to GetObject with unexpected error. code: %v, message: %s", st.Code(), st.Message()) + return err + } + } + + // skip if the vector is inserted after correction start + if ots.GetTimestamp() > start.UnixNano() { + log.Debugf("timestamp of vector(id: %s, timestamp: %v) is newer than correction start time(%v). skipping...", + ots.GetId(), + ots.GetTimestamp(), + start.UnixNano(), + ) + return nil + } + mu.Lock() + found[addr] = ots + if latest < ots.GetTimestamp() { + latest = ots.GetTimestamp() + if latest > vec.GetTimestamp() { + latestAgent = addr + } + } + mu.Unlock() + return nil }, - curTargetAgent, ); err != nil { - return errors.ErrFailedToCheckConsistency(err) + return err + } + latestObject := vec + if vec.GetTimestamp() < latest && latestAgent != addr { + _, err := c.discoverer.GetClient().Do(grpc.WithGRPCMethod(ctx, vald.PackageName+"."+vald.ObjectRPCServiceName+"/"+vald.GetObjectRPCName), latestAgent, func(ctx context.Context, + conn *grpc.ClientConn, + copts ...grpc.CallOption, + ) (any, error) { + obj, err := vald.NewObjectClient(conn).GetObject(ctx, &payload.Object_VectorRequest{ + Id: &payload.Object_ID{ + Id: id, + }, + }, copts...)
+ if err != nil { + return nil, err + } + if obj.GetTimestamp() >= latest && obj.GetId() != "" && obj.GetVector() != nil { + latestObject = obj + } + return obj, nil + }) + if err != nil { + // TODO: do some error handling here but do not return here we can continue repaier process using current vec valuer + } + } + if latestObject.Timestamp < latest { + latestObject.Timestamp = latest + } + tss := time.Unix(0, latestObject.GetTimestamp()).Format(time.RFC3339Nano) // timestamp string + for addr, ots := range found { + if latestObject.GetTimestamp() > ots.GetTimestamp() { + _, err := c.discoverer.GetClient().Do(grpc.WithGRPCMethod(ctx, vald.PackageName+"."+vald.UpdateRPCServiceName+"/"+vald.UpdateRPCName), addr, func(ctx context.Context, + conn *grpc.ClientConn, + copts ...grpc.CallOption, + ) (any, error) { + // TODO: use UpdateTimestamp when it's implemented because here we just want to update only the timestamp but not the vector + _, err := vald.NewUpdateClient(conn).Update(ctx, &payload.Update_Request{ + Vector: latestObject, + // TODO: this should be deleted after Config.Timestamp deprecation + Config: &payload.Update_Config{ + // TODO: Decrementing because it's gonna be incremented befor being pushed + // to vqueue in the agent. This is a not ideal workaround for the current vqueue implementation + // so we should consider refactoring vqueue. + Timestamp: latestObject.GetTimestamp() - 1, + }, + }, copts...) + if err != nil { + return nil, err + } + log.Infof("vector successfully updated. address: %s, uuid: %s, timestamp: %s", addr, latestObject.GetId(), tss) + c.correctedOldIndexCount.Add(1) + return nil, nil + }) + if err != nil { + // TODO: do some error handling here but do not return here we can continue repaier process using current vec valuer + log.Infof("timestamp inconsistency detected with vector(id: %s, timestamp: %v). 
updating with the latest vector(id: %s, timestamp: %v)", + ots.GetId(), + ots.GetTimestamp(), + latestObject.GetId(), + latestObject.GetTimestamp(), + ) + log.Error(fmt.Errorf("failed to fix timestamp: %w", err)) + } + } + } + diff := c.indexReplica - len(found) + addrs := c.discoverer.GetAddrs(ctx) + if diff > len(agents)-len(otherAgents) { + log.Infof("replica shortage of vector %s. inserting to other agents...", id) + if len(addrs) == 0 { + return errors.ErrNoAvailableAgentToInsert + } + for _, daddr := range addrs { + if diff > len(agents)-len(otherAgents) && daddr != addr { + _, ok := found[daddr] + if !ok { + log.Infof("inserting replica to %s", daddr) + _, err := c.discoverer.GetClient().Do(grpc.WithGRPCMethod(ctx, vald.PackageName+"."+vald.InsertRPCServiceName+"/"+vald.InsertRPCName), daddr, func(ctx context.Context, + conn *grpc.ClientConn, + copts ...grpc.CallOption, + ) (any, error) { + _, err := vald.NewInsertClient(conn).Insert(ctx, &payload.Insert_Request{ + Vector: latestObject, + Config: &payload.Insert_Config{ + // TODO: this should be deleted after Config.Timestamp deprecation + Timestamp: latestObject.GetTimestamp(), + }, + }, copts...) + if err != nil { + return nil, err + } + diff-- // one fewer replica is missing after a successful insert + c.correctedReplicationCount.Add(1) + return nil, nil + }) + if err != nil { + // TODO: do some error handling here but do not return here we can continue repair process using current vec value + log.Errorf("failed to insert object to agent(%s): %v", daddr, err) + log.Error(fmt.Errorf("failed to fix index replica: %w", err)) + } + } + } + } + } + if diff < len(agents)-len(otherAgents) { + log.Infof("replica oversupply of vector %s.
deleting...", id) + slices.Reverse(addrs) + for _, daddr := range addrs { + if diff < len(agents)-len(otherAgents) { + _, ok := found[daddr] + if ok || daddr == addr { + _, err := c.discoverer.GetClient().Do(grpc.WithGRPCMethod(ctx, vald.PackageName+"."+vald.RemoveRPCServiceName+"/"+vald.RemoveRPCName), daddr, func(ctx context.Context, + conn *grpc.ClientConn, + copts ...grpc.CallOption, + ) (any, error) { + _, err := vald.NewRemoveClient(conn).Remove(ctx, &payload.Remove_Request{ + Id: &payload.Object_ID{ + Id: id, + }, + }, copts...) + if err != nil { + return nil, err + } + diff++ // one fewer surplus replica remains after a successful remove + c.correctedReplicationCount.Add(1) + return nil, nil + }) + if err != nil { + // TODO: do some error handling here but do not return here we can continue repair process using current vec value + log.Errorf("failed to delete object from agent(%s): %v", daddr, err) + log.Error(fmt.Errorf("failed to fix index replica: %w", err)) + } + } + } + } } - - // now this id is checked so set it to the disk cache - c.checkedID.AsyncSet(bolteg, []byte(id), nil) - c.checkedIndexCount.Add(1) - return nil })) } } - }, - ); err != nil { + } + }); err != nil { // This only happnes when ErrGRPCClientConnNotFound is returned. // In other cases, OrderedRange continues processing, so jobErrrs is used to keep track of the error status of correction. return err } - jobErrs = errors.RemoveDuplicates(jobErrs) - return errors.Join(jobErrs...) -} - -type vectorReplica struct { - addr string - vec *payload.Object_Vector -} - -// Validate len(addrs) >= 2 before calling this function. -func (c *correct) checkConsistency( - ctx context.Context, targetReplica *vectorReplica, targetAgentIdx int, -) error { - // leftAgentAddrs is the agents' addr that hasn't been corrected yet.
- leftAgentAddrs := c.sortedByIndexCntAddrs[targetAgentIdx+1:] - - // Vector with time after this should not be processed - correctionStartTime, err := correctionStartTime(ctx) - if err != nil { - log.Errorf("cannot determine correction start time: %w", err) - return err - } - - foundReplicas := make([]*vectorReplica, 0, len(c.sortedByIndexCntAddrs)) - var mu sync.Mutex - if err := c.discoverer.GetClient().OrderedRangeConcurrent(ctx, leftAgentAddrs, len(leftAgentAddrs), - func(ctx context.Context, addr string, conn *grpc.ClientConn, copts ...grpc.CallOption) error { - vecMeta, err := vald.NewValdClient(conn).GetTimestamp(ctx, &payload.Object_TimestampRequest{ - Id: &payload.Object_ID{ - Id: targetReplica.vec.GetId(), - }, - }) - if err != nil { - if st, ok := status.FromError(err); !ok { - log.Errorf("gRPC call returned not a gRPC status error: %v", err) - return err - } else if st.Code() == codes.NotFound { - // when replica of agent > index replica, this happens - return nil - } else { - log.Errorf("failed to GetObject with unexpected error. code: %v, message: %s", st.Code(), st.Message()) - return err - } - } - - // skip if the vector is inserted after correction start - if vecMeta.GetTimestamp() > correctionStartTime.UnixNano() { - log.Debugf("timestamp of vector(id: %s, timestamp: %v) is newer than correction start time(%v). 
skipping...", - vecMeta.GetId(), - vecMeta.GetTimestamp(), - correctionStartTime.UnixNano(), - ) - return nil - } - - mu.Lock() - foundReplicas = append(foundReplicas, &vectorReplica{ - addr: addr, - // the vector itself will be fetched when it's needed - vec: &payload.Object_Vector{ - Id: vecMeta.GetId(), - Timestamp: vecMeta.GetTimestamp(), - }, - }) - mu.Unlock() - - return nil - }, - ); err != nil { - return err - } - - // check timestamps - if err := c.correctTimestamp(ctx, targetReplica, foundReplicas); err != nil { - return fmt.Errorf("failed to fix timestamp: %w", err) - } - - // check replica number - if err := c.correctReplica(ctx, targetReplica, foundReplicas); err != nil { - return fmt.Errorf("failed to fix index replica: %w", err) - } - - return nil -} - -func (c *correct) correctTimestamp( - ctx context.Context, targetReplica *vectorReplica, foundReplicas []*vectorReplica, -) error { - if len(foundReplicas) == 0 { - // no replica found. nothing to do about timestamp - return nil - } - - // skipcq: CRT-D0001 - allReplicas := append(foundReplicas, targetReplica) - - // sort by timestamp - slices.SortFunc(allReplicas, func(i, j *vectorReplica) int { - // largest timestamp means the latest - return cmp.Compare(j.vec.GetTimestamp(), i.vec.GetTimestamp()) - }) - - latest := allReplicas[0] - latestTS := latest.vec.GetTimestamp() - for _, replica := range allReplicas { - if replica.vec.GetTimestamp() == latestTS { - // no inconsistency - continue - } - - // udate the vector with the new one - log.Infof("timestamp inconsistency detected with vector(id: %s, timestamp: %v). updating with the latest vector(id: %s, timestamp: %v)", - replica.vec.GetId(), - replica.vec.GetTimestamp(), - latest.vec.GetId(), - latest.vec.GetTimestamp(), - ) - c.correctedOldIndexCount.Add(1) - if err := c.updateObject(ctx, replica, latest); err != nil { - return err - } - } - - return nil -} - -// correctReplica corrects the number of replicas of the target vector. 
-// skipcq: GO-R1005 -func (c *correct) correctReplica( - ctx context.Context, targetReplica *vectorReplica, foundReplicas []*vectorReplica, -) error { - // diff < 0 means there is less replica than the correct number - existReplica := len(foundReplicas) + 1 - diff := existReplica - c.indexReplica - if diff == 0 { - // replica number is correct - return nil - } - - // availableAddrs = c.agentAddrs - foundReplicas - targetReplica.addr - // here we use c.agentAddrs because we want to decide by memory usage order - // not the number of indexes - availableAddrs := make([]string, 0, len(c.agentAddrs)) - for _, addr := range c.agentAddrs { - if addr == targetReplica.addr { - continue - } - if slices.ContainsFunc(foundReplicas, func(replica *vectorReplica) bool { - return replica.addr == addr - }) { - continue - } - availableAddrs = append(availableAddrs, addr) - } - - // when there are less replicas than the correct number, add the extra replicas - if diff < 0 { - log.Infof("replica shortage of vector %s. inserting to other agents...", targetReplica.vec.GetId()) - c.correctedReplicationCount.Add(1) - if len(availableAddrs) == 0 { - return errors.ErrNoAvailableAgentToInsert - } - - // inserting with the reverse order of availableAddrs since the last agent has the lowest memory usage - for i := len(availableAddrs) - 1; i >= 0 && diff < 0; i-- { - addr := availableAddrs[i] - log.Infof("inserting replica to %s", addr) - if err := c.insertObject(ctx, addr, targetReplica.vec); err != nil { - log.Errorf("failed to insert object to agent(%s): %v", addr, err) - continue - } - diff++ - } - - if diff < 0 { - return errors.ErrFailedToCorrectReplicaNum - } - - return nil - } - - // when there are more replicas than the correct number, delete the extra replicas - log.Infof("replica oversupply of vector %s. 
deleting...", - targetReplica.vec.GetId()) - c.correctedReplicationCount.Add(1) - // delete from myself - if err := c.deleteObject(ctx, targetReplica.addr, targetReplica.vec); err != nil { - log.Errorf("failed to delete object from agent(%s): %v", targetReplica.addr, err) - } else { - diff-- - } - - // delte from others if there's more to delete - for _, replica := range foundReplicas { - if diff == 0 { - break - } - if err := c.deleteObject(ctx, replica.addr, replica.vec); err != nil { - log.Errorf("failed to delete object from agent(%s): %v", replica.addr, err) - continue - } - diff-- - } - - if diff > 0 { - return errors.ErrFailedToCorrectReplicaNum - } - return nil } -func (c *correct) updateObject(ctx context.Context, dest, src *vectorReplica) error { - // check if the src vector has content not just timestamp - if vec := src.vec.GetVector(); len(vec) == 0 { - if err := c.fillVectorField(ctx, src); err != nil { - return err - } - } - - res, err := c.discoverer.GetClient(). - Do(grpc.WithGRPCMethod(ctx, updateMethod), dest.addr, func(ctx context.Context, conn *grpc.ClientConn, copts ...grpc.CallOption) (any, error) { - // TODO: use UpdateTimestamp when it's implemented because here we just want to update only the timestamp but not the vector - return vald.NewUpdateClient(conn).Update(ctx, &payload.Update_Request{ - Vector: src.vec, - // TODO: this should be deleted after Config.Timestamp deprecation - Config: &payload.Update_Config{ - // TODO: Decrementing because it's gonna be incremented befor being pushed - // to vqueue in the agent. This is a not ideal workaround for the current vqueue implementation - // so we should consider refactoring vqueue. - Timestamp: src.vec.GetTimestamp() - 1, - }, - }, copts...) - }) - if err != nil { - return err - } - - if v, ok := res.(*payload.Object_Location); ok { - log.Infof("vector successfully updated. 
address: %s, uuid: %v", dest.addr, v.GetUuid()) - } - - return nil -} - -func (c *correct) fillVectorField(ctx context.Context, replica *vectorReplica) error { - res, err := c.discoverer.GetClient(). - Do(grpc.WithGRPCMethod(ctx, "core.v1.Vald/GetObject"), replica.addr, func(ctx context.Context, conn *grpc.ClientConn, copts ...grpc.CallOption) (any, error) { - return vald.NewValdClient(conn).GetObject(ctx, &payload.Object_VectorRequest{ - Id: &payload.Object_ID{ - Id: replica.vec.GetId(), - }, - }, copts...) - }) - if err != nil { - return err - } - - if v, ok := res.(*payload.Object_Vector); ok { - vec := v.GetVector() - if len(vec) == 0 { - return err - } - replica.vec.Vector = v.GetVector() - } - - return nil -} - -func (c *correct) insertObject( - ctx context.Context, addr string, vector *payload.Object_Vector, -) error { - res, err := c.discoverer.GetClient(). - Do(grpc.WithGRPCMethod(ctx, insertMethod), addr, func(ctx context.Context, conn *grpc.ClientConn, copts ...grpc.CallOption) (any, error) { - return vald.NewInsertClient(conn).Insert(ctx, &payload.Insert_Request{ - Vector: vector, - // TODO: this should be deleted after Config.Timestamp deprecation - Config: &payload.Insert_Config{ - Timestamp: vector.GetTimestamp(), - }, - }, copts...) - }) - if err != nil { - return err - } - - if v, ok := res.(*payload.Object_Location); ok { - log.Infof("vector successfully inserted. address: %s, uuid: %v", addr, v.GetUuid()) - } - - return nil -} - -func (c *correct) deleteObject( - ctx context.Context, addr string, vector *payload.Object_Vector, -) error { - res, err := c.discoverer.GetClient(). - Do(grpc.WithGRPCMethod(ctx, deleteMethod), addr, func(ctx context.Context, conn *grpc.ClientConn, copts ...grpc.CallOption) (any, error) { - return vald.NewRemoveClient(conn).Remove(ctx, &payload.Remove_Request{ - Id: &payload.Object_ID{ - Id: vector.GetId(), - }, - }, copts...) 
- }) - if err != nil { - return err - } - - if v, ok := res.(*payload.Object_Location); ok { - log.Infof("vector successfully deleted. address: %s, uuid: %v", addr, v.GetUuid()) - } - - return nil +func (c *correct) PreStop(_ context.Context) error { + log.Info("removing persistent cache files...") + return c.checkedList.Close(true) } -// loadAgentIndexInfo loads the index info of each agent and sort them by the number of indexes -// then append the result to c.sortedByIndexCntAddrs. -// This sort is required because we want to process the agents with the least number of indexes first -// for performance to filter out the agent as early as possible from broadcast in checkConsistency function. -func (c *correct) loadAgentIndexInfo(ctx context.Context) (err error) { - var u, ucu uint32 - var infoMap sync.Map[string, *payload.Info_Index_Count] - err = c.discoverer.GetClient().RangeConcurrent(ctx, len(c.discoverer.GetAddrs(ctx)), - func(ctx context.Context, - addr string, conn *grpc.ClientConn, copts ...grpc.CallOption, - ) (err error) { - select { - case <-ctx.Done(): - return nil - default: - info, err := vald.NewValdClient(conn).IndexInfo(ctx, new(payload.Empty), copts...) 
- if err != nil { - log.Warnf("an error occurred while calling IndexInfo of %s: %s", addr, err) - return nil - } - infoMap.Store(addr, info) - atomic.AddUint32(&u, info.GetStored()) - atomic.AddUint32(&ucu, info.GetUncommitted()) - } - return nil - }) - if err != nil { - return err - } - atomic.StoreUint32(&c.uuidsCount, atomic.LoadUint32(&u)) - atomic.StoreUint32(&c.uncommittedUUIDsCount, atomic.LoadUint32(&ucu)) - - type indexInfo struct { - stored int - addr string - } - - var infos []indexInfo - infoMap.Range(func(addr string, info *payload.Info_Index_Count) bool { - log.Infof("index info: addr(%s), stored(%d), uncommitted(%d)", addr, info.GetStored(), info.GetUncommitted()) - - infos = append(infos, indexInfo{ - addr: addr, - stored: int(info.GetStored() + info.GetUncommitted()), - }) - return true - }) - - slices.SortFunc(infos, func(i, j indexInfo) int { - return cmp.Compare(i.stored, j.stored) - }) - for _, info := range infos { - c.sortedByIndexCntAddrs = append(c.sortedByIndexCntAddrs, info.addr) - } - log.Infof("processing order of agents: %v", c.sortedByIndexCntAddrs) - return nil +func (c *correct) NumberOfCheckedIndex() uint64 { + return c.checkedIndexCount.Load() } -func embedTime(ctx context.Context) context.Context { - v := ctx.Value(correctionStartTimeKey) - if _, ok := v.(time.Time); ok { - return ctx - } - return context.WithValue(ctx, correctionStartTimeKey, time.Now()) +func (c *correct) NumberOfCorrectedOldIndex() uint64 { + return c.correctedOldIndexCount.Load() } -func correctionStartTime(ctx context.Context) (time.Time, error) { - v := ctx.Value(correctionStartTimeKey) - if t, ok := v.(time.Time); ok { - return t, nil - } - return time.Time{}, fmt.Errorf("timeKey is not embedded in context") +func (c *correct) NumberOfCorrectedReplication() uint64 { + return c.correctedReplicationCount.Load() } diff --git a/pkg/index/job/correction/service/options.go b/pkg/index/job/correction/service/options.go index 3913271e6b..d094a28312 100644 --- 
a/pkg/index/job/correction/service/options.go +++ b/pkg/index/job/correction/service/options.go @@ -15,15 +15,27 @@ package service import ( "github.com/vdaas/vald/internal/client/v1/client/discoverer" + "github.com/vdaas/vald/internal/client/v1/client/vald" "github.com/vdaas/vald/internal/errors" + "github.com/vdaas/vald/internal/sync/errgroup" ) // Option represents the functional option for index corrector. type Option func(*correct) error var defaultOpts = []Option{ - WithStreamListConcurrency(200), //nolint:gomnd - WithKvsAsyncWriteConcurrency(2048), //nolint:gomnd + WithStreamListConcurrency(200), //nolint:gomnd + WithErrGroup(errgroup.Get()), +} + +// WithErrGroup returns Option that set errgroup. +func WithErrGroup(eg errgroup.Group) Option { + return func(c *correct) error { + if eg != nil { + c.eg = eg + } + return nil + } } // WithIndexReplica returns Option that sets index replica. @@ -48,24 +60,24 @@ func WithDiscoverer(client discoverer.Client) Option { } } -// WithStreamListConcurrency returns Option that sets concurrency for StreamList field value. -func WithStreamListConcurrency(num int) Option { +// WithGateway returns Option that sets discoverer client. +func WithGateway(client vald.Client) Option { return func(c *correct) error { - if num <= 0 { - return errors.NewErrInvalidOption("streamListConcurrency", num) + if client == nil { + return errors.NewErrCriticalOption("gateway", client) } - c.streamListConcurrency = num + c.gateway = client return nil } } -// WithKvsAsyncWriteConcurrency returns Option that sets concurrency for kvs async write. -func WithKvsAsyncWriteConcurrency(num int) Option { +// WithStreamListConcurrency returns Option that sets concurrency for StreamList field value. 
+func WithStreamListConcurrency(num int) Option { return func(c *correct) error { if num <= 0 { - return errors.NewErrInvalidOption("kvsAsyncWriteConcurrency", num) + return errors.NewErrInvalidOption("streamListConcurrency", num) } - c.bboltAsyncWriteConcurrency = num + c.streamListConcurrency = num return nil } } diff --git a/pkg/index/job/correction/usecase/corrector.go b/pkg/index/job/correction/usecase/corrector.go index ea82fbe9bd..90c3e2bc68 100644 --- a/pkg/index/job/correction/usecase/corrector.go +++ b/pkg/index/job/correction/usecase/corrector.go @@ -16,10 +16,12 @@ package usecase import ( "context" "os" + "slices" "syscall" "time" "github.com/vdaas/vald/internal/client/v1/client/discoverer" + "github.com/vdaas/vald/internal/client/v1/client/vald" iconf "github.com/vdaas/vald/internal/config" "github.com/vdaas/vald/internal/errors" "github.com/vdaas/vald/internal/log" @@ -47,6 +49,18 @@ type run struct { func New(cfg *config.Data) (r runner.Runner, err error) { eg := errgroup.Get() + gOpts, err := cfg.Corrector.Gateway.Opts() + if err != nil { + return nil, err + } + // skipcq: CRT-D0001 + gOpts = append(gOpts, grpc.WithErrGroup(eg)) + + gateway, err := vald.New(vald.WithClient(grpc.New(gOpts...))) + if err != nil { + return nil, err + } + dOpts, err := cfg.Corrector.Discoverer.Client.Opts() if err != nil { return nil, err @@ -73,10 +87,7 @@ func New(cfg *config.Data) (r runner.Runner, err error) { discoverer.WithOptions(acOpts...), discoverer.WithNodeName(cfg.Corrector.NodeName), discoverer.WithOnDiscoverFunc(func(ctx context.Context, c discoverer.Client, addrs []string) error { - last := len(addrs) - 1 - for i := 0; i < len(addrs)/2; i++ { - addrs[i], addrs[last-i] = addrs[last-i], addrs[i] - } + slices.Reverse(addrs) return nil }), ) @@ -103,8 +114,8 @@ func New(cfg *config.Data) (r runner.Runner, err error) { corrector, err := service.New( service.WithDiscoverer(discoverer), + service.WithGateway(gateway), 
service.WithIndexReplica(cfg.Corrector.IndexReplica), - service.WithKvsAsyncWriteConcurrency(cfg.Corrector.KvsAsyncWriteConcurrency), service.WithStreamListConcurrency(cfg.Corrector.StreamListConcurrency), ) if err != nil { diff --git a/rust/Cargo.lock b/rust/Cargo.lock index 92d78da00b..e9f1917e8d 100644 --- a/rust/Cargo.lock +++ b/rust/Cargo.lock @@ -674,9 +674,9 @@ checksum = "8b870d8c151b6f2fb93e84a13146138f05d02ed11c7e7c54f8826aaaf7c9f184" [[package]] name = "ppv-lite86" -version = "0.2.18" +version = "0.2.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dee4364d9f3b902ef14fab8a1ddffb783a1cb6b4bba3bfc1fa3922732c7de97f" +checksum = "77957b295656769bb8ad2b6a6b09d897d94f05c41b069aede1fcdaa675eaea04" dependencies = [ "zerocopy", ] @@ -1171,11 +1171,11 @@ checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" [[package]] name = "winapi-util" -version = "0.1.8" +version = "0.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4d4cc384e1e73b93bafa6fb4f1df8c41695c8a91cf9c4c64358067d15a7b6c6b" +checksum = "cf221c93e13a30d793f7645a0e7762c55d169dbb0a49671918a2319d289b10bb" dependencies = [ - "windows-sys 0.52.0", + "windows-sys 0.59.0", ] [[package]] @@ -1196,6 +1196,15 @@ dependencies = [ "windows-targets 0.52.6", ] +[[package]] +name = "windows-sys" +version = "0.59.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" +dependencies = [ + "windows-targets 0.52.6", +] + [[package]] name = "windows-targets" version = "0.48.5" @@ -1319,9 +1328,9 @@ checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" [[package]] name = "zerocopy" -version = "0.6.6" +version = "0.7.35" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "854e949ac82d619ee9a14c66a1b674ac730422372ccb759ce0c39cabcf2bf8e6" +checksum = 
"1b9b4fd18abc82b8136838da5d50bae7bdea537c574d8dc1a34ed098d6c166f0" dependencies = [ "byteorder", "zerocopy-derive", @@ -1329,9 +1338,9 @@ dependencies = [ [[package]] name = "zerocopy-derive" -version = "0.6.6" +version = "0.7.35" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "125139de3f6b9d625c39e2efdd73d41bdac468ccd556556440e322be0e1bbd91" +checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e" dependencies = [ "proc-macro2", "quote", diff --git a/versions/CMAKE_VERSION b/versions/CMAKE_VERSION index 72bde0ab2a..aaa0fde70b 100644 --- a/versions/CMAKE_VERSION +++ b/versions/CMAKE_VERSION @@ -1 +1 @@ -3.30.1 +3.30.2 diff --git a/versions/PROMETHEUS_STACK_VERSION b/versions/PROMETHEUS_STACK_VERSION index f71a9b549a..da2713f830 100644 --- a/versions/PROMETHEUS_STACK_VERSION +++ b/versions/PROMETHEUS_STACK_VERSION @@ -1 +1 @@ -61.6.1 +61.7.0 diff --git a/versions/actions/ACTIONS_UPLOAD_ARTIFACT b/versions/actions/ACTIONS_UPLOAD_ARTIFACT index eda862a98c..e198586e42 100644 --- a/versions/actions/ACTIONS_UPLOAD_ARTIFACT +++ b/versions/actions/ACTIONS_UPLOAD_ARTIFACT @@ -1 +1 @@ -4.3.4 +4.3.5