Skip to content

Commit

Permalink
Always leave election when stopping (#40)
Browse files Browse the repository at this point in the history
* feat(Makefile): add a preflight check when running the dev env

* chore: add more explicit make targets for run, upgrade docs

* fix(readiness): use structured logs when logging a healthcheck retry

* fix(notifier): do not retry on cancel

* fix(cmd): improve logging

* fix(cmd): honor defers and always leave election when stopping

* chore(cmd): lower lease-duration to 10 seconds
  • Loading branch information
jlevesy authored Jun 7, 2023
1 parent 7c9e0f8 commit 466c337
Show file tree
Hide file tree
Showing 7 changed files with 94 additions and 23 deletions.
20 changes: 17 additions & 3 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,13 @@ unit_test:
### Dev

.PHONY: run
run: create_cluster install_agent_example
run: run_agent_example

.PHONY: run_agent_example
run_agent_example: check_dev_dependencies create_cluster install_agent_example

.PHONY: run_proxy_example
run_proxy_example: check_dev_dependencies create_cluster install_proxy_example

.PHONY: create_cluster
create_cluster: ## run a local k3d cluster
Expand All @@ -36,8 +42,8 @@ install_agent_example: install_storage
-f ./example/k8s/agent-values.yaml \
prometheus-elector-dev ./helm | KO_DOCKER_REPO=prometheus-elector-registry.localhost:5000 ko apply -B -t dev -f -

.PHONY: install_ha_example
install_ha_example:
.PHONY: install_proxy_example
install_proxy_example:
helm template \
--set elector.image.devRef=ko://github.com/jlevesy/prometheus-elector/cmd \
--set prometheus.image.repository=jlevesy/prometheus \
Expand All @@ -63,6 +69,14 @@ run_agent_local: dist
dist:
mkdir -p dist

# Fail fast with an actionable message when any tool required by the dev
# environment is missing, instead of failing midway through a run target.
.PHONY: check_dev_dependencies
check_dev_dependencies: ## Checks that all the necessary dependencies for the dev env are present
	@helm version >/dev/null 2>&1 || (echo "ERROR: helm is required."; exit 1)
	@k3d version >/dev/null 2>&1 || (echo "ERROR: k3d is required."; exit 1)
	@kubectl version --client >/dev/null 2>&1 || (echo "ERROR: kubectl is required."; exit 1)
	@ko version >/dev/null 2>&1 || (echo "ERROR: google/ko is required."; exit 1)
	@grep -Fq "prometheus-elector-registry.localhost" /etc/hosts || (echo "ERROR: please add the following line 'prometheus-elector-registry.localhost 127.0.0.1' to your /etc/hosts file"; exit 1)

### CI

.PHONY: ci_release
Expand Down
9 changes: 7 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,9 @@ You can find the necessary configuration for this use case in the [example direc

#### Running an Example of this Setup

You need [ko](https://github.com/ko-build/ko), `kubectl` and [k3d](https://github.com/k3d-io/k3d) and docker installed, from there run `make create_cluster install_agent_example`.
You need [ko](https://github.com/ko-build/ko), `kubectl`, [k3d](https://github.com/k3d-io/k3d), `docker` and `helm` installed. You also need to make sure that `prometheus-elector-registry.localhost` resolves to `127.0.0.1` by adding an entry in your `/etc/hosts`.

You can then run `make run_agent_example`.

This command:

Expand All @@ -51,7 +53,10 @@ You can find the necessary configuration for this use case in the [example direc

#### Running an Example of this setup

You need [ko](https://github.com/ko-build/ko), `kubectl` and [k3d](https://github.com/k3d-io/k3d) and docker installed, from there run `make create_cluster install_ha_example`.

You need [ko](https://github.com/ko-build/ko), `kubectl`, [k3d](https://github.com/k3d-io/k3d), `docker` and `helm` installed. You also need to make sure that `prometheus-elector-registry.localhost` resolves to `127.0.0.1` by adding an entry in your `/etc/hosts`.

Then you can run `make run_proxy_example`.

This command:

Expand Down
4 changes: 2 additions & 2 deletions cmd/config.go
Original file line number Diff line number Diff line change
Expand Up @@ -178,8 +178,8 @@ func (c *cliConfig) setupFlags() {

flag.StringVar(&c.leaseName, "lease-name", "", "Name of lease resource")
flag.StringVar(&c.leaseNamespace, "lease-namespace", "", "Name of lease resource namespace")
flag.DurationVar(&c.leaseDuration, "lease-duration", 15*time.Second, "Duration of a lease, client wait the full duration of a lease before trying to take it over")
flag.DurationVar(&c.leaseRenewDeadline, "lease-renew-deadline", 10*time.Second, "Maximum duration spent trying to renew the lease")
flag.DurationVar(&c.leaseDuration, "lease-duration", 10*time.Second, "Duration of a lease, client wait the full duration of a lease before trying to take it over")
flag.DurationVar(&c.leaseRenewDeadline, "lease-renew-deadline", 8*time.Second, "Maximum duration spent trying to renew the lease")
flag.DurationVar(&c.leaseRetryPeriod, "lease-retry-period", 2*time.Second, "Delay between two attempts of taking/renewing the lease")

flag.StringVar(&c.kubeConfigPath, "kubeconfig", "", "Path to a kubeconfig. Only required if out-of-cluster.")
Expand Down
48 changes: 33 additions & 15 deletions cmd/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,7 @@ import (
"os"
"os/signal"
"path/filepath"
"syscall"
"time"

"golang.org/x/sync/errgroup"
Expand All @@ -27,36 +28,42 @@ import (
)

func main() {
os.Exit(run())
}

func run() int {
var (
cfg = newCLIConfig()
ctx, cancel = signal.NotifyContext(context.Background(), os.Interrupt)
ctx, cancel = signal.NotifyContext(context.Background(), syscall.SIGTERM, syscall.SIGINT)

promReady = make(chan struct{})
grp, grpCtx = errgroup.WithContext(ctx)
)

defer cancel()
cfg.setupFlags()

flag.Parse()

if err := cfg.validateInitConfig(); err != nil {
klog.Fatal("Invalid init config: ", err)
klog.ErrorS(err, "Invalid init config")
return 1
}

reconciller := config.NewReconciller(cfg.configPath, cfg.outputPath)

if err := reconciller.Reconcile(ctx); err != nil {
klog.Fatal("Can't perform an initial sync: ", err)
klog.ErrorS(err, "Can't perform an initial sync")
return 1
}

if cfg.init {
klog.Info("Running in init mode, exiting")
return
return 0
}

if err := cfg.validateRuntimeConfig(); err != nil {
klog.Fatal("Invalid election config: ", err)
klog.ErrorS(err, "Invalid election config")
return 1
}

metricsRegistry := prometheus.NewRegistry()
Expand All @@ -82,12 +89,14 @@ func main() {

k8sConfig, err := clientcmd.BuildConfigFromFlags("", cfg.kubeConfigPath)
if err != nil {
klog.Fatal("Unable to build kube client configuration: ", err)
klog.ErrorS(err, "Unable to build kube client configuration")
return 1
}

k8sClient, err := kubernetes.NewForConfig(k8sConfig)
if err != nil {
klog.Fatal("Can't build the k8s client: ", err)
klog.ErrorS(err, "Can't build the Kubernetes client")
return 1
}

elector, err := election.New(
Expand Down Expand Up @@ -131,24 +140,30 @@ func main() {
metricsRegistry,
)
if err != nil {
klog.Fatal("Can't setup election", err)
klog.ErrorS(err, "Can't setup the election")
return 1
}

// Always stop the election.
// Always leave the election.
defer func() {
stopCtx, cancel := context.WithTimeout(context.Background(), 15*time.Second)
defer cancel()

klog.Info("Graceful shutdown, leaving the election")

if err := elector.Stop(stopCtx); err != nil && errors.Is(err, election.ErrNotRunning) {
klog.ErrorS(err, "unable to stop the elector")
klog.ErrorS(err, "Unable to leave the election")
}

klog.Info("Graceful shutdown, left the election")
}()

reconciller.SetLeaderChecker(elector.Status())

watcher, err := watcher.New(filepath.Dir(cfg.configPath), reconciller, notifier)
if err != nil {
klog.Fatal("Can't create the watcher: ", err)
klog.ErrorS(err, "Can't create the watcher")
return 1
}
defer watcher.Close()

Expand All @@ -166,7 +181,8 @@ func main() {
)

if err != nil {
klog.Fatal("Can't set up API server", err)
klog.ErrorS(err, "Can't set up the API server")
return 1
}

var readinessWaiter readiness.Waiter = readiness.NoopWaiter{}
Expand Down Expand Up @@ -248,8 +264,10 @@ func main() {
grp.Go(func() error { return apiServer.Serve(grpCtx) })

if err := grp.Wait(); err != nil {
klog.Fatal("Error while running prometheus-elector, reason: ", err)
klog.ErrorS(err, "prometheus-elector has reported an error while running")
return 1
}

klog.Info("prometheus-elector exited successfully")
klog.Info("prometheus-elector is gracefully stopping")
return 0
}
29 changes: 29 additions & 0 deletions notifier/notifier_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -93,3 +93,32 @@ func TestHTTPNotifierExhaustRetries(t *testing.T) {

assert.Equal(t, 10, totalReceived)
}

// TestHTTPNotifierNoRetryOnContextCanceled verifies that the retry notifier
// gives up immediately and reports no error when the context is already
// canceled: no HTTP request must ever reach the server, and no retry loop
// must run.
func TestHTTPNotifierNoRetryOnContextCanceled(t *testing.T) {
	var (
		totalReceived int
		ctx, cancel   = context.WithCancel(context.Background())
		srv           = httptest.NewServer(http.HandlerFunc(func(rw http.ResponseWriter, r *http.Request) {
			// testify convention is (t, expected, actual).
			require.Equal(t, http.MethodPost, r.Method)
			totalReceived++
			// Answer with a 5xx so that any request actually sent would
			// trigger the retry path — which must never happen here.
			rw.WriteHeader(http.StatusInternalServerError)
		}))
		// Named retryNotifier to avoid shadowing the notifier package.
		retryNotifier = notifier.WithRetry(
			notifier.NewHTTP(
				srv.URL,
				http.MethodPost,
				time.Second,
			),
			10,            // maxAttempts: would retry 10 times if cancellation were ignored.
			0*time.Second, // no delay between attempts, keeps the test fast.
		)
	)

	defer srv.Close()

	// Cancel before notifying: Notify must bail out without touching the server.
	cancel()
	err := retryNotifier.Notify(ctx)
	require.NoError(t, err)

	assert.Equal(t, 0, totalReceived)
}
5 changes: 5 additions & 0 deletions notifier/retry.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ package notifier

import (
"context"
"errors"
"fmt"
"time"

Expand Down Expand Up @@ -30,6 +31,10 @@ func (r *retryNotifier) Notify(ctx context.Context) error {
return nil
}

if errors.Is(err, context.Canceled) {
return nil
}

if j > 0 {
klog.ErrorS(err, "Failed to notify prometheus, will retry...", "attempt", r.maxAttempts-j, "maxAttempts", r.maxAttempts)
time.Sleep(r.delay)
Expand Down
2 changes: 1 addition & 1 deletion readiness/http.go
Original file line number Diff line number Diff line change
Expand Up @@ -65,7 +65,7 @@ func (w *httpWaiter) checkReadiness(ctx context.Context) (bool, error) {
}

if rsp.StatusCode != http.StatusOK {
klog.Error("Prometheus isn't ready yet", "status", rsp.StatusCode)
klog.InfoS("Prometheus isn't ready yet", "status", rsp.StatusCode)
return false, nil
}

Expand Down

0 comments on commit 466c337

Please sign in to comment.