Skip to content

Commit

Permalink
Merge pull request #796 from jnummelin/etcd-health-check
Browse files Browse the repository at this point in the history
Simplify etcd health check by trying to get a value from the kv store
  • Loading branch information
ncopa authored Mar 23, 2021
2 parents 768427a + cd39317 commit 8581c65
Show file tree
Hide file tree
Showing 4 changed files with 49 additions and 46 deletions.
28 changes: 28 additions & 0 deletions cmd/etcd.go
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ package cmd
import (
"context"
"fmt"
"time"

"github.com/sirupsen/logrus"
"github.com/spf13/cobra"
Expand All @@ -31,6 +32,8 @@ func init() {

etcdCmd.AddCommand(etcdLeaveCmd)
etcdCmd.AddCommand(etcdListCmd)
etcdCmd.AddCommand(etcdHealthCmd)

addPersistentFlags(etcdCmd)
}

Expand Down Expand Up @@ -123,3 +126,28 @@ var (
},
}
)

var (
	// etcdHealthCmd implements the "health" subcommand: it builds an etcd
	// client from the configured certificate directories, runs the client's
	// health check with a five second timeout and prints "etcd healthy" on
	// success. Any failure is returned to cobra as the command error.
	etcdHealthCmd = &cobra.Command{
		Use:   "health",
		Short: "Returns etcd cluster members health status",
		RunE: func(cmd *cobra.Command, args []string) error {
			etcdClient, err := etcd.NewClient(k0sVars.CertRootDir, k0sVars.EtcdCertDir)
			if err != nil {
				return fmt.Errorf("can't create etcd client: %w", err)
			}
			// Close the client when the command finishes; it was previously
			// left open.
			defer etcdClient.Close()

			// Named ctx (not context) so the context package is not shadowed.
			ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
			defer cancel()

			if err := etcdClient.Health(ctx); err != nil {
				return err
			}

			fmt.Println("etcd healthy")

			return nil
		},
	}
)
2 changes: 1 addition & 1 deletion pkg/component/controller/etcd.go
Original file line number Diff line number Diff line change
Expand Up @@ -257,8 +257,8 @@ func (e *Etcd) setupCerts() error {
// Healthy runs the etcd readiness check with a one second timeout and
// returns whatever error etcd.CheckEtcdReady reports (nil when it succeeds).
func (e *Etcd) Healthy() error {
	logrus.WithField("component", "etcd").Debug("checking etcd endpoint for health")

	ctx, cancel := context.WithTimeout(context.Background(), 1*time.Second)
	// The deferred cancel runs on every return path, so no additional
	// explicit cancel() call is needed after the check.
	defer cancel()

	return etcd.CheckEtcdReady(ctx, e.K0sVars.CertRootDir, e.K0sVars.EtcdCertDir)
}

Expand Down
22 changes: 19 additions & 3 deletions pkg/etcd/client.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,24 +23,26 @@ import (
"github.com/pkg/errors"

"go.etcd.io/etcd/clientv3"
"go.etcd.io/etcd/etcdserver/api/v3rpc/rpctypes"
"go.etcd.io/etcd/pkg/transport"
)

// Client is our internal helper to access some of the etcd APIs
type Client struct {
client *clientv3.Client
client *clientv3.Client
tlsInfo transport.TLSInfo
}

// NewClient creates new Client
func NewClient(certDir string, etcdCertDir string) (*Client, error) {
client := &Client{}
tlsInfo := transport.TLSInfo{
client.tlsInfo = transport.TLSInfo{
CertFile: filepath.Join(certDir, "apiserver-etcd-client.crt"),
KeyFile: filepath.Join(certDir, "apiserver-etcd-client.key"),
TrustedCAFile: filepath.Join(etcdCertDir, "ca.crt"),
}

tlsConfig, err := tlsInfo.ClientConfig()
tlsConfig, err := client.tlsInfo.ClientConfig()
if err != nil {
return nil, err
}
Expand Down Expand Up @@ -120,3 +122,17 @@ func (c *Client) DeleteMember(ctx context.Context, peerID uint64) error {
func (c *Client) Close() {
c.client.Close()
}

// Health returns an error if the etcd peer is not reported as healthy.
// The check reads the "health" key through the client and inspects only the
// resulting error; the value itself is discarded.
// ref: https://github.com/etcd-io/etcd/blob/3ead91ca3edf66112d56c453169343515bba71c3/etcdctl/ctlv3/command/ep_command.go#L89
func (c *Client) Health(ctx context.Context) error {
	_, getErr := c.client.Get(ctx, "health")

	switch getErr {
	case nil, rpctypes.ErrPermissionDenied:
		// permission denied is OK since proposal goes through consensus to get it
		return nil
	default:
		return getErr
	}
}
43 changes: 1 addition & 42 deletions pkg/etcd/health.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,8 @@ package etcd

import (
"context"
"net/http"
"net/url"
"path/filepath"
"time"

"github.com/sirupsen/logrus"
"go.etcd.io/etcd/pkg/transport"
)

// CheckEtcdReady returns nil if an etcd client can be created and its health check succeeds, and an error otherwise
Expand All @@ -18,42 +13,6 @@ func CheckEtcdReady(ctx context.Context, certDir string, etcdCertDir string) err
logrus.Errorf("failed to initialize etcd client: %v", err)
return err
}
memberList, err := c.client.MemberList(ctx)
if err != nil {
logrus.Errorf("failed to fetch etcd member list: %v\n", err)
return err
}

u, err := url.Parse(memberList.Members[0].ClientURLs[0])
if err != nil {
logrus.Errorf("cannot fetch health endpoint: %v\n", err)
return err
}

// the metrics endpoint was selected as a health endpoint in the official etcd docs: https://etcd.io/docs/v3.4.0/op-guide/monitoring/
u.Path = "/metrics"

tr, err := transport.NewTransport(transport.TLSInfo{
CertFile: filepath.Join(certDir, "apiserver-etcd-client.crt"),
KeyFile: filepath.Join(certDir, "apiserver-etcd-client.key"),
TrustedCAFile: filepath.Join(etcdCertDir, "ca.crt"),
}, 5*time.Second)
if err != nil {
logrus.Errorf("error encountered setting up healthcheck TLS config: %v\n", err)
}

resp, err := tr.RoundTrip(&http.Request{
Header: make(http.Header),
Method: http.MethodGet,
URL: u,
})
if err != nil {
logrus.Errorf("error accessing health endpoint: %v\n", err)
return err
}
if resp.StatusCode != http.StatusOK {
logrus.Printf("received unexpected status code from endpoint. expected %v, received %v", http.StatusOK, resp.StatusCode)
}

return nil
return c.Health(ctx)
}

0 comments on commit 8581c65

Please sign in to comment.