Skip to content

Commit

Permalink
Retry registration timeout fix (#6136)
Browse files Browse the repository at this point in the history
## Motivation
Fix an issue #6035



Co-authored-by: ConvallariaMaj <[email protected]>
  • Loading branch information
ConvallariaMaj and ConvallariaMaj committed Aug 15, 2024
1 parent 1fb28c8 commit 5bedb4e
Show file tree
Hide file tree
Showing 8 changed files with 107 additions and 40 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ such and the node will continue to scan new ATXs for their validity.
### Features

### Improvements
* [#6035](https://github.com/spacemeshos/go-spacemesh/issues/6035) Fixed an issue where the node retried registering for the PoET round
only for 15-20 minutes instead of continuing until the start of the round

## Release v1.6.6-hotfix1

Expand Down
18 changes: 12 additions & 6 deletions activation/activation.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,11 +39,18 @@ var (

// PoetConfig is the configuration to interact with the poet server.
type PoetConfig struct {
PhaseShift time.Duration `mapstructure:"phase-shift"`
CycleGap time.Duration `mapstructure:"cycle-gap"`
GracePeriod time.Duration `mapstructure:"grace-period"`
RequestTimeout time.Duration `mapstructure:"poet-request-timeout"`
RequestRetryDelay time.Duration `mapstructure:"retry-delay"`
// Offset from the epoch start when the poet round starts
PhaseShift time.Duration `mapstructure:"phase-shift"`
// CycleGap gives the duration between the end of a PoET round and the start of the next
CycleGap time.Duration `mapstructure:"cycle-gap"`
// GracePeriod defines the time before the start of the next PoET round until the node
// waits before building its NiPoST challenge. Shorter durations allow the node to
// possibly pick a better positioning ATX, but come with the risk that the node might
// not be able to validate that ATX and has to fall back to using its own previous ATX.
GracePeriod time.Duration `mapstructure:"grace-period"`
RequestTimeout time.Duration `mapstructure:"poet-request-timeout"`
RequestRetryDelay time.Duration `mapstructure:"retry-delay"`
// Period to find positioning ATX. Must be less, than GracePeriod
PositioningATXSelectionTimeout time.Duration `mapstructure:"positioning-atx-selection-timeout"`
CertifierInfoCacheTTL time.Duration `mapstructure:"certifier-info-cache-ttl"`
PowParamsCacheTTL time.Duration `mapstructure:"pow-params-cache-ttl"`
Expand Down Expand Up @@ -559,7 +566,6 @@ func (b *Builder) BuildNIPostChallenge(ctx context.Context, nodeID types.NodeID)
case <-time.After(time.Until(wait)):
}
}

if b.poetCfg.PositioningATXSelectionTimeout > 0 {
var cancel context.CancelFunc

Expand Down
5 changes: 1 addition & 4 deletions activation/nipost.go
Original file line number Diff line number Diff line change
Expand Up @@ -366,10 +366,7 @@ func (nb *NIPostBuilder) submitPoetChallenge(

logger.Debug("submitting challenge to poet proving service")

submitCtx, cancel := withConditionalTimeout(ctx, nb.poetCfg.RequestTimeout)
defer cancel()

round, err := client.Submit(submitCtx, deadline, prefix, challenge, signature, nodeID)
round, err := client.Submit(ctx, deadline, prefix, challenge, signature, nodeID)
if err != nil {
return nipost.PoETRegistration{},
&PoetSvcUnstableError{msg: "failed to submit challenge to poet service", source: err}
Expand Down
71 changes: 47 additions & 24 deletions activation/poet.go
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@ import (
"errors"
"fmt"
"io"
"math"
"math/rand/v2"
"net/http"
"net/url"
"sync"
Expand Down Expand Up @@ -69,10 +71,11 @@ type PoetClient interface {

// HTTPPoetClient implements PoetProvingServiceClient interface.
type HTTPPoetClient struct {
id []byte
baseURL *url.URL
client *retryablehttp.Client
logger *zap.Logger
id []byte
baseURL *url.URL
client *retryablehttp.Client
submitChallengeClient *retryablehttp.Client
logger *zap.Logger
}

func checkRetry(ctx context.Context, resp *http.Response, err error) (bool, error) {
Expand Down Expand Up @@ -126,6 +129,13 @@ func WithLogger(logger *zap.Logger) PoetClientOpts {
}
}

func customLinearJitterBackoff(min, max time.Duration, _ int, _ *http.Response) time.Duration {
if max <= min {
return min
}
return min + rand.N(max-min)
}

// NewHTTPPoetClient returns new instance of HTTPPoetClient connecting to the specified url.
func NewHTTPPoetClient(server types.PoetServer, cfg PoetConfig, opts ...PoetClientOpts) (*HTTPPoetClient, error) {
client := &retryablehttp.Client{
Expand All @@ -136,6 +146,14 @@ func NewHTTPPoetClient(server types.PoetServer, cfg PoetConfig, opts ...PoetClie
CheckRetry: checkRetry,
}

submitChallengeClient := &retryablehttp.Client{
RetryMax: math.MaxInt,
RetryWaitMin: cfg.RequestRetryDelay,
RetryWaitMax: 2 * cfg.RequestRetryDelay,
Backoff: customLinearJitterBackoff,
CheckRetry: retryablehttp.DefaultRetryPolicy,
}

baseURL, err := url.Parse(server.Address)
if err != nil {
return nil, fmt.Errorf("parsing address: %w", err)
Expand All @@ -145,11 +163,13 @@ func NewHTTPPoetClient(server types.PoetServer, cfg PoetConfig, opts ...PoetClie
}

poetClient := &HTTPPoetClient{
id: server.Pubkey.Bytes(),
baseURL: baseURL,
client: client,
logger: zap.NewNop(),
id: server.Pubkey.Bytes(),
baseURL: baseURL,
client: client,
submitChallengeClient: submitChallengeClient,
logger: zap.NewNop(),
}

for _, opt := range opts {
opt(poetClient)
}
Expand All @@ -158,11 +178,11 @@ func NewHTTPPoetClient(server types.PoetServer, cfg PoetConfig, opts ...PoetClie
"created poet client",
zap.Stringer("url", baseURL),
zap.Binary("pubkey", server.Pubkey.Bytes()),
zap.Int("max retries", client.RetryMax),
zap.Int("default max retries", client.RetryMax),
zap.Int("submit challenge max retries", submitChallengeClient.RetryMax),
zap.Duration("min retry wait", client.RetryWaitMin),
zap.Duration("max retry wait", client.RetryWaitMax),
)

return poetClient, nil
}

Expand All @@ -176,7 +196,7 @@ func (c *HTTPPoetClient) Address() string {

func (c *HTTPPoetClient) PowParams(ctx context.Context) (*PoetPowParams, error) {
resBody := rpcapi.PowParamsResponse{}
if err := c.req(ctx, http.MethodGet, "/v1/pow_params", nil, &resBody); err != nil {
if err := c.req(ctx, http.MethodGet, "/v1/pow_params", nil, &resBody, c.client); err != nil {
return nil, fmt.Errorf("querying PoW params: %w", err)
}

Expand Down Expand Up @@ -228,7 +248,7 @@ func (c *HTTPPoetClient) Submit(
}

resBody := rpcapi.SubmitResponse{}
if err := c.req(ctx, http.MethodPost, "/v1/submit", &request, &resBody); err != nil {
if err := c.req(ctx, http.MethodPost, "/v1/submit", &request, &resBody, c.submitChallengeClient); err != nil {
return nil, fmt.Errorf("submitting challenge: %w", err)
}
roundEnd := time.Time{}
Expand All @@ -241,7 +261,7 @@ func (c *HTTPPoetClient) Submit(

func (c *HTTPPoetClient) Info(ctx context.Context) (*types.PoetInfo, error) {
resBody := rpcapi.InfoResponse{}
if err := c.req(ctx, http.MethodGet, "/v1/info", nil, &resBody); err != nil {
if err := c.req(ctx, http.MethodGet, "/v1/info", nil, &resBody, c.client); err != nil {
return nil, fmt.Errorf("getting poet info: %w", err)
}

Expand All @@ -268,7 +288,7 @@ func (c *HTTPPoetClient) Info(ctx context.Context) (*types.PoetInfo, error) {
// Proof implements PoetProvingServiceClient.
func (c *HTTPPoetClient) Proof(ctx context.Context, roundID string) (*types.PoetProofMessage, []types.Hash32, error) {
resBody := rpcapi.ProofResponse{}
if err := c.req(ctx, http.MethodGet, fmt.Sprintf("/v1/proofs/%s", roundID), nil, &resBody); err != nil {
if err := c.req(ctx, http.MethodGet, fmt.Sprintf("/v1/proofs/%s", roundID), nil, &resBody, c.client); err != nil {
return nil, nil, fmt.Errorf("getting proof: %w", err)
}

Expand Down Expand Up @@ -300,7 +320,12 @@ func (c *HTTPPoetClient) Proof(ctx context.Context, roundID string) (*types.Poet
return &proof, members, nil
}

func (c *HTTPPoetClient) req(ctx context.Context, method, path string, reqBody, resBody proto.Message) error {
func (c *HTTPPoetClient) req(
ctx context.Context,
method, path string,
reqBody, resBody proto.Message,
client *retryablehttp.Client,
) error {
jsonReqBody, err := protojson.Marshal(reqBody)
if err != nil {
return fmt.Errorf("marshaling request body: %w", err)
Expand All @@ -312,7 +337,7 @@ func (c *HTTPPoetClient) req(ctx context.Context, method, path string, reqBody,
}
req.Header.Set("Content-Type", "application/json")

res, err := c.client.Do(req)
res, err := client.Do(req)
if err != nil {
return fmt.Errorf("doing request: %w", err)
}
Expand Down Expand Up @@ -343,7 +368,6 @@ func (c *HTTPPoetClient) req(ctx context.Context, method, path string, reqBody,
return fmt.Errorf("decoding response body to proto: %w", err)
}
}

return nil
}

Expand Down Expand Up @@ -371,9 +395,10 @@ func (c *cachedData[T]) get(init func() (T, error)) (T, error) {
// poetService is a higher-level interface to communicate with a PoET service.
// It wraps the HTTP client, adding additional functionality.
type poetService struct {
db poetDbAPI
logger *zap.Logger
client PoetClient
db poetDbAPI
logger *zap.Logger
client PoetClient

requestTimeout time.Duration

// Used to avoid concurrent requests for proof.
Expand Down Expand Up @@ -567,9 +592,7 @@ func (c *poetService) Submit(

logger.Debug("submitting challenge to poet proving service")

submitCtx, cancel := withConditionalTimeout(ctx, c.requestTimeout)
defer cancel()
round, err := c.client.Submit(submitCtx, deadline, prefix, challenge, signature, nodeID, *auth)
round, err := c.client.Submit(ctx, deadline, prefix, challenge, signature, nodeID, *auth)
switch {
case err == nil:
return round, nil
Expand All @@ -579,7 +602,7 @@ func (c *poetService) Submit(
if err != nil {
return nil, fmt.Errorf("authorizing: %w", err)
}
return c.client.Submit(submitCtx, deadline, prefix, challenge, signature, nodeID, *auth)
return c.client.Submit(ctx, deadline, prefix, challenge, signature, nodeID, *auth)
}
return nil, fmt.Errorf("submitting challenge: %w", err)
}
Expand Down
35 changes: 35 additions & 0 deletions activation/poet_client_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,41 @@ func Test_HTTPPoetClient_Submit(t *testing.T) {
require.NoError(t, err)
}

func Test_HTTPPoetClient_SubmitTillCtxCanceled(t *testing.T) {
t.Parallel()
ctx, cancel := context.WithCancel(context.Background())
defer cancel()
tries := 0
mux := http.NewServeMux()
mux.HandleFunc("POST /v1/submit", func(w http.ResponseWriter, r *http.Request) {
tries += 1
if tries == 3 {
cancel()
}
http.Error(w, "some_error", http.StatusInternalServerError)
})
ts := httptest.NewServer(mux)
defer ts.Close()
cfg := server.DefaultRoundConfig()
client, err := NewHTTPPoetClient(types.PoetServer{Address: ts.URL}, PoetConfig{
PhaseShift: cfg.PhaseShift,
CycleGap: cfg.CycleGap,
MaxRequestRetries: 1,
}, withCustomHttpClient(ts.Client()))
require.NoError(t, err)
_, err = client.Submit(
ctx,
time.Time{},
nil,
nil,
types.EmptyEdSignature,
types.NodeID{},
PoetAuth{},
)
require.ErrorIs(t, err, context.Canceled)
require.Equal(t, 3, tries)
}

func Test_HTTPPoetClient_Address(t *testing.T) {
t.Run("with scheme", func(t *testing.T) {
t.Parallel()
Expand Down
8 changes: 4 additions & 4 deletions cmd/root.go
Original file line number Diff line number Diff line change
Expand Up @@ -317,13 +317,13 @@ func AddFlags(flagSet *pflag.FlagSet, cfg *config.Config) (configPath *string) {
/**======================== PoET Flags ========================== **/

flagSet.DurationVar(&cfg.POET.PhaseShift, "phase-shift",
cfg.POET.PhaseShift, "phase shift of poet server")
cfg.POET.PhaseShift, "phase shift of poet server: duration after epoch start, at which poet round starts")
flagSet.DurationVar(&cfg.POET.CycleGap, "cycle-gap",
cfg.POET.CycleGap, "cycle gap of poet server")
cfg.POET.CycleGap, "cycle gap of poet server: gap between poet rounds")
flagSet.DurationVar(&cfg.POET.GracePeriod, "grace-period",
cfg.POET.GracePeriod, "time before PoET round starts when the node builds and submits a challenge")
cfg.POET.GracePeriod, "time before poet round starts, when the node builds and submits a challenge")
flagSet.DurationVar(&cfg.POET.RequestTimeout, "poet-request-timeout",
cfg.POET.RequestTimeout, "timeout for poet requests")
cfg.POET.RequestTimeout, "default timeout for poet requests")

/**======================== bootstrap data updater Flags ========================== **/

Expand Down
4 changes: 3 additions & 1 deletion config/presets/fastnet.go
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,9 @@ func fastnet() config.Config {
conf.POET.GracePeriod = 10 * time.Second
conf.POET.CycleGap = 30 * time.Second
conf.POET.PhaseShift = 30 * time.Second
conf.POET.RequestTimeout = 12 * time.Second // RequestRetryDelay * 2 * MaxRequestRetries*(MaxRequestRetries+1)/2
conf.POET.PositioningATXSelectionTimeout = 8 * time.Second
// RequestTimeout = RequestRetryDelay * 2 * MaxRequestRetries*(MaxRequestRetries+1)/2
conf.POET.RequestTimeout = 12 * time.Second
conf.POET.RequestRetryDelay = 1 * time.Second
conf.POET.MaxRequestRetries = 3
conf.POET.CertifierInfoCacheTTL = time.Minute
Expand Down
4 changes: 3 additions & 1 deletion config/presets/standalone.go
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,9 @@ func standalone() config.Config {
conf.POET.GracePeriod = 12 * time.Second
conf.POET.CycleGap = 30 * time.Second
conf.POET.PhaseShift = 30 * time.Second
conf.POET.RequestTimeout = 12 * time.Second // RequestRetryDelay * 2 * MaxRequestRetries*(MaxRequestRetries+1)/2
conf.POET.PositioningATXSelectionTimeout = 8 * time.Second
// RequestTimeout = RequestRetryDelay * 2 * MaxRequestRetries*(MaxRequestRetries+1)/2
conf.POET.RequestTimeout = 12 * time.Second
conf.POET.RequestRetryDelay = 1 * time.Second
conf.POET.MaxRequestRetries = 3
conf.POET.CertifierInfoCacheTTL = time.Minute
Expand Down

0 comments on commit 5bedb4e

Please sign in to comment.