diff --git a/region/client.go b/region/client.go index bb6f1e3..2a289f0 100644 --- a/region/client.go +++ b/region/client.go @@ -65,25 +65,25 @@ var ( } // If a Java exception listed here is returned by HBase, the client should - // backoff and resend the RPC message to the same region and region server + // backoff and resend the RPC message to the same region and region server. // The value of exception should be contained in the stack trace. javaRetryableExceptions = map[string]string{ "org.apache.hadoop.hbase.CallQueueTooBigException": "", "org.apache.hadoop.hbase.exceptions.RegionOpeningException": "", - "org.apache.hadoop.hbase.ipc.ServerNotRunningYetException": "", "org.apache.hadoop.hbase.quotas.RpcThrottlingException": "", "org.apache.hadoop.hbase.RetryImmediatelyException": "", "org.apache.hadoop.hbase.RegionTooBusyException": "", + "org.apache.hadoop.hbase.PleaseHoldException": "", } - // javaServerExceptions is a map where all Java exceptions that signify - // the RPC should be sent again are listed (as keys). If a Java exception - // listed here is returned by HBase, the RegionClient will be closed and a new - // one should be established. + // If a Java exception listed here is returned by HBase, the RegionClient + // will be closed and a new one should be established. // The value of exception should be contained in the stack trace. 
javaServerExceptions = map[string]string{ "org.apache.hadoop.hbase.regionserver.RegionServerAbortedException": "", "org.apache.hadoop.hbase.regionserver.RegionServerStoppedException": "", + "org.apache.hadoop.hbase.exceptions.MasterStoppedException": "", + "org.apache.hadoop.hbase.ipc.ServerNotRunningYetException": "", } ) diff --git a/rpc.go b/rpc.go index b13e578..6b1d15e 100644 --- a/rpc.go +++ b/rpc.go @@ -91,6 +91,7 @@ func (c *client) SendRPC(rpc hrpc.Call) (msg proto.Message, err error) { }() backoff := backoffStart + serverErrorCount := 0 for { rc, err := c.getRegionAndClientForRPC(ctx, rpc) if err != nil { @@ -105,7 +106,20 @@ func (c *client) SendRPC(rpc hrpc.Call) (msg proto.Message, err error) { return msg, err } continue // retry - case region.ServerError, region.NotServingRegionError: + case region.ServerError: + // Retry ServerError immediately, as we want to fail over fast to + // another server. But if HBase keeps sending us ServerError, we + // should start to back off. We don't want to overwhelm HBase. + if serverErrorCount > 1 { + sp.AddEvent("retrySleep") + backoff, err = sleepAndIncreaseBackoff(ctx, backoff) + if err != nil { + return msg, err + } + } + serverErrorCount++ + continue // retry + case region.NotServingRegionError: continue // retry } return msg, err