From 67ed1212097987a4e61f832980ca73be6d976ad0 Mon Sep 17 00:00:00 2001 From: katelyn martin Date: Mon, 23 Sep 2024 17:51:13 -0400 Subject: [PATCH] fix(retry): Avoid panicking if responses come early (#3216) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `linkerd-http-retry` and `linkerd-retry` provide generic `tower` middleware to allow services to retry requests that fail. part of this middleware hinges on a `ReplayBody` that will lazily buffer request bodies' data for use in subsequent attempts. if a request fails, the retry middleware will attempt to send another request, assuming the original body was able to completely fit into this buffer. `ReplayBody` makes a subtle assummption, most succinctly stated in this excerpt of an internal method's documentation: ```rust // linkerd/http/retry/src/replay.rs impl ReplayBody { /// This panics if another clone has currently acquired the state, based on /// the assumption that a retry body will not be polled until the previous /// request has been dropped. fn acquire_state<'a>( state: &'a mut Option>, shared: &Mutex>>, ) -> &'a mut BodyState { // ... } } ``` this assumption is slightly at odds with the request/response lifecycle permitted within the HTTP/2 specification. see RFC 9113 ยง 8.1, "_HTTP Message Framing_" (emphasis added): > .. > An HTTP request/response exchange fully consumes a single stream. A > request starts with the HEADERS frame that puts the stream into the > "open" state. **The request ends with a frame with the END_STREAM flag > set**, which causes the stream to become "half-closed (local)" for the > client and "half-closed (remote)" for the server. A response stream > starts with zero or more interim responses in HEADERS frames, followed > by a HEADERS frame containing a final status code. > > An HTTP response is complete after the server sends -- or the client > receives -- a frame with the END_STREAM flag set (including any > CONTINUATION frames needed to complete a field block). **A server can > send a complete response prior to the client sending an entire request > if the response does not depend on any portion of the request that has > not been sent and received.** because of this, a retry may panic when checking if the previous request body was capped, if a server delivers a response before the request is complete. this has been observed when retrying [wire-grpc](https://github.com/square/wire) requests, manifesting in a panic with this message: ```text thread 'main' panicked at 'if our `state` was `None`, the shared state must be `Some`', /__w/linkerd2-proxy/linkerd2-proxy/linkerd/http-retry/src/replay.rs:152:22 ``` this commit refactors `ReplayBody::is_capped()` so that it will no longer panic if there is an outstanding body still being polled. rather, it will return `Some(true)` or `Some(false)` if the previous body was capped, or `None` if it has not finished streaming. the related logic in the `linkerd-http-retry` library is updated to refrain from attempting a retry if a response is received before the request stream was completed. Signed-off-by: katelyn martin --- linkerd/app/outbound/src/http/retry.rs | 13 +++++++++++-- linkerd/http/retry/src/lib.rs | 7 +++++-- linkerd/http/retry/src/replay.rs | 16 ++++++---------- 3 files changed, 22 insertions(+), 14 deletions(-) diff --git a/linkerd/app/outbound/src/http/retry.rs b/linkerd/app/outbound/src/http/retry.rs index ef9a90fe87..427587d5f0 100644 --- a/linkerd/app/outbound/src/http/retry.rs +++ b/linkerd/app/outbound/src/http/retry.rs @@ -93,8 +93,17 @@ where .is_failure(); // did the body exceed the maximum length limit? let exceeded_max_len = req.body().is_capped(); - let retryable = is_failure && !exceeded_max_len; - tracing::trace!(is_failure, exceeded_max_len, retryable); + let retryable = if let Some(false) = exceeded_max_len { + // If the body hasn't exceeded our length limit, we should + // retry the request if it's a failure of some sort. + is_failure + } else { + // We received a response before the request body was fully + // finished streaming. To be safe, we will consider this + // as an unretryable request. + false + }; + tracing::trace!(is_failure, ?exceeded_max_len, retryable); retryable } }; diff --git a/linkerd/http/retry/src/lib.rs b/linkerd/http/retry/src/lib.rs index ac07c6c868..5c47cc2c64 100644 --- a/linkerd/http/retry/src/lib.rs +++ b/linkerd/http/retry/src/lib.rs @@ -268,7 +268,10 @@ async fn send_req_with_retries( tracing::trace!("Success on first attempt"); return result.map(|rsp| rsp.map(BoxBody::new)); } - if backup.body().is_capped() { + if matches!(backup.body().is_capped(), None | Some(true)) { + // The body was either too large, or we received an early response + // before the request body was completed read. We cannot safely + // attempt to send this request again. return result.map(|rsp| rsp.map(BoxBody::new)); } @@ -300,7 +303,7 @@ async fn send_req_with_retries( tracing::debug!("Retry success"); return result.map(|rsp| rsp.map(BoxBody::new)); } - if backup.body().is_capped() { + if matches!(backup.body().is_capped(), None | Some(true)) { return result.map(|rsp| rsp.map(BoxBody::new)); } } diff --git a/linkerd/http/retry/src/replay.rs b/linkerd/http/retry/src/replay.rs index ff53843146..e57f80991d 100644 --- a/linkerd/http/retry/src/replay.rs +++ b/linkerd/http/retry/src/replay.rs @@ -136,23 +136,19 @@ impl ReplayBody { state.get_or_insert_with(|| shared.lock().take().expect("missing body state")) } - /// Returns `true` if the body previously exceeded the configured maximum + /// Returns `Some(true)` if the body previously exceeded the configured maximum /// length limit. /// /// If this is true, the body is now empty, and the request should *not* be /// retried with this body. - pub fn is_capped(&self) -> bool { + /// + /// If this is `None`, another clone has currently acquired the state, and is + /// still being polled. + pub fn is_capped(&self) -> Option { self.state .as_ref() .map(BodyState::is_capped) - .unwrap_or_else(|| { - self.shared - .body - .lock() - .as_ref() - .expect("if our `state` was `None`, the shared state must be `Some`") - .is_capped() - }) + .or_else(|| self.shared.body.lock().as_ref().map(BodyState::is_capped)) } }