From 3d971df5c2bebae734468ab001810019384b3c44 Mon Sep 17 00:00:00 2001 From: Javier Evans Date: Wed, 6 Mar 2024 11:40:32 -0800 Subject: [PATCH] Cache byte range requests (#215). Fixes #188 # What A potential fix for #188 When the `Range` header is supplied: * NGINX will perform subrequests to s3 in byte ranges of `PROXY_CACHE_SLICE_SIZE` until the requested range is satisfied * Cache will be populated in slices of `PROXY_CACHE_SLICE_SIZE`. * Only the requested byte range will be cached When the `Range` header is not supplied: * Normal behavior - files will be cached in their entirety * For large files, `proxy_cache_lock` ensures that multiple requests for the same file are not cached multiple times. Requests received after the initial `MISS` will queue until they can be served from the cache (the initial request cache write is complete). ## Implementation Details * This solution takes advantage of the existing [redirectToS3](https://github.com/nginxinc/nginx-s3-gateway/blob/656395c2b2cc8aaf79a78b59b4abbe5b5d04a5a3/common/etc/nginx/include/s3gateway.js#L347) function to change the target NGINX conf location based on the presence of the `Range` header * The main configuration for the s3 proxy action has been broken out into `common/etc/nginx/templates/gateway/s3_location_common.conf.template` * A separate cache is defined for the slice-based caching * In the slice caching location, the [http_slice_module](http://nginx.org/en/docs/http/ngx_http_slice_module.html) is configured and other caching options overridden as necessary. ## Examples ### Normal Request ```bash curl -o foo.txt localhost:8989/a/5mb.txt % Total % Received % Xferd Average Speed Time Time Time Current Dload Upload Total Spent Left Speed 100 5120k 100 5120k 0 0 111M 0 --:--:-- --:--:-- --:--:-- 113M ``` A single cache file is created ```bash root@f339daeb2d44:/var/cache/nginx/s3_proxy# tree . . 
`-- 5 `-- 9e `-- 447b5a707c18a4c0e90344925e6b39e5 ``` The size of the cache file is equal to the requested file: ```bash root@f339daeb2d44:/var/cache/nginx/s3_proxy# du -h . 5.1M ./5/9e 5.1M ./5 5.1M . ``` ### Byte Range Request In this example, I'm requesting a 5mb file, and the `PROXY_CACHE_SLICE_SIZE` option has been set to `1000k` (1000 [kilobytes](http://nginx.org/en/docs/syntax.html)) ```bash curl -o foo.txt -r 1000000-4000000 localhost:8989/a/5mb.txt % Total % Received % Xferd Average Speed Time Time Time Current Dload Upload Total Spent Left Speed 100 2929k 100 2929k 0 0 66.8M 0 --:--:-- --:--:-- --:--:-- 68.1M ``` Cache files are created in chunks: ```bash root@f339daeb2d44:/var/cache/nginx/s3_proxy_slices# tree . . |-- 0 | `-- 5c | `-- 18f94c01f7a1beed3afe0aa92baf05c0 |-- 4 | `-- 30 | `-- 9fac913edc79622fdcc2975d91e4f304 |-- b | `-- 5b | `-- 91bfb9ef86136be4b07cdc2eb51025bb `-- d `-- 82 `-- 339384e3e9840cf7f8fe4e54fdc8182d ``` The size of each cache file is roughly equal to the configured chunk size: ```bash root@f339daeb2d44:/var/cache/nginx/s3_proxy_slices# du -h . 1008K ./d/82 1012K ./d 1008K ./0/5c 1012K ./0 1008K ./b/5b 1012K ./b 1008K ./4/30 1012K ./4 4.0M . 
``` --- Dockerfile.buildkit.plus | 1 + Dockerfile.oss | 1 + Dockerfile.plus | 1 + common/etc/nginx/include/s3gateway.js | 7 ++- common/etc/nginx/nginx.conf | 2 + .../etc/nginx/templates/cache.conf.template | 8 +++ .../etc/nginx/templates/default.conf.template | 57 +++++-------------- .../gateway/s3_location_common.conf.template | 44 ++++++++++++++ docs/getting_started.md | 14 ++++- settings.example | 1 + standalone_ubuntu_oss_install.sh | 3 + test/docker-compose.yaml | 1 + test/integration/test_api.sh | 33 ++++++++++- 13 files changed, 127 insertions(+), 46 deletions(-) create mode 100644 common/etc/nginx/templates/gateway/s3_location_common.conf.template diff --git a/Dockerfile.buildkit.plus b/Dockerfile.buildkit.plus index 36c7996f..50a09430 100644 --- a/Dockerfile.buildkit.plus +++ b/Dockerfile.buildkit.plus @@ -7,6 +7,7 @@ ENV XSLT_VERSION 30-1 ENV PROXY_CACHE_MAX_SIZE "10g" ENV PROXY_CACHE_INACTIVE "60m" +ENV PROXY_CACHE_SLICE_SIZE "1m" ENV PROXY_CACHE_VALID_OK "1h" ENV PROXY_CACHE_VALID_NOTFOUND "1m" ENV PROXY_CACHE_VALID_FORBIDDEN "30s" diff --git a/Dockerfile.oss b/Dockerfile.oss index 201db64b..3ad4568c 100644 --- a/Dockerfile.oss +++ b/Dockerfile.oss @@ -5,6 +5,7 @@ ENV NJS_VERSION "0.8.2" ENV PROXY_CACHE_MAX_SIZE "10g" ENV PROXY_CACHE_INACTIVE "60m" +ENV PROXY_CACHE_SLICE_SIZE "1m" ENV PROXY_CACHE_VALID_OK "1h" ENV PROXY_CACHE_VALID_NOTFOUND "1m" ENV PROXY_CACHE_VALID_FORBIDDEN "30s" diff --git a/Dockerfile.plus b/Dockerfile.plus index 80f2d15d..80f604ac 100644 --- a/Dockerfile.plus +++ b/Dockerfile.plus @@ -7,6 +7,7 @@ ENV XSLT_VERSION 30-1 ENV PROXY_CACHE_MAX_SIZE "10g" ENV PROXY_CACHE_INACTIVE "60m" +ENV PROXY_CACHE_SLICE_SIZE "1m" ENV PROXY_CACHE_VALID_OK "1h" ENV PROXY_CACHE_VALID_NOTFOUND "1m" ENV PROXY_CACHE_VALID_FORBIDDEN "30s" diff --git a/common/etc/nginx/include/s3gateway.js b/common/etc/nginx/include/s3gateway.js index b1be836a..7a497cf8 100644 --- a/common/etc/nginx/include/s3gateway.js +++ b/common/etc/nginx/include/s3gateway.js @@ -362,7 
+362,12 @@ function redirectToS3(r) { } else if (!ALLOW_LISTING && !PROVIDE_INDEX_PAGE && uriPath === "/") { r.internalRedirect("@error404"); } else { - r.internalRedirect("@s3"); + if (r.headersIn["Range"]) { + r.internalRedirect("@s3_sliced"); + } else { + r.internalRedirect("@s3"); + } + } } diff --git a/common/etc/nginx/nginx.conf b/common/etc/nginx/nginx.conf index 99b216a7..cd938089 100644 --- a/common/etc/nginx/nginx.conf +++ b/common/etc/nginx/nginx.conf @@ -26,7 +26,8 @@ env APPEND_SLASH_FOR_POSSIBLE_DIRECTORY; env DIRECTORY_LISTING_PATH_PREFIX; env PROXY_CACHE_MAX_SIZE; env PROXY_CACHE_INACTIVE; +env PROXY_CACHE_SLICE_SIZE; env PROXY_CACHE_VALID_OK; env PROXY_CACHE_VALID_NOTFOUND; env PROXY_CACHE_VALID_FORBIDDEN; env HEADER_PREFIXES_TO_STRIP; diff --git a/common/etc/nginx/templates/cache.conf.template b/common/etc/nginx/templates/cache.conf.template index 069d57ad..fc9ac922 100644 --- a/common/etc/nginx/templates/cache.conf.template +++ b/common/etc/nginx/templates/cache.conf.template @@ -6,3 +6,11 @@ keys_zone=s3_cache:10m max_size=$PROXY_CACHE_MAX_SIZE inactive=$PROXY_CACHE_INACTIVE use_temp_path=off; + + +proxy_cache_path /var/cache/nginx/s3_proxy_slices +levels=1:2 +keys_zone=s3_cache_slices:10m +max_size=$PROXY_CACHE_MAX_SIZE +inactive=$PROXY_CACHE_INACTIVE +use_temp_path=off; diff --git a/common/etc/nginx/templates/default.conf.template b/common/etc/nginx/templates/default.conf.template index 210ae7ce..faa1e492 100644 --- a/common/etc/nginx/templates/default.conf.template +++ b/common/etc/nginx/templates/default.conf.template @@ -83,7 +83,7 @@ server { # CORS is implemented by returning the appropriate headers as part of # the response to an OPTIONS request. If you want to customize the # CORS response, the cors.conf.template file can be overwritten and - # extended to meet one's needs. + # extended to meet your needs. 
include /etc/nginx/conf.d/gateway/cors.conf; auth_request /aws/credentials/retrieve; @@ -101,51 +101,22 @@ server { include /etc/nginx/conf.d/gateway/js_fetch_trusted_certificate.conf; } + # This is the primary location that proxies the request to s3 + # See the included s3_location_common.conf file for all logic location @s3 { - # We include only the headers needed for the authentication signatures that - # we plan to use. - include /etc/nginx/conf.d/gateway/v${AWS_SIGS_VERSION}_headers.conf; - - # The CORS configuration needs to be imported in several places in order for - # it to be applied within different contexts. - include /etc/nginx/conf.d/gateway/cors.conf; - - # Don't allow any headers from the client - we don't want them messing - # with S3 at all. - proxy_pass_request_headers off; - - # Enable passing of the server name through TLS Server Name Indication extension. - proxy_ssl_server_name on; - proxy_ssl_name ${S3_SERVER}; - - # Set the Authorization header to the AWS Signatures credentials - proxy_set_header Authorization $s3auth; - proxy_set_header X-Amz-Security-Token $awsSessionToken; - - # We set the host as the bucket name to inform the S3 API of the bucket - proxy_set_header Host $s3_host_hdr; - - # Use keep alive connections in order to improve performance - proxy_http_version 1.1; - proxy_set_header Connection ''; - - # We strip off all of the AWS specific headers from the server so that - # there is nothing identifying the object as having originated in an - # object store. - js_header_filter s3gateway.editHeaders; - - # Catch all errors from S3 and sanitize them so that the user can't - # gain intelligence about the S3 bucket being proxied. 
- proxy_intercept_errors on; - - # Comment out this line to receive the error messages returned by S3 - error_page 400 401 402 403 405 406 407 408 409 410 411 412 413 414 415 416 417 418 420 422 423 424 426 428 429 431 444 449 450 451 500 501 502 503 504 505 506 507 508 509 510 511 =404 @error404; - - error_page 404 @trailslashControl; + include /etc/nginx/conf.d/gateway/s3_location_common.conf; + } - proxy_pass ${S3_SERVER_PROTO}://storage_urls$s3uri; + # Same as the primary location above but handling and caching + # byte range requests efficiently + location @s3_sliced { + proxy_cache s3_cache_slices; + proxy_cache_valid 200 302 206 ${PROXY_CACHE_VALID_OK}; + proxy_cache_key "$request_method$host$uri$slice_range"; - include /etc/nginx/conf.d/gateway/s3_location.conf; + slice ${PROXY_CACHE_SLICE_SIZE}; + proxy_set_header Range $slice_range; + include /etc/nginx/conf.d/gateway/s3_location_common.conf; } location @s3PreListing { diff --git a/common/etc/nginx/templates/gateway/s3_location_common.conf.template b/common/etc/nginx/templates/gateway/s3_location_common.conf.template new file mode 100644 index 00000000..f65f9987 --- /dev/null +++ b/common/etc/nginx/templates/gateway/s3_location_common.conf.template @@ -0,0 +1,44 @@ +# We include only the headers needed for the authentication signatures that +# we plan to use. +include /etc/nginx/conf.d/gateway/v${AWS_SIGS_VERSION}_headers.conf; + +# The CORS configuration needs to be imported in several places in order for +# it to be applied within different contexts. +include /etc/nginx/conf.d/gateway/cors.conf; + +# Don't allow any headers from the client - we don't want them messing +# with S3 at all. +proxy_pass_request_headers off; + +# Enable passing of the server name through TLS Server Name Indication extension. 
+proxy_ssl_server_name on; +proxy_ssl_name ${S3_SERVER}; + +# Set the Authorization header to the AWS Signatures credentials +proxy_set_header Authorization $s3auth; +proxy_set_header X-Amz-Security-Token $awsSessionToken; + +# We set the host as the bucket name to inform the S3 API of the bucket +proxy_set_header Host $s3_host_hdr; + +# Use keep alive connections in order to improve performance +proxy_http_version 1.1; +proxy_set_header Connection ''; + +# We strip off all of the AWS specific headers from the server so that +# there is nothing identifying the object as having originated in an +# object store. +js_header_filter s3gateway.editHeaders; + +# Catch all errors from S3 and sanitize them so that the user can't +# gain intelligence about the S3 bucket being proxied. +proxy_intercept_errors on; + +# Comment out this line to receive the error messages returned by S3 +error_page 400 401 402 403 405 406 407 408 409 410 411 412 413 414 415 416 417 418 420 422 423 424 426 428 429 431 444 449 450 451 500 501 502 503 504 505 506 507 508 509 510 511 =404 @error404; + +error_page 404 @trailslashControl; + +proxy_pass ${S3_SERVER_PROTO}://storage_urls$s3uri; + +include /etc/nginx/conf.d/gateway/s3_location.conf; diff --git a/docs/getting_started.md b/docs/getting_started.md index 4685f2d8..d3380817 100644 --- a/docs/getting_started.md +++ b/docs/getting_started.md @@ -31,7 +31,8 @@ running as a Container or as a Systemd service. 
| `DIRECTORY_LISTING_PATH_PREFIX` | No | | | In `ALLOW_DIRECTORY_LIST=true` mode [adds defined prefix to links](#configuring-directory-listing) | | `DNS_RESOLVERS` | No | | | DNS resolvers (separated by single spaces) to configure NGINX with | | `PROXY_CACHE_MAX_SIZE` | No | | `10g` | Limits cache size | -| `PROXY_CACHE_INACTIVE` | No | | `60m` | Cached data that are not accessed during the time specified by the parameter get removed from the cache regardless of their freshness | +| `PROXY_CACHE_INACTIVE` | No | | `60m` | Cached data that are not accessed during the time specified by the parameter get removed from the cache regardless of their freshness | +| `PROXY_CACHE_SLICE_SIZE` | No | | `1m` | For requests with a `Range` header included, determines the size of the chunks in which the file is fetched. Values much smaller than the requests can lead to inefficiencies due to reading and writing many files. See [below for more details](#byte-range-requests-and-caching) | | `PROXY_CACHE_VALID_OK` | No | | `1h` | Sets caching time for response code 200 and 302 | | `PROXY_CACHE_VALID_NOTFOUND` | No | | `1m` | Sets caching time for response code 404 | | `PROXY_CACHE_VALID_FORBIDDEN` | No | | `30s` | Sets caching time for response code 403 | @@ -112,6 +113,17 @@ S3 bucket in a subfolder on an ALB. For example, if you wanted to expose the root of a bucket under the path "www.mysite.com/somepath", you would set this variable to "/somepath". +## Byte-Range Requests and Caching +The gateway caches [byte-range](https://developer.mozilla.org/en-US/docs/Web/HTTP/Range_requests) (requests sent with a `Range` header) requests differently than normal requests. + +The gateway is configured to cache such requests in chunks of size `PROXY_CACHE_SLICE_SIZE`. If you don't provide this configuration value it will default to 1 megabyte. + +This means that if you request 2.5 megabytes of a 1 gigabyte file, the gateway will cache 3 megabytes and nothing else. 
+ +Setting your slice size too small can have performance impacts since NGINX performs a subrequest for each slice. For more details see the [official reference](http://nginx.org/en/docs/http/ngx_http_slice_module.html). + +You may make byte-range requests and normal requests for the same file and NGINX will automatically handle them differently. The caches for file chunks and normal file requests are separate on disk. + ## Running as a Systemd Service An [install script](/standalone_ubuntu_oss_install.sh) for the gateway shows diff --git a/settings.example b/settings.example index 832eaaa5..c6d85b24 100644 --- a/settings.example +++ b/settings.example @@ -14,6 +14,7 @@ PROVIDE_INDEX_PAGE=false APPEND_SLASH_FOR_POSSIBLE_DIRECTORY=false DIRECTORY_LISTING_PATH_PREFIX="" PROXY_CACHE_MAX_SIZE=10g +PROXY_CACHE_SLICE_SIZE=1m PROXY_CACHE_INACTIVE=60m PROXY_CACHE_VALID_OK=1h PROXY_CACHE_VALID_NOTFOUND=1m diff --git a/standalone_ubuntu_oss_install.sh b/standalone_ubuntu_oss_install.sh index eab03665..ba161179 100644 --- a/standalone_ubuntu_oss_install.sh +++ b/standalone_ubuntu_oss_install.sh @@ -92,6 +92,7 @@ echo "Directory Listing Enabled: ${ALLOW_DIRECTORY_LIST}" echo "Directory Listing path prefix: ${DIRECTORY_LISTING_PATH_PREFIX}" echo "Cache size limit: ${PROXY_CACHE_MAX_SIZE}" echo "Cache inactive timeout: ${PROXY_CACHE_INACTIVE}" +echo "Slice size for byte range requests: ${PROXY_CACHE_SLICE_SIZE}" echo "Proxy Caching Time for Valid Response: ${PROXY_CACHE_VALID_OK}" echo "Proxy Caching Time for Not Found Response: ${PROXY_CACHE_VALID_NOTFOUND}" echo "Proxy Caching Time for Forbidden Response: ${PROXY_CACHE_VALID_FORBIDDEN}" @@ -167,6 +168,8 @@ DEBUG=${DEBUG:-'false'} PROXY_CACHE_MAX_SIZE=${PROXY_CACHE_MAX_SIZE:-'10g'} # Cached data that are not accessed during the time get removed PROXY_CACHE_INACTIVE=${PROXY_CACHE_INACTIVE:-'60m'} +# Request slice size +PROXY_CACHE_SLICE_SIZE=${PROXY_CACHE_SLICE_SIZE:-'1m'} # Proxy caching time for response code 200 and 
302 PROXY_CACHE_VALID_OK=${PROXY_CACHE_VALID_OK:-'1h'} # Proxy caching time for response code 404 diff --git a/test/docker-compose.yaml b/test/docker-compose.yaml index b129ea8e..44c58763 100644 --- a/test/docker-compose.yaml +++ b/test/docker-compose.yaml @@ -31,6 +31,7 @@ services: AWS_SIGS_VERSION: STATIC_SITE_HOSTING: PROXY_CACHE_MAX_SIZE: "10g" + PROXY_CACHE_SLICE_SIZE: "1m" PROXY_CACHE_INACTIVE: "60m" PROXY_CACHE_VALID_OK: "1h" PROXY_CACHE_VALID_NOTFOUND: "1m" diff --git a/test/integration/test_api.sh b/test/integration/test_api.sh index 082bb23a..a3f8a410 100644 --- a/test/integration/test_api.sh +++ b/test/integration/test_api.sh @@ -77,6 +77,14 @@ if ! [ -x "${checksum_cmd}" ]; then exit ${no_dep_exit_code} fi + +file_convert_command="$(command -v dd || true)" + +if ! [ -x "${file_convert_command}" ]; then + e "required dependency not found: dd not found in the path or not executable" + exit ${no_dep_exit_code} +fi + # If we are using the `md5` executable # then use the -r flag which makes it behave the same as `md5sum` # this is done after the `-x` check for ability to execute @@ -140,6 +148,27 @@ assertHttpRequestEquals() { exit ${test_fail_exit_code} fi fi + # Not a real method but better than making a whole new helper or massively refactoring this one + elif [ "${method}" = "GET_RANGE" ]; then + # Call format to check for a range of byte 30 to 1000: + # assertHttpRequestEquals "GET_RANGE" "a.txt" "data/bucket-1/a.txt" 30 1000 "206" + body_data_path="${test_dir}/$3" + range_start="$4" + range_end="$5" + byte_count=$((range_end - range_start + 1)) # add one since we read through the last byte + expected_response_code="$6" + + file_checksum=$(${file_convert_command} if="$body_data_path" bs=1 skip="$range_start" count="$byte_count" 2>/dev/null | ${checksum_cmd}) + expected_checksum="${file_checksum:0:${checksum_length}}" + + curl_checksum_output="$(${curl_cmd} -X "GET" -r "${range_start}"-"${range_end}" "${uri}" ${extra_arg} | ${checksum_cmd})" + 
s3_file_checksum="${curl_checksum_output:0:${checksum_length}}" + + if [ "${expected_checksum}" != "${s3_file_checksum}" ]; then + e "Checksum doesn't match expectation. Request [GET ${uri} Range: "${range_start}"-"${range_end}"] Expected [${expected_checksum}] Actual [${s3_file_checksum}]" + e "curl command: ${curl_cmd} -X "GET" -r "${range_start}"-"${range_end}" "${uri}" ${extra_arg} | ${checksum_cmd}" + exit ${test_fail_exit_code} + fi else e "Method unsupported: [${method}]" fi @@ -175,7 +204,6 @@ if [ -n "${prefix_leading_directory_path}" ]; then fi # Ordinary filenames - assertHttpRequestEquals "HEAD" "a.txt" "200" assertHttpRequestEquals "HEAD" "a.txt?some=param&that=should&be=stripped#aaah" "200" assertHttpRequestEquals "HEAD" "b/c/d.txt" "200" @@ -184,6 +212,9 @@ assertHttpRequestEquals "HEAD" "b/e.txt" "200" assertHttpRequestEquals "HEAD" "b//e.txt" "200" assertHttpRequestEquals "HEAD" "a/abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789.txt" "200" +# Byte range requests +assertHttpRequestEquals "GET_RANGE" 'a/plus%2Bplus.txt' "data/bucket-1/a/plus+plus.txt" 30 1000 "206" + # We try to request URLs that are properly encoded as well as URLs that # are not properly encoded to understand what works and what does not.