From c19175331469f1ff3c4223e5b0e354336322722b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=AB=98=E9=98=B3=E9=98=B3?= Date: Fri, 26 Apr 2024 08:40:45 +0800 Subject: [PATCH] remove duplicate content for local cache part (#5535) remove duplicate content from doc --- docs/get-started/VeloxABFS.md | 14 +------------- docs/get-started/VeloxLocalCache.md | 20 ++++++++++++++++++++ docs/get-started/VeloxS3.md | 14 +------------- 3 files changed, 22 insertions(+), 26 deletions(-) create mode 100644 docs/get-started/VeloxLocalCache.md diff --git a/docs/get-started/VeloxABFS.md b/docs/get-started/VeloxABFS.md index 9bb9c83329bb..6e08824237a8 100644 --- a/docs/get-started/VeloxABFS.md +++ b/docs/get-started/VeloxABFS.md @@ -20,16 +20,4 @@ spark.hadoop.fs.azure.account.key..dfs.core.windows.net XXXXXX # Local Caching support -Velox supports a local cache when reading data from HDFS/S3/ABFS. With this feature, Velox can asynchronously cache the data on local disk when reading from remote storage and future read requests on previously cached blocks will be serviced from local cache files. To enable the local caching feature, the following configurations are required: - -``` -spark.gluten.sql.columnar.backend.velox.cacheEnabled // enable or disable velox cache, default false. -spark.gluten.sql.columnar.backend.velox.memCacheSize // the total size of in-mem cache, default is 128MB. -spark.gluten.sql.columnar.backend.velox.ssdCachePath // the folder to store the cache files, default is "/tmp". -spark.gluten.sql.columnar.backend.velox.ssdCacheSize // the total size of the SSD cache, default is 128MB. Velox will do in-mem cache only if this value is 0. -spark.gluten.sql.columnar.backend.velox.ssdCacheShards // the shards of the SSD cache, default is 1. -spark.gluten.sql.columnar.backend.velox.ssdCacheIOThreads // the IO threads for cache promoting, default is 1. 
Velox will try to do "read-ahead" if this value is bigger than 1 -spark.gluten.sql.columnar.backend.velox.ssdODirect // enable or disable O_DIRECT on cache write, default false. -``` - -It's recommended to mount SSDs to the cache path to get the best performance of local caching. Cache files will be written to "spark.gluten.sql.columnar.backend.velox.cachePath", with UUID based suffix, e.g. "/tmp/cache.13e8ab65-3af4-46ac-8d28-ff99b2a9ec9b0". Gluten cannot reuse older caches for now, and the old cache files are left after Spark context shutdown. +Velox supports a local cache when reading data from ABFS. Please refer to the [Velox Local Cache](VeloxLocalCache.md) page for detailed configurations. \ No newline at end of file diff --git a/docs/get-started/VeloxLocalCache.md b/docs/get-started/VeloxLocalCache.md new file mode 100644 index 000000000000..1c7c40ced02b --- /dev/null +++ b/docs/get-started/VeloxLocalCache.md @@ -0,0 +1,20 @@ +--- +layout: page +title: Velox Local Caching +nav_order: 7 +parent: Getting-Started +--- + +Velox supports a local cache when reading data from HDFS/S3/ABFS. With this feature, Velox can asynchronously cache the data on local disk when reading from remote storage and future read requests on previously cached blocks will be serviced from local cache files. To enable the local caching feature, the following configurations are required: + +``` +spark.gluten.sql.columnar.backend.velox.cacheEnabled // enable or disable velox cache, default false. +spark.gluten.sql.columnar.backend.velox.memCacheSize // the total size of in-mem cache, default is 128MB. +spark.gluten.sql.columnar.backend.velox.ssdCachePath // the folder to store the cache files, default is "/tmp". +spark.gluten.sql.columnar.backend.velox.ssdCacheSize // the total size of the SSD cache, default is 128MB. Velox will do in-mem cache only if this value is 0. +spark.gluten.sql.columnar.backend.velox.ssdCacheShards // the shards of the SSD cache, default is 1. 
+spark.gluten.sql.columnar.backend.velox.ssdCacheIOThreads // the IO threads for cache promoting, default is 1. Velox will try to do "read-ahead" if this value is bigger than 1. +spark.gluten.sql.columnar.backend.velox.ssdODirect // enable or disable O_DIRECT on cache write, default false. +``` + +It's recommended to mount SSDs to the cache path to get the best performance of local caching. Cache files will be written to "spark.gluten.sql.columnar.backend.velox.ssdCachePath", with UUID based suffix, e.g. "/tmp/cache.13e8ab65-3af4-46ac-8d28-ff99b2a9ec9b0". Gluten cannot reuse older caches for now, and the old cache files are left after Spark context shutdown. diff --git a/docs/get-started/VeloxS3.md b/docs/get-started/VeloxS3.md index 2ece52b2fa5b..c57bf6da6843 100644 --- a/docs/get-started/VeloxS3.md +++ b/docs/get-started/VeloxS3.md @@ -58,16 +58,4 @@ You can change log granularity of AWS C++ SDK by setting the `spark.gluten.velox # Local Caching support -Velox supports a local cache when reading data from HDFS/S3. The feature is very useful if remote storage is slow, e.g., reading from a public S3 bucket and stronger performance is desired. With this feature, Velox can asynchronously cache the data on local disk when reading from remote storage, and the future reading requests on already cached blocks will be serviced from local cache files. To enable the local caching feature, below configurations are required: - -``` -spark.gluten.sql.columnar.backend.velox.cacheEnabled // enable or disable velox cache, default false. -spark.gluten.sql.columnar.backend.velox.memCacheSize // the total size of in-mem cache, default is 128MB. -spark.gluten.sql.columnar.backend.velox.ssdCachePath // the folder to store the cache files, default is "/tmp". -spark.gluten.sql.columnar.backend.velox.ssdCacheSize // the total size of the SSD cache, default is 128MB. Velox will do in-mem cache only if this value is 0. 
-spark.gluten.sql.columnar.backend.velox.ssdCacheShards // the shards of the SSD cache, default is 1. -spark.gluten.sql.columnar.backend.velox.ssdCacheIOThreads // the IO threads for cache promoting, default is 1. Velox will try to do "read-ahead" if this value is bigger than 1 -spark.gluten.sql.columnar.backend.velox.ssdODirect // enable or disable O_DIRECT on cache write, default false. -``` - -It's recommended to mount SSDs to the cache path to get the best performance of local caching. On the start up of Spark context, the cache files will be allocated under "spark.gluten.sql.columnar.backend.velox.cachePath", with UUID based suffix, e.g. "/tmp/cache.13e8ab65-3af4-46ac-8d28-ff99b2a9ec9b0". Gluten is not able to reuse older caches for now, and the old cache files are left there after Spark context shutdown. +Velox supports a local cache when reading data from S3. Please refer to the [Velox Local Cache](VeloxLocalCache.md) page for detailed configurations. \ No newline at end of file