This repository has been archived by the owner on May 28, 2024. It is now read-only.

Commit

Update docker images
Signed-off-by: Antoni Baum <[email protected]>
Yard1 committed Jul 3, 2023
1 parent caa3062 commit bbfe3c5
Showing 14 changed files with 37 additions and 15 deletions.
2 changes: 1 addition & 1 deletion aviary/backend/llm/predictor/predictor.py
@@ -448,7 +448,7 @@ async def _create_worker_group(
         await asyncio.gather(
             *[
                 initialize_node_remote_pg.remote(
-                    llm_config.model_id,
+                    llm_config.actual_hf_model_id,
                     llm_config.initialization.s3_mirror_config,
                 )
                 for i in range(scaling_config.num_workers)
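(A plausible reading of this change, not stated in the commit: `actual_hf_model_id` resolves the configured aviary `model_id` to the underlying Hugging Face Hub ID before node initialization. The property itself is not shown in this diff.)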
12 changes: 7 additions & 5 deletions aviary/backend/llm/utils.py
@@ -33,7 +33,7 @@ def download_model(
     Download a model from an S3 bucket and save it in TRANSFORMERS_CACHE for
     seamless interoperability with Hugging Face's Transformers library.

-    The downloaded model must have a 'hash' file containing the commit hash corresponding
+    The downloaded model may have a 'hash' file containing the commit hash corresponding
     to the commit on Hugging Face Hub.
     """
     from transformers.utils.hub import TRANSFORMERS_CACHE
@@ -48,11 +48,13 @@
         + [os.path.join(bucket_uri, "hash"), "."]
     )
     if not os.path.exists(os.path.join(".", "hash")):
-        raise RuntimeError(
-            "Hash file not found in the bucket or bucket could not have been downloaded."
+        f_hash = "0000000000000000000000000000000000000000"
+        logger.warning(
+            f"hash file does not exist in {bucket_uri}. Using {f_hash} as the hash."
         )
-    with open(os.path.join(".", "hash"), "r") as f:
-        f_hash = f.read().strip()
+    else:
+        with open(os.path.join(".", "hash"), "r") as f:
+            f_hash = f.read().strip()
     logger.info(
         f"Downloading {model_id} from {bucket_uri} to {os.path.join(path, 'snapshots', f_hash)}"
     )
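For context, the new behavior of download_model is roughly the sketch below. This is a simplification, not the exact aviary code: the awscli calls that fetch the bucket contents are omitted, and resolve_snapshot_hash is a hypothetical helper name.

import logging
import os

logger = logging.getLogger(__name__)


def resolve_snapshot_hash(bucket_uri: str) -> str:
    """Return the Hub commit hash to use for a mirrored model.

    If the bucket has no 'hash' file, fall back to an all-zeros
    placeholder so the model still gets a deterministic snapshot
    directory under TRANSFORMERS_CACHE/<model>/snapshots/<hash>.
    """
    hash_path = os.path.join(".", "hash")
    if not os.path.exists(hash_path):
        f_hash = "0" * 40  # placeholder shaped like a git commit SHA
        logger.warning(
            f"hash file does not exist in {bucket_uri}. Using {f_hash} as the hash."
        )
    else:
        with open(hash_path, "r") as f:
            f_hash = f.read().strip()
    return f_hash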
9 changes: 9 additions & 0 deletions aviary/backend/server/models.py
@@ -400,6 +400,15 @@ def initializer_pipeline(cls, values):
         )
         return values

+    @root_validator
+    def s3_mirror_config_transformers(cls, values):
+        s3_mirror_config: S3MirrorConfig = values.get("s3_mirror_config")
+        if s3_mirror_config and s3_mirror_config.bucket_uri:
+            initializer: Initializer = values.get("initializer")
+            if isinstance(initializer, Transformers):
+                initializer.from_pretrained_kwargs["local_files_only"] = True
+        return values
+

 class StaticBatchingInitializationConfig(InitializationConfig):
     initializer: Annotated[
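The effect of the new validator: whenever a model is backed by an S3 mirror, the Transformers initializer is forced to read weights only from the local cache, so from_pretrained never falls back to downloading from the Hugging Face Hub. A minimal self-contained sketch of the same pattern, assuming pydantic v1; the classes here are simplified stand-ins for aviary's real models:

from typing import Optional

from pydantic import BaseModel, root_validator


class S3MirrorConfig(BaseModel):
    bucket_uri: Optional[str] = None


class Transformers(BaseModel):
    from_pretrained_kwargs: dict = {}


class InitializationConfig(BaseModel):
    s3_mirror_config: Optional[S3MirrorConfig] = None
    initializer: Optional[Transformers] = None

    @root_validator
    def s3_mirror_config_transformers(cls, values):
        # Mirrored weights are already in the local HF cache, so
        # transformers must not try to re-download them from the Hub.
        s3_mirror_config = values.get("s3_mirror_config")
        if s3_mirror_config and s3_mirror_config.bucket_uri:
            initializer = values.get("initializer")
            if isinstance(initializer, Transformers):
                initializer.from_pretrained_kwargs["local_files_only"] = True
        return values


config = InitializationConfig(
    s3_mirror_config=S3MirrorConfig(bucket_uri="s3://my-mirror/my-model/"),  # placeholder URI
    initializer=Transformers(),
)
assert config.initializer.from_pretrained_kwargs["local_files_only"] is True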
5 changes: 5 additions & 0 deletions aviary/common/constants.py
@@ -350,6 +350,11 @@
     font-size: 1rem;
 }
+.ticker-container.block {
+    padding: 4px 8px !important;
+    border: 1px solid var(--button-primary-border-color) !important;
+}
+
 #prompt-examples-column {
     flex-grow: 0 !important;
 }
2 changes: 1 addition & 1 deletion aviary/frontend/app.py
@@ -457,7 +457,7 @@ def noop(*args, **kwargs):
         pass

     # Get the port the serve app is running on
-    controller = serve.context._global_client._controller
+    controller = ray.serve.context.get_global_client()._controller
     port = ray.get(controller.get_http_config.remote()).port

     blocks._queue.set_url(f"http://localhost:{port}/")
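(A likely motivation, not stated in the commit: `_global_client` is a private attribute, and `ray.serve.context.get_global_client()` is the accessor for the same Serve client, which is presumably more robust against the newer Ray nightly wheels these images install.)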
2 changes: 1 addition & 1 deletion deploy/_internal/backend/service.yaml
@@ -1,4 +1,4 @@
-models: ./models
+models: ./models/continuous_batching

 ray_serve_config:
   applications:
7 changes: 4 additions & 3 deletions deploy/ray/Dockerfile
@@ -2,12 +2,13 @@ FROM rayproject/ray:nightly-cu118

 RUN sudo apt-get update && sudo apt-get install -y libaio-dev git-lfs awscli && sudo rm -rf /var/lib/apt/lists/*

-RUN conda install python=3.10
+RUN conda update -n base -c defaults conda && conda install python=3.10
 RUN pip install -i https://download.pytorch.org/whl/cu118 torch torchvision torchaudio
 RUN pip uninstall -y ray && pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-3.0.0.dev0-cp310-cp310-manylinux2014_x86_64.whl

Comment from kevin85421 (Member), Jul 12, 2023:
    Why do we uninstall the existing ray?

 # Created by build_aviary_wheel.sh
 COPY "./dist" "/home/ray/dist"
-RUN cd /home/ray/dist && pip install "$(ls *.whl | head -n1)[backend]"
+RUN cd /home/ray/dist && pip install "$(ls *.whl | head -n1)[backend, frontend]"

 # The build context should be the root of the repo
 # So this gives the model definitions
@@ -18,4 +19,4 @@ ENV SAFETENSORS_FAST_GPU=1
 ENV RAY_SERVE_ENABLE_EXPERIMENTAL_STREAMING=1
 RUN echo "Testing aviary install" && python -c "import aviary.backend"

-RUN pip cache purge && conda clean -a && rm -rf ~/.cache
+RUN (pip cache purge || true) && conda clean -a && rm -rf ~/.cache
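(`pip cache purge` exits non-zero when there is nothing to purge, so the `(... || true)` wrapper presumably keeps this cleanup step from failing the image build; the same guard is applied in Dockerfile-tgi below.)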
5 changes: 3 additions & 2 deletions deploy/ray/Dockerfile-tgi
@@ -28,11 +28,12 @@ RUN export FORCE_CUDA=1 NVCC_PREPEND_FLAGS="--forward-unknown-opts" DS_BUILD_OPS
"numpy<1.24" \
"ninja"
RUN pip install --no-deps "git+https://github.com/huggingface/optimum.git"
RUN pip uninstall -y ray && pip install -U https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-3.0.0.dev0-cp310-cp310-manylinux2014_x86_64.whl
RUN source "$HOME/.cargo/env" && pip install boto3

# Created by build_aviary_wheel.sh
COPY "./dist" "/home/ray/dist"
RUN cd /home/ray/dist && pip install "$(ls *.whl | head -n1)[backend]"
RUN cd /home/ray/dist && pip install --no-deps "$(ls *.whl | head -n1)[backend, frontend]"

Comment from shaowei-su, Jul 5, 2023:
    why no-deps? some aviary dependency is broken in the latest anyscale/aviary:latest-tgi, e.g. typer

Comment from shaowei-su, Jul 5, 2023:
    typer==0.6.1

 # The build context should be the root of the repo
 # So this gives the model definitions
@@ -48,4 +49,4 @@ RUN sudo supervisord --version

 RUN echo "Testing aviary install" && python -c "import aviary.backend"

-RUN pip cache purge && conda clean -a && rm -rf ~/.cache
+RUN (pip cache purge || true) && conda clean -a && rm -rf ~/.cache
2 changes: 1 addition & 1 deletion models/continuous_batching/lmsys--vicuna-33b-v1.3.yaml
@@ -2,7 +2,7 @@ deployment_config:
   autoscaling_config:
     min_replicas: 1
     initial_replicas: 1
-    max_replicas: 8
+    max_replicas: 1
     target_num_ongoing_requests_per_replica: 1.0
     metrics_interval_s: 10.0
     look_back_period_s: 30.0
@@ -15,9 +15,9 @@ deployment_config:
       accelerator_type_cpu: 0.01
 model_config:
   model_id: OpenAssistant/falcon-40b-sft-top1-560
+  batching: static
   max_input_words: 800
   initialization:
-
     s3_mirror_config:
       bucket_uri: s3://large-dl-models-mirror/models--OpenAssistant--falcon-40b-sft-top1-560/main-safetensors/
     initializer:
@@ -14,6 +14,7 @@ deployment_config:
       accelerator_type_cpu: 0.01
 model_config:
   model_id: OpenAssistant/falcon-7b-sft-top1-696
+  batching: static
   max_input_words: 800
   initialization:
     s3_mirror_config:
@@ -14,6 +14,7 @@ deployment_config:
       accelerator_type_cpu: 0.01
 model_config:
   model_id: OpenAssistant/oasst-sft-7-llama-30b-xor
+  batching: static
   max_input_words: 800
   model_description: "Open Assistant is a project meant to give everyone access to a great chat based large language model.\nWe believe that by doing this we will create a revolution in innovation in language. In the same way that stable-diffusion helped the world make art and images in new ways we hope Open Assistant can help improve the world by improving language itself."
   initialization:
1 change: 1 addition & 0 deletions models/static_batching/lmsys--vicuna-13b-delta-v1.1.yaml
@@ -13,6 +13,7 @@ deployment_config:
     resources:
       accelerator_type_cpu: 0.01
 model_config:
+  batching: static
   model_id: lmsys/vicuna-13b-delta-v1.1
   max_input_words: 800
   model_description: "Vicuna is an open-source chatbot trained by fine-tuning LLaMA on user-shared conversations collected from ShareGPT. It is an auto-regressive language model, based on the transformer architecture."
@@ -14,6 +14,7 @@ deployment_config:
       accelerator_type_cpu: 0.01
 model_config:
   model_id: mosaicml/mpt-30b-chat
+  batching: static
   max_input_words: 800
   initialization:
     s3_mirror_config:
