From c28b88e764b61c0378510c35fc164e55453f40b4 Mon Sep 17 00:00:00 2001
From: "lu.wang@databricks.com"
Date: Thu, 14 Dec 2023 01:05:32 +0000
Subject: [PATCH] Add examples for mixtral 8x7b

---
 llm-models/mixtral-8x7b/01_load_inference.py | 229 +++++++++++++++
 .../02_mlflow_logging_inference.py           | 275 ++++++++++++++++++
 llm-models/mixtral-8x7b/README.md            |  28 ++
 3 files changed, 532 insertions(+)
 create mode 100644 llm-models/mixtral-8x7b/01_load_inference.py
 create mode 100644 llm-models/mixtral-8x7b/02_mlflow_logging_inference.py
 create mode 100644 llm-models/mixtral-8x7b/README.md

diff --git a/llm-models/mixtral-8x7b/01_load_inference.py b/llm-models/mixtral-8x7b/01_load_inference.py
new file mode 100644
index 0000000..7aba99e
--- /dev/null
+++ b/llm-models/mixtral-8x7b/01_load_inference.py
@@ -0,0 +1,229 @@
+# Databricks notebook source
+# MAGIC %md
+# MAGIC # `Mixtral-8x7B-Instruct-v0.1` Inference with vllm on Databricks
+# MAGIC
+# MAGIC The [mistralai/Mixtral-8x7B-Instruct-v0.1](https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1) Large Language Model (LLM) is an instruct fine-tuned version of the [mistralai/Mixtral-8x7B-v0.1](https://huggingface.co/mistralai/Mixtral-8x7B-v0.1) generative text model, tuned on a variety of publicly available conversation datasets.
+# MAGIC
+# MAGIC [vllm](https://github.com/vllm-project/vllm/tree/main) is an open-source library that speeds up LLM inference through a number of optimizations.
+# MAGIC
+# MAGIC Environment for this notebook:
+# MAGIC - Runtime: 14.1 GPU ML Runtime
+# MAGIC   - There may be CUDA incompatibility issues when installing and using vllm on 13.x GPU ML Runtime.
+# MAGIC - Instance: `g5.48xlarge` on AWS (8xA10)
+
+# COMMAND ----------
+
+# MAGIC %md
+# MAGIC ## Speed up inference with vllm
+
+# COMMAND ----------
+
+# MAGIC %md
+# MAGIC ### Install vllm
+
+# COMMAND ----------
+
+# MAGIC %pip install vllm==0.2.4 transformers==4.36.0 megablocks==0.5.0 ray
+# MAGIC dbutils.library.restartPython()
+
+# COMMAND ----------
+
+# MAGIC %md
+# MAGIC ### Load the model using vllm
+
+# COMMAND ----------
+
+from vllm import LLM
+
+# Distribute the model across all 8 A10 GPUs with tensor parallelism
+llm = LLM(model="mistralai/Mixtral-8x7B-Instruct-v0.1", tensor_parallel_size=8)
+
+# COMMAND ----------
+
+# MAGIC %md
+# MAGIC ### Inference
+
+# COMMAND ----------
+
+# To leverage the instruction fine-tuning, the prompt should be surrounded by [INST] and [/INST] tokens.
+# The very first instruction should begin with a beginning-of-sentence id, while subsequent instructions should not.
+# The assistant generation is terminated by the end-of-sentence token id.
+
+DEFAULT_SYSTEM_PROMPT = """\
+You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.
+
+If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information."""
+
+INTRO_BLURB = "Below is an instruction that describes a task. Write a response that appropriately completes the request."
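+
+# Optional sanity check (an illustrative addition, not required below): the tokenizer shipped with
+# Mixtral-8x7B-Instruct-v0.1 carries its own chat template, so you can compare the hand-written
+# template defined next with the tokenizer's rendering. Uncomment to try it; it only downloads
+# the tokenizer, not the model weights.
+# from transformers import AutoTokenizer
+# tok = AutoTokenizer.from_pretrained("mistralai/Mixtral-8x7B-Instruct-v0.1")
+# print(tok.apply_chat_template(
+#     [{"role": "user", "content": "What is a large language model?"}],
+#     tokenize=False,
+# ))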
+PROMPT_FOR_GENERATION_FORMAT = """
+[INST]<<SYS>>
+{system_prompt}
+<</SYS>>
+
+{instruction}
+[/INST]
+""".format(
+    system_prompt=DEFAULT_SYSTEM_PROMPT,
+    instruction="{instruction}"
+)
+
+# COMMAND ----------
+
+from vllm import SamplingParams
+
+# Define a helper function to generate text
+def gen_text(prompts, use_template=False, **kwargs):
+    if use_template:
+        full_prompts = [
+            PROMPT_FOR_GENERATION_FORMAT.format(instruction=prompt)
+            for prompt in prompts
+        ]
+    else:
+        full_prompts = prompts
+
+    # The default max_tokens is quite small (16), which would cut off the generated output
+    # in the middle, so increase it to allow complete responses
+    if "max_tokens" not in kwargs:
+        kwargs["max_tokens"] = 512
+
+    # Configure other text generation arguments; see the common configurable args here:
+    # https://github.com/vllm-project/vllm/blob/main/vllm/sampling_params.py
+    # kwargs.update(
+    #     {
+    #         "temperature": 0.8,
+    #     }
+    # )
+
+    sampling_params = SamplingParams(**kwargs)
+
+    outputs = llm.generate(full_prompts, sampling_params=sampling_params)
+    texts = [out.outputs[0].text for out in outputs]
+
+    return texts
+
+# COMMAND ----------
+
+# Inference on a single input
+results = gen_text(["What is a large language model?"])
+print(results[0])
+
+# COMMAND ----------
+
+# Use args such as temperature and max_tokens to control text generation
+results = gen_text(["What is a large language model?"], temperature=0.5, max_tokens=100, use_template=True)
+print(results[0])
+
+# COMMAND ----------
+
+# Check the generation quality when the context is long
+from transformers import AutoTokenizer
+long_input = """Provide a concise summary of the below passage.
+
+Hannah Arendt was one of the seminal political thinkers of the twentieth century. The power and originality of her thinking was evident in works such as The Origins of Totalitarianism, The Human Condition, On Revolution and The Life of the Mind. In these works and in numerous essays she grappled with the most crucial political events of her time, trying to grasp their meaning and historical import, and showing how they affected our categories of moral and political judgment. What was required, in her view, was a new framework that could enable us to come to terms with the twin horrors of the twentieth century, Nazism and Stalinism. She provided such framework in her book on totalitarianism, and went on to develop a new set of philosophical categories that could illuminate the human condition and provide a fresh perspective on the nature of political life.
+
+Although some of her works now belong to the classics of the Western tradition of political thought, she has always remained difficult to classify. Her political philosophy cannot be characterized in terms of the traditional categories of conservatism, liberalism, and socialism. Nor can her thinking be assimilated to the recent revival of communitarian political thought, to be found, for example, in the writings of A. MacIntyre, M. Sandel, C. Taylor and M. Walzer. Her name has been invoked by a number of critics of the liberal tradition, on the grounds that she presented a vision of politics that stood in opposition some key liberal principles. There are many strands of Arendt’s thought that could justify such a claim, in particular, her critique of representative democracy, her stress on civic engagement and political deliberation, her separation of morality from politics, and her praise of the revolutionary tradition.
+However, it would be a mistake to view Arendt as an anti-liberal thinker. Arendt was in fact a stern defender of constitutionalism and the rule of law, an advocate of fundamental human rights (among which she included not only the right to life, liberty, and freedom of expression, but also the right to action and to opinion), and a critic of all forms of political community based on traditional ties and customs, as well as those based on religious, ethnic, or racial identity.
+
+Arendt’s political thought cannot, in this sense, be identified either with the liberal tradition or with the claims advanced by a number of its critics. Arendt did not conceive of politics as a means for the satisfaction of individual preferences, nor as a way to integrate individuals around a shared conception of the good. Her conception of politics is based instead on the idea of active citizenship, that is, on the value and importance of civic engagement and collective deliberation about all matters affecting the political community. If there is a tradition of thought with which Arendt can be identified, it is the classical tradition of civic republicanism originating in Aristotle and embodied in the writings of Machiavelli, Montesquieu, Jefferson, and Tocqueville. According to this tradition politics finds its authentic expression whenever citizens gather together in a public space to deliberate and decide about matters of collective concern. Political activity is valued not because it may lead to agreement or to a shared conception of the good, but because it enables each citizen to exercise his or her powers of agency, to develop the capacities for judgment and to attain by concerted action some measure of political efficacy."""
+
+def get_num_tokens(text):
+    # Mixtral uses the same tokenizer vocabulary as Mistral-7B, so the Mistral-7B-Instruct tokenizer is used here to count tokens
+    mistral_tokenizer = AutoTokenizer.from_pretrained("mistralai/Mistral-7B-Instruct-v0.1", padding_side="left")
+    inputs = mistral_tokenizer(text, return_tensors="pt").input_ids.to("cuda")
+    return inputs.shape[1]
+
+print('number of tokens for input:', get_num_tokens(long_input))
+
+results = gen_text([long_input], use_template=True, max_tokens=150)
+print(results[0])
+
+# COMMAND ----------
+
+# MAGIC %md
+# MAGIC ### Batch Inference
+# MAGIC
+
+# COMMAND ----------
+
+# From databricks-dolly-15k
+inputs = [
+    "Think of some family rules to promote a healthy family relationship",
+    "In the series A Song of Ice and Fire, who is the founder of House Karstark?",
+    "which weighs more, cold or hot water?",
+    "Write a short paragraph about why you should not have both a pet cat and a pet bird.",
+    "Is beauty objective or subjective?",
+    "What is SVM?",
+    "What is the current capital of Japan?",
+    "Name 10 colors",
+    "How should I invest my money?",
+    "What are some ways to improve the value of your home?",
+    "What does fasting mean?",
+    "What is cloud computing in simple terms?",
+    "What is the meaning of life?",
+    "What is Linux?",
+    "Why do people like gardening?",
+    "What makes for a good photograph?"
+]
+
+# COMMAND ----------
+
+results = gen_text(inputs, use_template=True)
+
+for i, output in enumerate(results):
+    print(f"======Output No. {i+1}======")
+    print(output)
+    print("\n")
+
+# COMMAND ----------
+
+# MAGIC %md
+# MAGIC ### Measure inference speed
+# MAGIC Text generation speed is often measured in tokens/s, the average number of tokens the model generates per second.
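+# MAGIC For example, if a request produces 512 completion tokens in 8 seconds of wall-clock time, its throughput is 512 / 8 = 64 tokens/s. The helper below reports exactly this ratio for a single prompt, counting completion tokens only (prompt tokens are excluded).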
+# MAGIC
+
+# COMMAND ----------
+
+import time
+
+
+def get_gen_text_throughput(prompt, use_template=True, **kwargs):
+    """
+    Return a tuple (tokens/sec, number of generated tokens, generated text).
+    """
+    if use_template:
+        full_prompt = PROMPT_FOR_GENERATION_FORMAT.format(instruction=prompt)
+    else:
+        full_prompt = prompt
+
+    if "max_tokens" not in kwargs:
+        kwargs["max_tokens"] = 512
+    sampling_params = SamplingParams(**kwargs)
+
+    num_input_tokens = get_num_tokens(full_prompt)
+
+    # Measure the time it takes for text generation
+    start = time.time()
+    outputs = llm.generate(full_prompt, sampling_params=sampling_params)
+    duration = time.time() - start
+
+    # Get the number of generated tokens (prompt tokens are not included)
+    token_ids = outputs[0].outputs[0].token_ids
+    n_tokens = len(token_ids)
+
+    # Get the generated text
+    text = outputs[0].outputs[0].text
+
+    return (n_tokens / duration, n_tokens, text)
+
+# COMMAND ----------
+
+throughput, n_tokens, text = get_gen_text_throughput("What is ML?", use_template=False)
+
+print(f"{throughput:.2f} tokens/sec, {n_tokens} tokens (not including prompt)")
+
+# COMMAND ----------
+
+# When the context is long or the generated text is long, it takes longer on average to generate each token
+throughput, n_tokens, text = get_gen_text_throughput(long_input, use_template=True, max_tokens=200)
+
+print(f"{throughput:.2f} tokens/sec, {n_tokens} tokens (not including prompt)")
+
+# COMMAND ----------
+
+
diff --git a/llm-models/mixtral-8x7b/02_mlflow_logging_inference.py b/llm-models/mixtral-8x7b/02_mlflow_logging_inference.py
new file mode 100644
index 0000000..7309ce0
--- /dev/null
+++ b/llm-models/mixtral-8x7b/02_mlflow_logging_inference.py
@@ -0,0 +1,275 @@
+# Databricks notebook source
+# MAGIC %md
+# MAGIC # Manage the `Mixtral-8x7B-Instruct-v0.1` model with MLflow on Databricks
+# MAGIC
+# MAGIC The [mistralai/Mixtral-8x7B-Instruct-v0.1](https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1) Large Language Model (LLM) is an instruct fine-tuned version of the [mistralai/Mixtral-8x7B-v0.1](https://huggingface.co/mistralai/Mixtral-8x7B-v0.1) generative text model, tuned on a variety of publicly available conversation datasets.
+# MAGIC
+# MAGIC Environment for this notebook:
+# MAGIC - Runtime: 14.1 GPU ML Runtime
+# MAGIC   - There may be CUDA incompatibility issues when installing and using vllm on 13.x GPU ML Runtime.
+# MAGIC - Instance: `g5.48xlarge` on AWS (8xA10)
+
+# COMMAND ----------
+
+# MAGIC %pip install -U "mlflow-skinny[databricks]>=2.6.0"
+# MAGIC %pip install -U vllm==0.2.4 transformers==4.36.0 megablocks==0.5.0 ray
+# MAGIC %pip install -U databricks-sdk
+# MAGIC dbutils.library.restartPython()
+
+# COMMAND ----------
+
+# MAGIC %md
+# MAGIC ## Log the model to MLflow
+
+# COMMAND ----------
+
+# It is recommended to pin the revision commit hash (and not change it) for reproducibility, because the uploader might update the model afterwards.
+# You can find the commit history of Mixtral-8x7B-Instruct-v0.1 at https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1/commits/main
+model = "mistralai/Mixtral-8x7B-Instruct-v0.1"
+revision = "3de0408ae8b591d9ac516a2384925dd98ebc66f4"
+
+from huggingface_hub import snapshot_download
+
+# If the model has already been downloaded in previous cells, this does not re-download the large model files; it only fetches the remaining files in the repo
+snapshot_location = snapshot_download(repo_id=model, revision=revision, cache_dir="/local_disk0/mixtral-8x7b")
+
+# COMMAND ----------
+
+import json
+import mlflow
+import torch
+
+from vllm import LLM
+from vllm import SamplingParams
+
+DEFAULT_SYSTEM_PROMPT = """\
+You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.
+
+If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information."""
+
+INTRO_BLURB = "Below is an instruction that describes a task. Write a response that appropriately completes the request."
+PROMPT_FOR_GENERATION_FORMAT = """
+[INST]<<SYS>>
+{system_prompt}
+<</SYS>>
+
+{instruction}
+[/INST]
+""".format(
+    system_prompt=DEFAULT_SYSTEM_PROMPT,
+    instruction="{instruction}"
+)
+
+
+class MixtralInstruct(mlflow.pyfunc.PythonModel):
+    def load_context(self, context):
+        """
+        This method initializes the vllm engine (tokenizer and language model)
+        from the model repository stored in the model artifacts.
+        """
+        self.llm = LLM(model=context.artifacts["repository"], tensor_parallel_size=8)
+
+    def _generate_response(self, prompts, **kwargs):
+        full_prompts = [
+            PROMPT_FOR_GENERATION_FORMAT.format(instruction=prompt)
+            for prompt in prompts
+        ]
+
+        sampling_params = SamplingParams(**kwargs)
+
+        outputs = self.llm.generate(full_prompts, sampling_params=sampling_params)
+        responses = []
+        for out in outputs:
+            prompt_tokens = len(out.prompt_token_ids)
+            completion_tokens = sum([len(output.token_ids) for output in out.outputs])
+            responses.append({
+                "request_id": out.request_id,
+                "object": "text_completion",
+                "model": "Mixtral-8x7B-Instruct-v0.1",
+                "choices": [{"text": output.text, "index": output.index, "logprobs": output.logprobs, "finish_reason": output.finish_reason} for output in out.outputs],
+                "usage": {
+                    "prompt_tokens": prompt_tokens,
+                    "completion_tokens": completion_tokens,
+                    "total_tokens": prompt_tokens + completion_tokens
+                }
+            })
+        return responses
+
+    def predict(self, context, model_input, params=None):
+        """
+        This method generates predictions for the given input.
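+        `model_input` carries the "prompt" values to generate completions for, and the
+        optional `params` dict may supply `candidate_count`, `temperature`,
+        `max_new_tokens`, and `stop`; the defaults below are used for any missing key.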
+        The input parameters follow the MLflow `llm/v1/completions` convention; see
+        https://mlflow.org/docs/latest/gateway/index.html
+        """
+
+        # The standard parameters for completion routes of type llm/v1/completions can be found at
+        # https://mlflow.org/docs/latest/gateway/index.html
+        # Fall back to defaults when no params are passed
+        params = params or {}
+        prompts = model_input["prompt"]
+        candidate_count = params.get("candidate_count", 1)
+        temperature = params.get("temperature", 1.0)
+        max_tokens = params.get("max_new_tokens", 100)
+        stop = params.get("stop", [])
+
+        responses = self._generate_response(
+            prompts,
+            n=candidate_count,
+            temperature=temperature,
+            max_tokens=max_tokens,
+            stop=stop,
+        )
+
+        return responses
+
+
+# COMMAND ----------
+
+# Define the prompt template used to get the expected behavior from the instruct version of the model.
+# See the Llama reference code for details: https://github.com/facebookresearch/llama/blob/main/llama/generation.py#L212
+
+DEFAULT_SYSTEM_PROMPT = """\
+You are a helpful, respectful and honest assistant. Always answer as helpfully as possible, while being safe. Your answers should not include any harmful, unethical, racist, sexist, toxic, dangerous, or illegal content. Please ensure that your responses are socially unbiased and positive in nature.
+
+If a question does not make any sense, or is not factually coherent, explain why instead of answering something not correct. If you don't know the answer to a question, please don't share false information."""
+
+
+def build_prompt(instruction):
+    return f"""[INST]<<SYS>>\n{DEFAULT_SYSTEM_PROMPT}\n<</SYS>>\n\n\n{instruction}[/INST]\n"""
+
+
+# COMMAND ----------
+
+import mlflow
+from mlflow.models import infer_signature
+
+# Define the model signature, including params
+input_example = {"prompt": "What is Machine Learning?"}
+inference_config = {
+    "temperature": 1.0,
+    "max_new_tokens": 100,
+    "do_sample": True,
+}
+signature = infer_signature(
+    model_input=input_example,
+    model_output="Machine Learning is...",
+    params=inference_config
+)
+
+# Log the model with its details such as artifacts, pip requirements and input example
+with mlflow.start_run() as run:
+    mlflow.pyfunc.log_model(
+        "model",
+        python_model=MixtralInstruct(),
+        artifacts={"repository": snapshot_location},
+        input_example=input_example,
+        pip_requirements=["torch==2.1.1", "transformers==4.36.0", "accelerate==0.25.0", "torchvision==0.16.1", "vllm==0.2.4", "megablocks==0.5.0", "ray==2.8.1"],
+        signature=signature,
+    )
+
+# COMMAND ----------
+
+# MAGIC %md
+# MAGIC ## Register the model to Unity Catalog
+# MAGIC By default, MLflow registers models in the Databricks workspace model registry. To register models in Unity Catalog instead, we follow the [documentation](https://docs.databricks.com/machine-learning/manage-model-lifecycle/index.html) and set the registry URI to Databricks Unity Catalog.
+# MAGIC
+# MAGIC In order to register a model in Unity Catalog, there are [several requirements](https://docs.databricks.com/machine-learning/manage-model-lifecycle/index.html#requirements), such as having Unity Catalog enabled in your workspace.
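+# MAGIC
+# MAGIC The registered name used below, `models.default.mixtral_8x7b_instruct`, follows the three-level Unity Catalog namespace `<catalog>.<schema>.<model_name>`: here the catalog is `models`, the schema is `default`, and the registered model is `mixtral_8x7b_instruct`. Adjust it to a catalog and schema you have write access to.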
+# MAGIC
+
+# COMMAND ----------
+
+# Configure the MLflow Python client to register models in Unity Catalog
+import mlflow
+
+mlflow.set_registry_uri("databricks-uc")
+
+# COMMAND ----------
+
+# Register the model to Unity Catalog
+# This may take about 2 minutes to complete
+
+registered_name = "models.default.mixtral_8x7b_instruct"  # Note that the UC model name follows the pattern `<catalog>.<schema>.<model_name>`, corresponding to the catalog, schema, and registered model name
+
+result = mlflow.register_model(
+    "runs:/" + run.info.run_id + "/model",
+    registered_name,
+)
+
+# COMMAND ----------
+
+from mlflow import MlflowClient
+
+client = MlflowClient()
+
+# Choose the right model version registered in the above cell.
+client.set_registered_model_alias(name=registered_name, alias="Champion", version=result.version)
+
+# COMMAND ----------
+
+# MAGIC %md
+# MAGIC ## Load the model from Unity Catalog
+
+# COMMAND ----------
+
+import mlflow
+
+registered_name = "models.default.mixtral_8x7b_instruct"
+loaded_model = mlflow.pyfunc.load_model(f"models:/{registered_name}@Champion")
+
+# Make a prediction using the loaded model
+loaded_model.predict(
+    {"prompt": "What is a large language model?"},
+    params={
+        "temperature": 0.5,
+        "max_new_tokens": 100,
+    }
+)
+
+# COMMAND ----------
+
+# MAGIC %md
+# MAGIC ## Create Model Serving Endpoint
+# MAGIC Once the model is registered, we can use the API to create a Databricks GPU Model Serving endpoint that serves the `mixtral-8x7b-instruct` model.
+# MAGIC
+# MAGIC Note that the deployment below requires GPU model serving. For more information on GPU model serving, see the [documentation](https://docs.databricks.com/en/machine-learning/model-serving/create-manage-serving-endpoints.html#gpu). The feature is in Public Preview.
+
+# COMMAND ----------
+
+# Provide a name for the serving endpoint
+endpoint_name = 'mixtral-8x7b-instruct'
+
+# COMMAND ----------
+
+# MAGIC %pip install -U databricks-sdk
+
+# COMMAND ----------
+
+from databricks.sdk import WorkspaceClient
+from databricks.sdk.service.serving import EndpointCoreConfigInput
+
+w = WorkspaceClient()
+
+model_version = result  # the returned result of mlflow.register_model
+
+# Specify the type of compute (CPU, GPU_SMALL, GPU_MEDIUM, etc.)
+# Choose `GPU_MEDIUM` on AWS and `GPU_LARGE` on Azure; the `_8` suffix requests 8 GPUs per replica,
+# matching the tensor_parallel_size=8 used by this model
+workload_type = "GPU_MEDIUM_8"
+
+config = EndpointCoreConfigInput.from_dict({
+    "served_models": [
+        {
+            "name": f'{model_version.name.replace(".", "_")}_{model_version.version}',
+            "model_name": model_version.name,
+            "model_version": model_version.version,
+            "workload_type": workload_type,
+            "workload_size": "Small",
+            "scale_to_zero_enabled": "False",
+        }
+    ]
+})
+w.serving_endpoints.create(name=endpoint_name, config=config)
+
+# COMMAND ----------
+
+# MAGIC %md
+# MAGIC Once the model serving endpoint is ready, you can query it.
+
+# COMMAND ----------
+
+
diff --git a/llm-models/mixtral-8x7b/README.md b/llm-models/mixtral-8x7b/README.md
new file mode 100644
index 0000000..7eca31c
--- /dev/null
+++ b/llm-models/mixtral-8x7b/README.md
@@ -0,0 +1,28 @@
+
+
+
+# Example notebooks for the Mixtral-8x7B models on Databricks
+
+[mistralai/Mixtral-8x7B-v0.1](https://huggingface.co/mistralai/Mixtral-8x7B-v0.1) is a pretrained generative Sparse Mixture of Experts (SMoE) model, and [mistralai/Mixtral-8x7B-Instruct-v0.1](https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1) is its instruction fine-tuned version.
+
+- It outperforms Llama 2 70B on most reported benchmarks.
+- It gracefully handles a context of 32k tokens.
+- It handles English, French, Italian, German and Spanish.
+- It shows strong performance in code generation.
+- It can be fine-tuned into an instruction-following model that achieves a score of 8.3 on MT-Bench.
+
+Mixtral 8x7B is a high-quality sparse mixture of experts (SMoE) model with open weights, licensed under Apache 2.0.
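+
+## Querying the serving endpoint
+
+The last section of `02_mlflow_logging_inference.py` creates a model serving endpoint named `mixtral-8x7b-instruct`. Once the endpoint is ready, it can be queried over REST. The snippet below is a minimal sketch, not part of the notebooks: it assumes the workspace URL and a personal access token are exported as the `DATABRICKS_HOST` and `DATABRICKS_TOKEN` environment variables, and that the payload matches the signature logged in the notebook (a `prompt` column plus optional params).
+
+```python
+import os
+import requests
+
+host = os.environ["DATABRICKS_HOST"]    # e.g. the workspace URL
+token = os.environ["DATABRICKS_TOKEN"]  # a personal access token
+
+payload = {
+    "dataframe_split": {"columns": ["prompt"], "data": [["What is a large language model?"]]},
+    "params": {"temperature": 0.5, "max_new_tokens": 100},
+}
+
+response = requests.post(
+    f"{host}/serving-endpoints/mixtral-8x7b-instruct/invocations",
+    headers={"Authorization": f"Bearer {token}"},
+    json=payload,
+)
+print(response.json())
+```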