diff --git a/CHANGELOG.md b/CHANGELOG.md
index 5e01aa8a..669931c6 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,6 +9,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 This release changes the directory structure of the models cache, such that cached files from the same HuggingFace Hub repository are grouped in a separate subdirectory. This change is meant to simplify the process of manually removing specific models from the cache to free up space. As a result, the cache contents from prior versions are invalidated, so you most likely want to remove the current cache contents. To find the cache location run `elixir -e 'Mix.install([{:bumblebee, "0.4.2"}]); IO.puts(Bumblebee.cache_dir())'` (defaults to the standard cache location for the given operating system).
 
+We also reduced memory usage during parameter loading (both when loading onto the CPU and when loading directly onto the GPU). Previously, larger models sometimes required loading parameters onto the CPU and only then transferring them to the GPU, in order to avoid running out of GPU memory during parameter transformations. With this release, this should no longer be the case. Loading parameters now has barely any memory footprint beyond the parameters themselves.
+
 ### Added
 
 * Notebook on LLaMA 2 to the docs ([#259](https://github.com/elixir-nx/bumblebee/pull/259))
diff --git a/examples/phoenix/README.md b/examples/phoenix/README.md
index f280db76..636205f1 100644
--- a/examples/phoenix/README.md
+++ b/examples/phoenix/README.md
@@ -67,14 +67,15 @@ config :nx, :default_backend, {EXLA.Backend, client: :host}
 
 Then, for any expensive computations you can use [`Nx.Defn.compile/3`](https://hexdocs.pm/nx/Nx.Defn.html#compile/3) (or [`Axon.compile/4`](https://hexdocs.pm/axon/Axon.html#compile/4)) passing `compiler: EXLA` as an option. When you use a Bumblebee serving the compilation is handled for you, just make sure to pass `:compile` and `defn_options: [compiler: EXLA]` when creating the serving.
 
-There's a final important detail related to parameters. With the above configuration, a model will run on the GPU, however parameters will be loaded onto the CPU (due to the default backend), so they will need to be copied onto the GPU every time. To avoid that, you want to make sure that parameters are allocated on the same device that the computation runs on. The simplest way to achieve that is passing `preallocate_params: true` to when creating the serving, in addition to `:defn_options`.
+There's a final important detail related to parameters. With the above configuration, a model will run on the GPU; however, the parameters will be loaded onto the CPU (due to the default backend), so they would need to be copied onto the GPU on every computation. To avoid that, you can load the parameters onto the GPU directly using `Bumblebee.load_model(..., backend: EXLA.Backend)`.
+
+When building the Bumblebee serving, make sure to specify the compiler and `:compile` shapes, so that the computation is compiled upfront when the serving boots.
 
 ```elixir
 serving =
-  Bumblebee.Text.TextEmbedding.text_embedding(model_info, tokenizer,
+  Bumblebee.Text.text_embedding(model_info, tokenizer,
     compile: [batch_size: 1, sequence_length: 512],
-    defn_options: [compiler: EXLA],
-    preallocate_params: true
+    defn_options: [compiler: EXLA]
   )
 ```
diff --git a/lib/bumblebee.ex b/lib/bumblebee.ex
index 1f0ea9b4..5445fe7f 100644
--- a/lib/bumblebee.ex
+++ b/lib/bumblebee.ex
@@ -704,7 +704,7 @@ defmodule Bumblebee do
     end
   end
 
-  defp params_file_loader_fun(".safetensors"), do: &Safetensors.read!/1
+  defp params_file_loader_fun(".safetensors"), do: &Safetensors.read!(&1, lazy: true)
   defp params_file_loader_fun(_), do: &Bumblebee.Conversion.PyTorch.Loader.load!/1
 
   @doc """
diff --git a/lib/bumblebee/conversion/pytorch.ex b/lib/bumblebee/conversion/pytorch.ex
index 60b49213..7a0bd492 100644
--- a/lib/bumblebee/conversion/pytorch.ex
+++ b/lib/bumblebee/conversion/pytorch.ex
@@ -81,7 +81,9 @@ defmodule Bumblebee.Conversion.PyTorch do
   defp with_default_backend(backend, fun), do: Nx.with_default_backend(backend, fun)
 
   defp state_dict?(%{} = dict) when not is_struct(dict) do
-    Enum.all?(dict, fn {key, value} -> is_binary(key) and is_struct(value, Nx.Tensor) end)
+    Enum.all?(dict, fn {key, value} ->
+      is_binary(key) and Nx.LazyContainer.impl_for(value) != nil
+    end)
   end
 
   defp state_dict?(_other), do: false
@@ -141,6 +143,7 @@ defmodule Bumblebee.Conversion.PyTorch do
 
       {value, diff} =
         if all_sources_found? do
+          source_values = Enum.map(source_values, &Nx.to_tensor/1)
           value = builder_fun.(Enum.reverse(source_values))
 
           case verify_param_shape(param_expr, value) do
diff --git a/lib/bumblebee/conversion/pytorch/file_tensor.ex b/lib/bumblebee/conversion/pytorch/file_tensor.ex
new file mode 100644
index 00000000..30f8a8d1
--- /dev/null
+++ b/lib/bumblebee/conversion/pytorch/file_tensor.ex
@@ -0,0 +1,76 @@
+defmodule Bumblebee.Conversion.PyTorch.FileTensor do
+  @moduledoc false
+
+  defstruct [:shape, :type, :offset, :strides, :storage]
+end
+
+defimpl Nx.LazyContainer, for: Bumblebee.Conversion.PyTorch.FileTensor do
+  alias Bumblebee.Conversion.PyTorch.Loader
+
+  def traverse(lazy_tensor, acc, fun) do
+    template = Nx.template(lazy_tensor.shape, lazy_tensor.type)
+
+    load = fn ->
+      binary =
+        case lazy_tensor.storage do
+          {:zip, path, file_name} ->
+            Loader.open_zip!(path, fn unzip ->
+              Loader.read_zip_file(unzip, file_name)
+            end)
+
+          {:file, path, offset, size} ->
+            File.open!(path, [:read, :raw], fn file ->
+              {:ok, binary} = :file.pread(file, offset, size)
+              binary
+            end)
+        end
+
+      %{offset: offset, shape: shape, type: type, strides: strides} = lazy_tensor
+
+      {_, bit_unit} = type
+      byte_unit = div(bit_unit, 8)
+      size = Tuple.product(shape)
+      binary = binary_part(binary, offset * byte_unit, size * byte_unit)
+      binary |> Nx.from_binary(type) |> to_contiguous(shape, strides)
+    end
+
+    fun.(template, load, acc)
+  end
+
+  defp to_contiguous(tensor, shape, strides) do
+    # PyTorch tensors may not be contiguous in memory, so strides are
+    # used to indicate jumps necessary when traversing each axis.
+    # Since Nx doesn't have the notion of strides, we transpose the
+    # tensor, in a way that makes it contiguous, which is equivalent
+    # to strides being decreasing
+
+    memory_axes_order =
+      strides
+      |> Tuple.to_list()
+      |> Enum.with_index()
+      |> Enum.sort_by(&elem(&1, 0), :desc)
+      |> Enum.map(&elem(&1, 1))
+
+    if memory_axes_order == Nx.axes(shape) do
+      Nx.reshape(tensor, shape)
+    else
+      memory_shape =
+        memory_axes_order
+        |> Enum.map(fn axis -> elem(shape, axis) end)
+        |> List.to_tuple()
+
+      tensor
+      |> Nx.reshape(memory_shape)
+      |> Nx.transpose(axes: inverse_permutation(memory_axes_order))
+    end
+  end
+
+  defp inverse_permutation(list) do
+    list
+    |> Enum.with_index()
+    |> Enum.reduce(List.to_tuple(list), fn {src_idx, dest_idx}, inverse ->
+      put_elem(inverse, src_idx, dest_idx)
+    end)
+    |> Tuple.to_list()
+  end
+end
diff --git a/lib/bumblebee/conversion/pytorch/loader.ex b/lib/bumblebee/conversion/pytorch/loader.ex
index e9645399..6dc08c2e 100644
--- a/lib/bumblebee/conversion/pytorch/loader.ex
+++ b/lib/bumblebee/conversion/pytorch/loader.ex
@@ -26,47 +26,52 @@ defmodule Bumblebee.Conversion.PyTorch.Loader do
   end
 
   defp load_zip!(path) do
-    zip_file = Unzip.LocalFile.open(path)
-
-    try do
-      {:ok, unzip} = Unzip.new(zip_file)
-
-      contents =
+    open_zip!(path, fn unzip ->
+      file_name_map =
         unzip
         |> Unzip.list_entries()
         |> Map.new(fn %Unzip.Entry{file_name: file_name} ->
-          content =
-            unzip
-            |> Unzip.file_stream!(file_name)
-            |> Enum.to_list()
-            |> IO.iodata_to_binary()
-
           # Strip the root dir from the file name
-          name =
-            file_name
-            |> Path.split()
-            |> Enum.drop(1)
-            |> Enum.join("/")
-
-          {name, content}
+          name = file_name |> Path.split() |> Enum.drop(1) |> Enum.join("/")
+          {name, file_name}
         end)
 
+      binary = read_zip_file(unzip, Map.fetch!(file_name_map, "data.pkl"))
+
       {term, ""} =
-        Unpickler.load!(Map.fetch!(contents, "data.pkl"),
+        Unpickler.load!(binary,
           object_resolver: &object_resolver/1,
           persistent_id_resolver: fn
             {"storage", storage_type, key, _location, _size} ->
-              binary = Map.fetch!(contents, "data/#{key}")
-              {:storage, storage_type, binary}
+              file_name = Map.fetch!(file_name_map, "data/#{key}")
+              {:storage, storage_type, {:zip, path, file_name}}
           end
         )
 
       term
+    end)
+  end
+
+  @doc false
+  def open_zip!(path, fun) do
+    zip_file = Unzip.LocalFile.open(path)
+
+    try do
+      {:ok, unzip} = Unzip.new(zip_file)
+      fun.(unzip)
     after
       Unzip.LocalFile.close(zip_file)
     end
   end
 
+  @doc false
+  def read_zip_file(unzip, file_name) do
+    unzip
+    |> Unzip.file_stream!(file_name)
+    |> Enum.to_list()
+    |> IO.iodata_to_binary()
+  end
+
   defp object_resolver(%{constructor: "collections.OrderedDict", set_items: items}) do
     {:ok, Map.new(items)}
   end
@@ -76,13 +81,18 @@ defmodule Bumblebee.Conversion.PyTorch.Loader do
          constructor: "torch._utils._rebuild_tensor_v2",
          args: [storage, offset, shape, strides, _requires_grad, _backward_hooks]
        }) do
-    {:storage, storage_type, binary} = storage
-    {_, bit_unit} = type = storage_type_to_nx(storage_type)
-    byte_unit = div(bit_unit, 8)
-    size = Tuple.product(shape)
-    binary = binary_part(binary, offset * byte_unit, size * byte_unit)
-    tensor = binary |> Nx.from_binary(type) |> to_contiguous(shape, strides)
-    {:ok, tensor}
+    {:storage, storage_type, storage} = storage
+    type = storage_type_to_nx(storage_type)
+
+    lazy_tensor = %Bumblebee.Conversion.PyTorch.FileTensor{
+      shape: shape,
+      type: type,
+      offset: offset,
+      strides: strides,
+      storage: storage
+    }
+
+    {:ok, lazy_tensor}
   end
 
   # See https://github.com/pytorch/pytorch/blob/v1.12.1/torch/_tensor.py#L222-L226
@@ -173,53 +183,17 @@ defmodule Bumblebee.Conversion.PyTorch.Loader do
     end
   end
 
-  defp to_contiguous(tensor, shape, strides) do
-    # PyTorch tensors may not be contiguous in memory, so strides are
-    # used to indicate jumps necessary when traversing each axis.
-    # Since Nx doesn't have the notion of strides, we transpose the
-    # tensor, in a way that makes it contiguous, which is equivalent
-    # to strides being decreasing
-
-    memory_axes_order =
-      strides
-      |> Tuple.to_list()
-      |> Enum.with_index()
-      |> Enum.sort_by(&elem(&1, 0), :desc)
-      |> Enum.map(&elem(&1, 1))
-
-    if memory_axes_order == Nx.axes(shape) do
-      Nx.reshape(tensor, shape)
-    else
-      memory_shape =
-        memory_axes_order
-        |> Enum.map(fn axis -> elem(shape, axis) end)
-        |> List.to_tuple()
-
-      tensor
-      |> Nx.reshape(memory_shape)
-      |> Nx.transpose(axes: inverse_permutation(memory_axes_order))
-    end
-  end
-
-  defp inverse_permutation(list) do
-    list
-    |> Enum.with_index()
-    |> Enum.reduce(List.to_tuple(list), fn {src_idx, dest_idx}, inverse ->
-      put_elem(inverse, src_idx, dest_idx)
-    end)
-    |> Tuple.to_list()
-  end
-
   @legacy_magic_number 119_547_037_146_038_801_333_356
 
   defp load_legacy!(path) do
     data = File.read!(path)
+    full_size = byte_size(data)
 
     {@legacy_magic_number, data} = Unpickler.load!(data)
     {_protocol_version, data} = Unpickler.load!(data)
     {_system_info, data} = Unpickler.load!(data)
 
-    binaries = storage_binaries(data)
+    binaries = storage_binaries(data, full_size)
 
     {term, _} =
       Unpickler.load!(data,
@@ -229,16 +203,18 @@ defmodule Bumblebee.Conversion.PyTorch.Loader do
             {_, bit_unit} = storage_type_to_nx(storage_type)
             byte_unit = div(bit_unit, 8)
 
-            binary =
+            {file_offset, size} = Map.fetch!(binaries, root_key)
+
+            storage =
               case view_metadata do
                 nil ->
-                  binaries[root_key]
+                  {:file, path, file_offset, size}
 
                 {_view_key, offset, view_size} ->
-                  binary_part(binaries[root_key], offset * byte_unit, view_size * byte_unit)
+                  {:file, path, file_offset + offset * byte_unit, view_size * byte_unit}
               end
 
-            {:storage, storage_type, binary}
+            {:storage, storage_type, storage}
 
           {"module", module, _source_file, _source} ->
             module
@@ -248,7 +224,7 @@ defmodule Bumblebee.Conversion.PyTorch.Loader do
     term
   end
 
-  defp storage_binaries(data) do
+  defp storage_binaries(data, full_size) do
     # We first do a dry run on the pickle and extract storage metadata,
     # then we use that metadata to read the storage binaries that follow
@@ -269,16 +245,20 @@ defmodule Bumblebee.Conversion.PyTorch.Loader do
 
     {storage_keys, data} = Unpickler.load!(data)
 
-    {pairs, ""} =
-      Enum.map_reduce(storage_keys, data, fn key, data ->
+    offset = full_size - byte_size(data)
+
+    {pairs, _offset} =
+      Enum.map_reduce(storage_keys, offset, fn key, offset ->
         {size, byte_unit} = Map.fetch!(storage_infos, key)
         bytes = size * byte_unit
-        # Each storage binary is prefixed with the number of elements.
+        # Each storage binary is prefixed with the number of elements,
+        # stored as integer-little-signed-size(64), hence the 8 bytes.
         # See https://github.com/pytorch/pytorch/blob/v1.11.0/torch/csrc/generic/serialization.cpp#L93-L134
-        <<^size::integer-little-signed-size(64), chunk::binary-size(bytes), data::binary>> = data
+        start_offset = offset + 8
+        offset = start_offset + bytes
 
-        {{key, chunk}, data}
+        {{key, {start_offset, bytes}}, offset}
       end)
 
     Map.new(pairs)
   end
diff --git a/mix.exs b/mix.exs
index 5bf8c197..95a0521e 100644
--- a/mix.exs
+++ b/mix.exs
@@ -41,7 +41,7 @@ defmodule Bumblebee.MixProject do
       {:torchx, github: "elixir-nx/nx", sparse: "torchx", override: true, only: [:dev, :test]},
       {:nx_image, "~> 0.1.0"},
       {:unpickler, "~> 0.1.0"},
-      {:safetensors, "~> 0.1.2"},
+      {:safetensors, "~> 0.1.3"},
       {:castore, "~> 0.1 or ~> 1.0"},
       {:jason, "~> 1.4.0"},
       {:unzip, "~> 0.10.0"},
diff --git a/mix.lock b/mix.lock
index 844dba7c..eb583879 100644
--- a/mix.lock
+++ b/mix.lock
@@ -29,7 +29,7 @@
   "progress_bar": {:hex, :progress_bar, "3.0.0", "f54ff038c2ac540cfbb4c2bfe97c75e7116ead044f3c2b10c9f212452194b5cd", [:mix], [{:decimal, "~> 2.0", [hex: :decimal, repo: "hexpm", optional: false]}], "hexpm", "6981c2b25ab24aecc91a2dc46623658e1399c21a2ae24db986b90d678530f2b7"},
   "ranch": {:hex, :ranch, "1.8.0", "8c7a100a139fd57f17327b6413e4167ac559fbc04ca7448e9be9057311597a1d", [:make, :rebar3], [], "hexpm", "49fbcfd3682fab1f5d109351b61257676da1a2fdbe295904176d5e521a2ddfe5"},
   "rustler_precompiled": {:hex, :rustler_precompiled, "0.6.2", "d2218ba08a43fa331957f30481d00b666664d7e3861431b02bd3f4f30eec8e5b", [:mix], [{:castore, "~> 0.1 or ~> 1.0", [hex: :castore, repo: "hexpm", optional: false]}, {:rustler, "~> 0.23", [hex: :rustler, repo: "hexpm", optional: true]}], "hexpm", "b9048eaed8d7d14a53f758c91865cc616608a438d2595f621f6a4b32a5511709"},
-  "safetensors": {:hex, :safetensors, "0.1.2", "849434fea20b2ed14b92e74205a925d86039c4ef53efe861e5c7b574c3ba8fa6", [:mix], [{:jason, "~> 1.4", [hex: :jason, repo: "hexpm", optional: false]}, {:nx, "~> 0.5", [hex: :nx, repo: "hexpm", optional: false]}], "hexpm", "298a5c82e34fc3b955464b89c080aa9a2625a47d69148d51113771e19166d4e0"},
+  "safetensors": {:hex, :safetensors, "0.1.3", "7ff3c22391e213289c713898481d492c9c28a49ab1d0705b72630fb8360426b2", [:mix], [{:jason, "~> 1.4", [hex: :jason, repo: "hexpm", optional: false]}, {:nx, "~> 0.5", [hex: :nx, repo: "hexpm", optional: false]}], "hexpm", "fe50b53ea59fde4e723dd1a2e31cfdc6013e69343afac84c6be86d6d7c562c14"},
   "stb_image": {:hex, :stb_image, "0.6.2", "d680a418416b1d778231d1d16151be3474d187e8505e1bd524aa0d08d2de094f", [:make, :mix], [{:cc_precompiler, "~> 0.1.0", [hex: :cc_precompiler, repo: "hexpm", optional: false]}, {:elixir_make, "~> 0.7.0", [hex: :elixir_make, repo: "hexpm", optional: false]}, {:kino, "~> 0.7", [hex: :kino, repo: "hexpm", optional: true]}, {:nx, "~> 0.4", [hex: :nx, repo: "hexpm", optional: true]}], "hexpm", "231ad012f649dd2bd5ef99e9171e814f3235e8f7c45009355789ac4836044a39"},
   "telemetry": {:hex, :telemetry, "1.2.1", "68fdfe8d8f05a8428483a97d7aab2f268aaff24b49e0f599faa091f1d4e7f61c", [:rebar3], [], "hexpm", "dad9ce9d8effc621708f99eac538ef1cbe05d6a874dd741de2e689c47feafed5"},
   "tokenizers": {:hex, :tokenizers, "0.4.0", "140283ca74a971391ddbd83cd8cbdb9bd03736f37a1b6989b82d245a95e1eb97", [:mix], [{:castore, "~> 0.1 or ~> 1.0", [hex: :castore, repo: "hexpm", optional: false]}, {:rustler, ">= 0.0.0", [hex: :rustler, repo: "hexpm", optional: true]}, {:rustler_precompiled, "~> 0.6", [hex: :rustler_precompiled, repo: "hexpm", optional: false]}], "hexpm", "ef1a9824f5a893cd3b831c0e5b3d72caa250d2ec462035cc6afef6933b13a82e"},
diff --git a/notebooks/llama.livemd b/notebooks/llama.livemd
index b59a8ec1..03e0dcf9 100644
--- a/notebooks/llama.livemd
+++ b/notebooks/llama.livemd
@@ -32,6 +32,9 @@ Let's load the model and create a serving for text generation:
 hf_token = System.fetch_env!("LB_HF_TOKEN")
 repo = {:hf, "meta-llama/Llama-2-7b-chat-hf", auth_token: hf_token}
 
+# Option 1
+# {:ok, model_info} = Bumblebee.load_model(repo, type: :bf16, backend: EXLA.Backend)
+# Option 2 and 3
 {:ok, model_info} = Bumblebee.load_model(repo, type: :bf16)
 {:ok, tokenizer} = Bumblebee.load_tokenizer(repo)
 {:ok, generation_config} = Bumblebee.load_generation_config(repo)
@@ -50,8 +53,7 @@ serving =
   Bumblebee.Text.generation(model_info, tokenizer, generation_config,
     compile: [batch_size: 1, sequence_length: 1028],
     stream: true,
-    # Option 1
-    # preallocate_params: true,
+    # Option 1 and 2
     # defn_options: [compiler: EXLA]
     # Option 3
     defn_options: [compiler: EXLA, lazy_transfers: :always]
@@ -63,13 +65,11 @@ Kino.start_child({Nx.Serving, name: Llama, serving: serving})
 
 We adjust the generation config to use a non-deterministic generation strategy. The most interesting part, though, is the combination of serving options.
 
-First, note that in the Setup cell we set the default backend to `{EXLA.Backend, client: :host}`, which means that initially we load the parameters onto CPU. This is important, because as the parameters are loaded Bumblebee may need to apply certain operations to them and we don't want to bother the GPU at that point, risking an out-of-memory error.
+First, note that in the Setup cell we set the default backend to `{EXLA.Backend, client: :host}`, which means that by default we load the parameters onto the CPU. There are a couple of combinations of options related to parameters, trading off memory usage for speed:
 
-With that, there are a couple combinations of options related to parameters, trading off memory usage for speed:
+1. `Bumblebee.load_model(..., backend: EXLA.Backend)`, `defn_options: [compiler: EXLA]` - load all parameters directly onto the GPU. This requires the most memory, but it should provide the fastest inference time. In case you are using multiple GPUs (and a partitioned serving), you still want to load the parameters onto the CPU first and instead use `preallocate_params: true`, so that the parameters are copied onto each of the GPUs.
 
-1. `defn_options: [compiler: EXLA]`, `preallocate_params: true` - move and keep all parameters to the GPU upfront. This requires the most memory, but should provide the fastest inference time.
-
-2. `defn_options: [compiler: EXLA]` - copy all parameters to the GPU before each computation and discard afterwards. This requires less memory, but the copying increases the inference time.
+2. `defn_options: [compiler: EXLA]` - copy all parameters to the GPU before each computation and discard them afterwards (or, more precisely, as soon as they are no longer needed in the computation). This requires less memory, but the copying increases the inference time.
 
 3. `defn_options: [compiler: EXLA, lazy_transfers: :always]` - lazily copy parameters to the GPU during the computation as needed. This requires the least memory, at the cost of inference time.
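For reference, here is how the pieces of Option 1 from the notebook hunk above fit together end to end. This is a minimal illustrative sketch, not part of the diff; it assumes the `repo` tuple defined earlier in the notebook and an EXLA build targeting the GPU:

```elixir
# Option 1: load the parameters straight onto the GPU and keep them there
{:ok, model_info} = Bumblebee.load_model(repo, type: :bf16, backend: EXLA.Backend)
{:ok, tokenizer} = Bumblebee.load_tokenizer(repo)
{:ok, generation_config} = Bumblebee.load_generation_config(repo)

serving =
  Bumblebee.Text.generation(model_info, tokenizer, generation_config,
    # Fixed shapes, so the computation is compiled once when the serving boots
    compile: [batch_size: 1, sequence_length: 1028],
    stream: true,
    defn_options: [compiler: EXLA]
  )
```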
diff --git a/test/bumblebee/conversion/pytorch/loader_test.exs b/test/bumblebee/conversion/pytorch/loader_test.exs
index 4a1c9b82..1f63352e 100644
--- a/test/bumblebee/conversion/pytorch/loader_test.exs
+++ b/test/bumblebee/conversion/pytorch/loader_test.exs
@@ -17,7 +17,7 @@ defmodule Bumblebee.Conversion.PyTorch.LoaderTest do
     test "tensors" do
       path = Path.join(@dir, "tensors.#{@format}.pt")
 
-      assert Loader.load!(path) == [
+      assert path |> Loader.load!() |> Enum.map(&Nx.to_tensor/1) == [
               Nx.tensor([-1.0, 1.0], type: :f64),
               Nx.tensor([-1.0, 1.0], type: :f32),
               Nx.tensor([-1.0, 1.0], type: :f16),
@@ -63,7 +63,7 @@ defmodule Bumblebee.Conversion.PyTorch.LoaderTest do
     test "noncontiguous tensor" do
       path = Path.join(@dir, "noncontiguous_tensor.#{@format}.pt")
 
-      assert Loader.load!(path) ==
+      assert path |> Loader.load!() |> Nx.to_tensor() ==
               Nx.tensor([[[1, 2, 3], [4, 5, 6]], [[1, 2, 3], [4, 5, 6]]], type: :s64)
     end
 
@@ -81,9 +81,16 @@ defmodule Bumblebee.Conversion.PyTorch.LoaderTest do
     # this test is based on https://github.com/pytorch/pytorch/blob/v1.11.0/test/test_serialization.py#L554-L575
     path = Path.join(@dir, "storage_view.legacy.pt")
 
-    assert Loader.load!(path) ==
-             {{:storage, %Unpickler.Global{scope: "torch", name: "FloatStorage"}, <<0, 0, 0, 0>>},
-              {:storage, %Unpickler.Global{scope: "torch", name: "FloatStorage"}, <<0, 0, 0, 0>>}}
+    assert {
+             {:storage, %Unpickler.Global{scope: "torch", name: "FloatStorage"}, storage1},
+             {:storage, %Unpickler.Global{scope: "torch", name: "FloatStorage"}, storage2}
+           } = Loader.load!(path)
+
+    assert {:file, path, offset, size} = storage1
+    assert path |> File.read!() |> binary_part(offset, size) == <<0, 0, 0, 0>>
+
+    assert {:file, path, offset, size} = storage2
+    assert path |> File.read!() |> binary_part(offset, size) == <<0, 0, 0, 0>>
   end
 
   test "raises if the files does not exist" do
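The updated tests also show the new contract of `Loader.load!/1`: instead of materialized tensors it now returns lazy values, either `Bumblebee.Conversion.PyTorch.FileTensor` structs implementing `Nx.LazyContainer` or, for legacy storages, `{:storage, type, {:file, path, offset, size}}` tuples pointing into the file. A minimal sketch of forcing such values, mirroring the assertions above; the checkpoint path is a placeholder:

```elixir
# Loading no longer reads tensor data eagerly; each entry only records
# where its storage lives (a zip entry or a file offset/size).
tensors = Bumblebee.Conversion.PyTorch.Loader.load!("tensors.zip.pt")

# Nx.to_tensor/1 forces a lazy container: only at this point is the binary
# slice read from the file and, if necessary, transposed to be contiguous.
materialized = Enum.map(tensors, &Nx.to_tensor/1)
```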