Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add image embedding serving #229

Merged
merged 3 commits into from
Jul 31, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
56 changes: 56 additions & 0 deletions lib/bumblebee/vision.ex
Original file line number Diff line number Diff line change
Expand Up @@ -134,4 +134,60 @@ defmodule Bumblebee.Vision do
) :: Nx.Serving.t()
defdelegate image_to_text(model_info, featurizer, tokenizer, generation_config, opts \\ []),
to: Bumblebee.Vision.ImageToText

@type image_embedding_input :: image()
@type image_embedding_output :: %{embedding: Nx.Tensor.t()}
@doc """
Builds serving for image embeddings.

The serving accepts `t:image_embedding_input/0` and returns
`t:image_embedding_output/0`. A list of inputs is also supported.

## Options

  * `:output_attribute` - the attribute of the model output map to
    retrieve. When the output is a single tensor (rather than a map),
    this option is ignored. Defaults to `:pooled_state`

  * `:embedding_processor` - a post-processing step to apply to the
    embedding. Supported values: `:l2_norm`. By default the output is
    returned as is. An unsupported value raises an `ArgumentError`

  * `:compile` - compiles all computations for predefined input shapes
    during serving initialization. Should be a keyword list with the
    following keys:

      * `:batch_size` - the maximum batch size of the input. Inputs
        are optionally padded to always match this batch size

    It is advised to set this option in production and also configure
    a defn compiler using `:defn_options` to maximally reduce inference
    time.

  * `:defn_options` - the options for JIT compilation. Defaults to `[]`

## Examples

    {:ok, clip} =
      Bumblebee.load_model({:hf, "openai/clip-vit-base-patch32"},
        module: Bumblebee.Vision.ClipVision
      )

    {:ok, featurizer} = Bumblebee.load_featurizer({:hf, "openai/clip-vit-base-patch32"})

    serving = Bumblebee.Vision.image_embedding(clip, featurizer)

    image = StbImage.read_file!(path)
    Nx.Serving.run(serving, image)
    #=> %{
    #=>   embedding: #Nx.Tensor<
    #=>     f32[768]
    #=>     [-0.43403682112693787, 0.09786412119865417, -0.7233262062072754, -0.7707743644714355, 0.5550824403762817, -0.8923342227935791, 0.2687447965145111, 0.9633643627166748, 0.3520320951938629, 0.43195801973342896, 2.1438512802124023, -0.6542983651161194, -1.9736307859420776, 0.1611439287662506, 0.24555791914463043, 0.16985465586185455, 0.9012499451637268, 1.0657984018325806, 1.087411642074585, -0.5864712595939636, 0.3314521908760071, 0.8396108150482178, 0.3906593322753906, 0.13463366031646729, 0.2605385184288025, -0.07457947731018066, 0.4735124707221985, -0.41367805004119873, 0.18244807422161102, 1.4741417169570923, -5.807061195373535, 0.38920706510543823, 0.057687126100063324, 0.060301072895526886, 0.9680367708206177, 0.9670255184173584, 1.3876476287841797, -0.15498873591423035, -0.969764232635498, -0.38127464056015015, 0.05450016260147095, 2.2317700386047363, -0.07926210761070251, -0.11876475065946579, -1.5408644676208496, 0.7505669593811035, 0.9280041456222534, -0.3571934103965759, -1.1390857696533203, ...]
    #=>   >
    #=> }

"""
@spec image_embedding(
Bumblebee.model_info(),
Bumblebee.Featurizer.t(),
keyword()
) :: Nx.Serving.t()
defdelegate image_embedding(model_info, featurizer, opts \\ []),
to: Bumblebee.Vision.ImageEmbedding
end
91 changes: 91 additions & 0 deletions lib/bumblebee/vision/image_embedding.ex
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
defmodule Bumblebee.Vision.ImageEmbedding do
  @moduledoc false

  alias Bumblebee.Shared

  # Builds an `Nx.Serving` that converts one image (or a list of images)
  # into `%{embedding: tensor}` results.
  #
  # Arguments:
  #
  #   * `model_info` - a map with at least `:model`, `:params` and `:spec`
  #   * `featurizer` - passed to `Bumblebee.apply_featurizer/2` to turn
  #     the validated images into model inputs
  #   * `opts` - keyword list, validated with `Keyword.validate!/2`:
  #       * `:compile` - keyword list with a `:batch_size` key; when given,
  #         the computation is compiled for a "pixel_values" template of
  #         that batch size
  #       * `:output_attribute` - key read from the model output when the
  #         output is a plain map (default `:pooled_state`)
  #       * `:embedding_processor` - `nil` (default) or `:l2_norm`
  #       * `:defn_options` - forwarded to `Nx.Serving.new/2` (default `[]`)
  #
  # Raises `ArgumentError` on unknown options, on an unsupported
  # `:embedding_processor`, and when `:output_attribute` is not a key of
  # the model output map.
  def image_embedding(model_info, featurizer, opts \\ []) do
    %{model: model, params: params, spec: spec} = model_info

    opts =
      Keyword.validate!(opts, [
        :compile,
        output_attribute: :pooled_state,
        embedding_processor: nil,
        defn_options: []
      ])

    output_attribute = opts[:output_attribute]
    embedding_processor = opts[:embedding_processor]
    defn_options = opts[:defn_options]

    compile =
      if compile = opts[:compile] do
        compile
        |> Keyword.validate!([:batch_size])
        |> Shared.require_options!([:batch_size])
      end

    # `compile` is nil when the option was not given; nil[:batch_size] is nil.
    batch_size = compile[:batch_size]

    {_init_fun, encoder} = Axon.build(model)

    embedding_fun = fn params, inputs ->
      params
      |> encoder.(inputs)
      |> select_output(output_attribute)
      |> process_embedding(embedding_processor)
    end

    Nx.Serving.new(
      fn defn_options ->
        embedding_fun =
          Shared.compile_or_jit(embedding_fun, defn_options, compile != nil, fn ->
            inputs = %{
              "pixel_values" => Shared.input_template(spec, "pixel_values", [batch_size])
            }

            [params, inputs]
          end)

        fn inputs ->
          # NOTE(review): presumably pads the batch up to `batch_size` so it
          # matches the compiled shape - confirm against Shared.maybe_pad/2.
          inputs = Shared.maybe_pad(inputs, batch_size)
          embedding_fun.(params, inputs)
        end
      end,
      defn_options
    )
    |> Nx.Serving.process_options(batch_size: batch_size)
    |> Nx.Serving.client_preprocessing(fn input ->
      {images, multi?} = Shared.validate_serving_input!(input, &Shared.validate_image/1)

      inputs = Bumblebee.apply_featurizer(featurizer, images)

      # `multi?` is carried through as client info for postprocessing.
      {Nx.Batch.concatenate([inputs]), multi?}
    end)
    |> Nx.Serving.client_postprocessing(fn {embeddings, _metadata}, multi? ->
      for embedding <- Bumblebee.Utils.Nx.batch_to_list(embeddings) do
        %{embedding: embedding}
      end
      |> Shared.normalize_output(multi?)
    end)
  end

  # Picks the embedding out of the model output.
  #
  # Structs are maps, so a bare `%Nx.Tensor{}` must be excluded explicitly;
  # otherwise it would be indexed with `output_attribute` instead of being
  # returned unchanged.
  defp select_output(output, attribute) when is_map(output) and not is_struct(output) do
    case output do
      %{^attribute => value} ->
        value

      %{} ->
        keys = output |> Map.keys() |> Enum.sort()

        raise ArgumentError,
              "key #{inspect(attribute)} not found in the model output map, " <>
                "you may want to set :output_attribute to one of: #{inspect(keys)}"
    end
  end

  defp select_output(output, _attribute), do: output

  # Applies the optional post-processing step to the embedding.
  defp process_embedding(output, nil), do: output
  defp process_embedding(output, :l2_norm), do: Bumblebee.Utils.Nx.normalize(output)

  defp process_embedding(_output, other) do
    raise ArgumentError,
          "expected :embedding_processor to be one of nil or :l2_norm, got: #{inspect(other)}"
  end
end
56 changes: 56 additions & 0 deletions test/bumblebee/vision/image_embedding_test.exs
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
defmodule Bumblebee.Vision.ImageEmbeddingTest do
  use ExUnit.Case, async: false

  import Bumblebee.TestHelpers

  @moduletag model_test_tags()
  @images_dir Path.expand("../../fixtures/images", __DIR__)
  @repository {:hf, "openai/clip-vit-base-patch32"}

  describe "integration" do
    test "returns CLIP Vision embedding (without projection head) for an image" do
      embedding = embed_fixture_image([])

      assert Nx.shape(embedding) == {768}

      expected = Nx.tensor([0.0978, -0.7233, -0.7707])
      assert_all_close(embedding[1..3], expected, atol: 1.0e-4)
    end

    test "returns normalized CLIP Vision embedding (without projection head) for an image" do
      embedding = embed_fixture_image(embedding_processor: :l2_norm)

      assert Nx.shape(embedding) == {768}

      expected = Nx.tensor([0.0036, -0.0269, -0.0286])
      assert_all_close(embedding[1..3], expected, atol: 1.0e-4)
    end
  end

  # Loads the CLIP vision model and featurizer, builds the serving with
  # `opts`, runs it on the COCO fixture image and returns the embedding
  # tensor. Fails the calling test if the result has an unexpected shape.
  defp embed_fixture_image(opts) do
    {:ok, model_info} =
      Bumblebee.load_model(@repository, module: Bumblebee.Vision.ClipVision)

    {:ok, featurizer} = Bumblebee.load_featurizer(@repository)

    serving = Bumblebee.Vision.ImageEmbedding.image_embedding(model_info, featurizer, opts)
    image = @images_dir |> Path.join("coco/39769.jpeg") |> StbImage.read_file!()

    assert %{embedding: %Nx.Tensor{} = embedding} = Nx.Serving.run(serving, image)

    embedding
  end
end
Loading