diff --git a/requirements.txt b/requirements.txt index 19a30dc..f291522 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,7 +10,7 @@ torch==2.3.1 piexif yolov5==7.0.13 torchvision==0.18.1 -transformers +transformers[torch] timm pandas>=1.2.4 ultralytics diff --git a/sdcat/cluster/cluster.py b/sdcat/cluster/cluster.py index daaca84..290f673 100644 --- a/sdcat/cluster/cluster.py +++ b/sdcat/cluster/cluster.py @@ -74,7 +74,7 @@ def _run_hdbscan_assign( if not numerical.empty: numerical = numerical.fillna(0) - # Normalize the numerical data from 0 to 1 + # Normalize the numerical data from 0 to 1 and add it to the dataframe numerical = (numerical - numerical.min()) / (numerical.max() - numerical.min()) df = pd.merge(df, numerical, left_index=True, right_index=True, how='left') @@ -107,30 +107,15 @@ def _run_hdbscan_assign( labels = scan.fit_predict(x) else: scan = HDBSCAN( - metric='l2', - allow_single_cluster=True, - min_cluster_size=min_cluster_size, - min_samples=min_samples, - alpha=alpha, - cluster_selection_epsilon=cluster_selection_epsilon, - cluster_selection_method='leaf') + metric='l2', + allow_single_cluster=True, + min_cluster_size=min_cluster_size, + min_samples=min_samples, + alpha=alpha, + cluster_selection_epsilon=cluster_selection_epsilon, + cluster_selection_method='leaf') labels = scan.fit_predict(x) -# title_tree = f'HDBSCAN Tree Distances {cluster_selection_epsilon} min_cluster_size {min_cluster_size} min_samples {min_samples} alpha {alpha}' -# title_linkage = title_tree.replace('Tree Distances', 'Linkage') - -# scan.condensed_tree_.plot(select_clusters=True, -# selection_palette=sns.color_palette('deep', 8)) -# plt.title(title_tree) -# plt.xlabel('Index') -# plt.savefig(f"{out_path}/{prefix}_condensed_tree.png") - -# plt.figure(figsize=(10, 6)) -# scan.single_linkage_tree_.plot(cmap='viridis', colorbar=True) -# plt.title(title_linkage) -# plt.xlabel('Index') -# plt.savefig(f"{out_path}/{prefix}_tree.png") - # Get the unique clusters and sort them; -1 are unassigned clusters cluster_df = pd.DataFrame(labels, columns=['cluster']) unique_clusters = cluster_df['cluster'].unique().tolist() @@ -149,7 +134,7 @@ def _run_hdbscan_assign( if len(unique_clusters) == 1 and unique_clusters[0] == -1: avg_sim_scores = [] exemplar_df = pd.DataFrame() - exemplar_df['cluster'] = len(x)*['Unknown'] + exemplar_df['cluster'] = len(x) * ['Unknown'] exemplar_df['embedding'] = x.tolist() exemplar_df['image_path'] = ancillary_df['image_path'].tolist() clusters = [] @@ -191,6 +176,9 @@ def _run_hdbscan_assign( avg_sim_scores = [] for i, c in enumerate(clusters): debug(f'Computing similarity for cluster {i} with {len(c)} samples') + if len(c) == 0: + avg_sim_scores.append(0) + continue cosine_sim_matrix = cosine_similarity(image_emb[c]) avg_sim_scores.append(np.mean(cosine_sim_matrix)) @@ -223,7 +211,7 @@ def _run_hdbscan_assign( else: init = 'spectral' - # Reduce the dimensionality of the embeddings using UMAP to 2 dimensions for visualization + # Reduce the dimensionality of the embeddings using UMAP to 2 dimensions to visualize the clusters if have_gpu: xx = cuUMAP(init=init, n_components=2, @@ -233,8 +221,6 @@ def _run_hdbscan_assign( else: xx = UMAP(init=init, n_components=2, - n_neighbors=3, - min_dist=0.1, metric='cosine', low_memory=True).fit_transform(df.values) @@ -285,14 +271,14 @@ def cluster_vits( # Skip cropping if all the crops are already done if num_crop != len(df_dets): num_processes = min(multiprocessing.cpu_count(), len(df_dets)) - if roi == True: - info(f'ROI crops 
already exist. Creating square crops in parallel using {multiprocessing.cpu_count()} processes...')
+        if roi:
+            info(f'ROI crops already exist. Creating square crops in parallel using {num_processes} processes...')
             with multiprocessing.Pool(num_processes) as pool:
                 args = [(row, 224) for index, row in df_dets.iterrows()]
                 pool.starmap(square_image, args)
         else:
             # Crop and squaring the images in parallel using multiprocessing to speed up the processing
-            info(f'Cropping {len(df_dets)} detections in parallel using {multiprocessing.cpu_count()} processes...')
+            info(f'Cropping {len(df_dets)} detections in parallel using {num_processes} processes...')
             with multiprocessing.Pool(num_processes) as pool:
                 args = [(row, 224) for index, row in df_dets.iterrows()]
                 pool.starmap(crop_square_image, args)
@@ -317,9 +303,17 @@ def cluster_vits(
     for filename in images:
         emb = fetch_embedding(model, filename)
         if len(emb) == 0:
+            # An empty embedding means the extraction failed; add a zero array as a placeholder
             image_emb.append(np.zeros(384, dtype=np.float32))
         else:
             image_emb.append(emb)
+
+    # If every embedding is all zeros, extraction failed for every image and there is nothing to cluster
+    num_failed = sum(1 for e in image_emb if np.all(e == 0))
+    if num_failed == len(images):
+        warn('Failed to extract embeddings from all images')
+        return pd.DataFrame()
+
     image_emb = np.array(image_emb)

     if not (output_path / prefix).exists():
@@ -338,15 +332,15 @@ def cluster_vits(

     # Cluster the images
     cluster_sim, exemplar_df, unique_clusters, cluster_means, coverage = _run_hdbscan_assign(prefix,
-                                                                                             image_emb,
-                                                                                             alpha,
-                                                                                             cluster_selection_epsilon,
-                                                                                             min_similarity,
-                                                                                             min_cluster_size,
-                                                                                             min_samples,
-                                                                                             use_tsne,
-                                                                                             ancillary_df,
-                                                                                             output_path / prefix)
+                                                                                                 image_emb,
+                                                                                                 alpha,
+                                                                                                 cluster_selection_epsilon,
+                                                                                                 min_similarity,
+                                                                                                 min_cluster_size,
+                                                                                                 min_samples,
+                                                                                                 use_tsne,
+                                                                                                 ancillary_df,
+                                                                                                 output_path / prefix)

     # Get the average similarity across all clusters
     avg_similarity = np.mean(cluster_sim)
diff --git a/sdcat/cluster/commands.py b/sdcat/cluster/commands.py
index dce1e6b..547fbe5 100644
--- a/sdcat/cluster/commands.py
+++ b/sdcat/cluster/commands.py
@@ -246,8 +246,11 @@ def is_day(utc_dt):
     info(df.head(5))

     if len(df) > 0:
+        # Replace / with _ in the model name so the prefix is filesystem-safe
+        model_machine_friendly = model.replace('/', '_')
+
         # A prefix for the output files to make sure the output is unique for each execution
-        prefix = f'{model}_{datetime.now().strftime("%Y%m%d_%H%M%S")}'
+        prefix = f'{model_machine_friendly}_{datetime.now().strftime("%Y%m%d_%H%M%S")}'

         # Cluster the detections
         df_cluster = cluster_vits(prefix, model, df, save_dir, alpha, cluster_selection_epsilon, min_similarity,
diff --git a/sdcat/cluster/embedding.py b/sdcat/cluster/embedding.py
index 5776078..31ea682 100644
--- a/sdcat/cluster/embedding.py
+++ b/sdcat/cluster/embedding.py
@@ -13,45 +13,49 @@ from sahi.utils.torch import torch
 from torchvision import transforms as pth_transforms
 import cv2

+from transformers import ViTModel, ViTImageProcessor
+
 from sdcat.logger import info, err


-def cache_embedding(embedding, model_name: str, filename: str):
-    # save numpy array as npy file
-    save(f'{filename}_{model_name}.npy', embedding)
+class ViTWrapper:
+    MODEL_NAME = "google/vit-base-patch16-224"
+    VECTOR_DIMENSIONS = 768
+
+    def __init__(self, device: str = "cpu", reset: bool = False, batch_size: int = 32):
+        self.batch_size = batch_size
+        self.model = ViTModel.from_pretrained(self.MODEL_NAME)
+        self.processor = ViTImageProcessor.from_pretrained(self.MODEL_NAME)

-def cache_attention(attention, model_name: 
str, filename: str):
+        # Move the model to the GPU if a cuda device was requested and is available
+        if 'cuda' in device and torch.cuda.is_available():
+            device_num = int(device.split(":")[-1])
+            info(f"Using GPU device {device_num}")
+            torch.cuda.set_device(device_num)
+            self.device = "cuda"
+            self.model.to("cuda")
+        else:
+            self.device = "cpu"
+
+
+def cache_embedding(embedding, model_name: str, filename: str):
+    model_machine_friendly_name = model_name.replace("/", "_")
     # save numpy array as npy file
-    save(f'{filename}_{model_name}_a.npy', attention)
+    save(f'{filename}_{model_machine_friendly_name}.npy', embedding)


 def fetch_embedding(model_name: str, filename: str) -> np.array:
+    model_machine_friendly_name = model_name.replace("/", "_")
     # if the npy file exists, return it
-    if os.path.exists(f'{filename}_{model_name}.npy'):
-        data = load(f'{filename}_{model_name}.npy')
+    if os.path.exists(f'{filename}_{model_machine_friendly_name}.npy'):
+        data = load(f'{filename}_{model_machine_friendly_name}.npy')
         return data
     else:
         info(f'No embedding found for {filename}')
         return []


-def fetch_attention(model_name: str, filename: str) -> np.array:
-    """
-    Fetch the attention map for the given filename and model name
-    :param model_name: Name of the model
-    :param filename: Name of the file
-    :return: Numpy array of the attention map
-    """
-    # if the npy file exists, return it
-    if os.path.exists(f'{filename}_{model_name}_a.npy'):
-        data = load(f'{filename}_{model_name}_a.npy')
-        return data
-    else:
-        info(f'No attention map found for {filename}')
-        return []
-
-
 def has_cached_embedding(model_name: str, filename: str) -> int:
     """
     Check if the given filename has a cached embedding
@@ -59,7 +63,8 @@ def has_cached_embedding(model_name: str, filename: str) -> int:
     :param filename: Name of the file
     :return: 1 if the image has a cached embedding, otherwise 0
     """
-    if os.path.exists(f'{filename}_{model_name}.npy'):
+    model_machine_friendly_name = model_name.replace("/", "_")
+    if os.path.exists(f'{filename}_{model_machine_friendly_name}.npy'):
         return 1
     return 0

@@ -71,89 +76,48 @@ def encode_image(filename):
     return keep


-def compute_embedding(images: list, model_name: str):
+def compute_embedding_vits(images: list, model_name: str, device: str = "cpu"):
     """
     Compute the embedding for the given images using the given model
     :param images: List of image filenames
-    :param model_name: Name of the model
+    :param model_name: Name of the model (e.g. google/vit-base-patch16-224, facebook/dino-vits8, etc.)
+    :param device: Device to use for the computation (cpu or cuda:0, cuda:1, etc.)
     """
-
-    # Load the model
-    if 'dinov2' in model_name:
-        info(f'Loading model {model_name} from facebookresearch/dinov2...')
-        model = torch.hub.load('facebookresearch/dinov2', model_name)
-    elif 'dino' in model_name:
-        info(f'Loading model {model_name} from facebookresearch/dino:main...')
-        model = torch.hub.load('facebookresearch/dino:main', model_name)
-    else:
-        # TODO: Add more models
-        err(f'Unknown model {model_name}!')
-        return
-
-    # The patch size is in the model name, e.g. 
dino_vits16 is a 16x16 patch size, dino_vits8 is a 8x8 patch size
-    res = re.findall(r'\d+$', model_name)
-    if len(res) > 0:
-        patch_size = int(res[0])
+    batch_size = 8
+    vit_model = ViTModel.from_pretrained(model_name)
+    processor = ViTImageProcessor.from_pretrained(model_name)
+
+    if 'cuda' in device and torch.cuda.is_available():
+        device_num = int(device.split(":")[-1])
+        info(f"Using GPU device {device_num}")
+        torch.cuda.set_device(device_num)
+        vit_model.to("cuda")
+        device = "cuda"
     else:
-        raise ValueError(f'Could not find patch size in model name {model_name}')
-    info(f'Using patch size {patch_size} for model {model_name}')
-
-    # Load images and generate embeddings
-    device = 'cuda' if torch.cuda.is_available() else 'cpu'
-    with torch.no_grad():
-        # Set the cuda device
-        if torch.cuda.is_available():
-            model = model.to(device)
-
-        for filename in images:
-            # Skip if the embedding already exists
-            if Path(f'{filename}_{model_name}.npy').exists():
+        device = "cpu"
+
+    # Batch process the images
+    batches = [images[i:i + batch_size] for i in range(0, len(images), batch_size)]
+    for batch in batches:
+        try:
+            # Skip running the model if the embeddings already exist
+            if all([has_cached_embedding(model_name, filename) for filename in batch]):
                 continue

-            try:
-                # Load the image
-                square_img = Image.open(filename)
-
-                # Do some image processing to reduce the noise in the image
-                # Gaussian blur
-                square_img = square_img.filter(ImageFilter.GaussianBlur(radius=1))
-
-                image = np.array(square_img)
-
-                norm_transform = pth_transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
-                img_tensor = torch.from_numpy(image).permute(2, 0, 1).float() / 255.0
-                # Noramlize the tensor with the mean and std of the ImageNet dataset
-                img_tensor = norm_transform(img_tensor)
-                img_tensor = img_tensor.unsqueeze(0)  # Add batch dimension
-                if 'cuda' in device:
-                    img_tensor = img_tensor.to(device)
-                features = model(img_tensor)
-
-                # TODO: add attention map cach as optional
-                # attentions = model.get_last_selfattention(img_tensor)

-                # nh = attentions.shape[1]  # number of head

-                # w_featmap = 224 // patch_size
-                # h_featmap = 224 // patch_size
+            batch_images = [Image.open(filename).convert("RGB") for filename in batch]
+            inputs = processor(images=batch_images, return_tensors="pt").to(device)

-                # Keep only the output patch attention
-                # attentions = attentions[0, :, 0, 1:].reshape(nh, -1)
-                # attentions = attentions.reshape(nh, w_featmap, h_featmap)
-                # attentions = nn.functional.interpolate(attentions.unsqueeze(0), scale_factor=patch_size, mode="nearest")[
-                #     0].cpu().numpy()
-                #
-                # # Resize the attention map to the original image size
-                # attentions = np.uint8(255 * attentions[0])
+            with torch.no_grad():
+                embeddings = vit_model(**inputs)

-                # Get the feature embeddings
-                embeddings = features.squeeze(dim=0)  # Remove batch dimension
-                embeddings = embeddings.cpu().numpy()  # Convert to numpy array
+            batch_embeddings = embeddings.last_hidden_state[:, 0, :].cpu().numpy()

-                cache_embedding(embeddings, model_name, filename)  # save the embedding to disk
-                #cache_attention(attentions, model_name, filename)  # save the attention map to disk
-            except Exception as e:
-                err(f'Error processing {filename}: {e}')
+            # Save the embeddings
+            for emb, filename in zip(batch_embeddings, batch):
+                emb = emb.astype(np.float32)
+                cache_embedding(emb, model_name, filename)
+        except Exception as e:
+            err(f'Error processing {batch}: {e}')


 def compute_norm_embedding(model_name: str, images: list):
@@ -172,7 +136,7 @@ def 
compute_norm_embedding(model_name: str, images: list):

     # If using a GPU, set then skip the parallel CPU processing
     if torch.cuda.is_available():
-        compute_embedding(images, model_name)
+        compute_embedding_vits(images, model_name, 'cuda:0')
     else:
         # Use a pool of processes to speed up the embedding generation 20 images at a time on each process
         num_processes = min(multiprocessing.cpu_count(), len(images) // 20)
@@ -180,7 +144,7 @@ def compute_norm_embedding(model_name: str, images: list):
         info(f'Using {num_processes} processes to compute {len(images)} embeddings 20 at a time ...')
         with multiprocessing.Pool(num_processes) as pool:
             args = [(images[i:i + 20], model_name) for i in range(0, len(images), 20)]
-            pool.starmap(compute_embedding, args)
+            pool.starmap(compute_embedding_vits, args)


 def calc_mean_std(image_files: list) -> tuple:
diff --git a/sdcat/cluster/utils.py b/sdcat/cluster/utils.py
index a5c5cc7..d5695e0 100644
--- a/sdcat/cluster/utils.py
+++ b/sdcat/cluster/utils.py
@@ -9,14 +9,13 @@ from mpl_toolkits.axes_grid1 import ImageGrid
 from pathlib import Path

-from sdcat.cluster.embedding import fetch_attention
 from sdcat.logger import debug, warn, exception


 def cluster_grid(prefix: str, cluster_sim: float, cluster_id: int, cluster_size: int, nb_images_display: int,
                  images: list, output_path: Path):
     """
-    Cluster visualization; create a grid of images both with and without attention map
+    Cluster visualization; create a grid of images
     :param cluster_sim: Cluster similarity
     :param cluster_size: Size of the cluster
     :param cluster_id: Cluster ID
@@ -26,74 +25,60 @@ def cluster_grid(prefix: str, cluster_sim: float, cluster_id: int, cluster_size:
     """
     debug(f'Cluster number {cluster_id} size {len(cluster_size)} similarity {cluster_sim}\n')

-    def gen_grid(with_attention: bool):
-        # Plot a grid for each group of images nb_images_display at a time (e.g. 
8x8) - for i in range(0, len(images), nb_images_display * nb_images_display): - fig = plt.figure(figsize=(10., 10.)) - grid = ImageGrid(fig, 111, # similar to subplot(111) - nrows_ncols=(nb_images_display, nb_images_display), - # creates nb_images_display x nb_images_display grid of axes - axes_pad=0.025, - share_all=True, - cbar_pad=0.025) - images_display = images[i:i + nb_images_display * nb_images_display] - page = i // (nb_images_display * nb_images_display) - - # If we have more than 3 pages, then only display the first 3 pages - # There can be a large number of pages for detections in common classes - if page > 3: - break - - total_pages = len(images) // (nb_images_display * nb_images_display) - # debug(f"{i} Image filename:", images[j]) - for j, image in enumerate(images_display): - try: - image_square = Image.open(image) - grid[j].imshow(image_square) - except Exception as e: - exception(f'Error opening {image} {e}') - continue - - if with_attention: - # Get the attention map - # TODO: remove this or refactor with pass through of model name - attention = fetch_attention('dino_vitb8', image) - - # Overlay the attention map on top of the original image - grid[j].imshow(attention, cmap='jet', alpha=0.125) - - grid[j].axis('off') - # If the verified is in the image name, then add a label to the image in the top center corner - if 'verified' in image: - n = Path(image) - title = f"{n.stem.split('_')[0]}" - grid[j].text(30, 10, title, fontsize=8, color='white', ha='center', va='center') - # clear the x and y-axis - grid[j].set_xticklabels([]) - - # Add a title to the figure - if total_pages > 1: - fig.suptitle( - f"{prefix} Cluster {cluster_id}, Size: {len(cluster_size)}, Similarity: {cluster_sim:.2f}, Page: {page} of {total_pages}", - fontsize=16) - else: - fig.suptitle(f"{prefix} Cluster {cluster_id}, Size: {len(cluster_size)}, Similarity: {cluster_sim:.2f}", - fontsize=16) - - # Set the background color of the grid to white - fig.set_facecolor('white') - - # Write the figure to a file - if with_attention: - out = output_path / f'{prefix}_cluster_{cluster_id}_p{page}_attention.png' - else: - out = output_path / f'{prefix}_cluster_{cluster_id}_p{page}.png' - debug(f'Writing {out}') - fig.savefig(out.as_posix()) - plt.close(fig) - - gen_grid(with_attention=False) - # gen_grid(with_attention=True) + # Plot a grid for each group of images nb_images_display at a time (e.g. 
8x8)
+    for i in range(0, len(images), nb_images_display * nb_images_display):
+        fig = plt.figure(figsize=(10., 10.))
+        grid = ImageGrid(fig, 111,  # similar to subplot(111)
+                         nrows_ncols=(nb_images_display, nb_images_display),
+                         # creates nb_images_display x nb_images_display grid of axes
+                         axes_pad=0.025,
+                         share_all=True,
+                         cbar_pad=0.025)
+        images_display = images[i:i + nb_images_display * nb_images_display]
+        page = i // (nb_images_display * nb_images_display)
+
+        # Only display the first four pages (pages 0-3)
+        # There can be a large number of pages for detections in common classes
+        if page > 3:
+            break
+
+        total_pages = len(images) // (nb_images_display * nb_images_display)
+        # debug(f"{i} Image filename:", images[j])
+        for j, image in enumerate(images_display):
+            try:
+                image_square = Image.open(image)
+                grid[j].imshow(image_square)
+            except Exception as e:
+                exception(f'Error opening {image} {e}')
+                continue
+
+            grid[j].axis('off')
+            # If 'verified' is in the image name, then add a label to the image in the top center corner
+            if 'verified' in image:
+                n = Path(image)
+                title = f"{n.stem.split('_')[0]}"
+                grid[j].text(30, 10, title, fontsize=8, color='white', ha='center', va='center')
+            # clear the x and y-axis
+            grid[j].set_xticklabels([])
+
+        # Add a title to the figure
+        if total_pages > 1:
+            fig.suptitle(
+                f"{prefix} Cluster {cluster_id}, Size: {len(cluster_size)}, Similarity: {cluster_sim:.2f}, Page: {page} of {total_pages}",
+                fontsize=16)
+        else:
+            fig.suptitle(f"{prefix} Cluster {cluster_id}, Size: {len(cluster_size)}, Similarity: {cluster_sim:.2f}",
+                         fontsize=16)
+
+        # Set the background color of the grid to white
+        fig.set_facecolor('white')
+
+        # Write the figure to a file
+        out = output_path / f'{prefix}_cluster_{cluster_id}_p{page}.png'
+        debug(f'Writing {out}')
+        fig.savefig(out.as_posix())
+        plt.close(fig)
+


 def square_image(row, square_dim: int):
@@ -132,6 +117,7 @@ def square_image(row, square_dim: int):
         exception(f'Error cropping {row.image_path} {e}')
         raise e

+
 def crop_square_image(row, square_dim: int):
     """
     Crop the image to a square padding the shortest dimension, then resize it to square_dim x square_dim
@@ -203,14 +189,8 @@ def crop_square_image(row, square_dim: int):
         img = img.resize((square_dim, square_dim), Image.LANCZOS)

         # Save the image
-        # img.save(row.crop_path)
-
-        # Every 10th index, Create a zero byte file to indicate that the crop was successful
-        if Path(row.image_path).stem is 'e1f5e2b8-9e3c-5904-a896-acb3c7a9cbf6':
-            Path(row.crop_path).touch()
-        else:
-            img.save(row.crop_path)
-        img.close()
+        img.save(row.crop_path)
+        img.close()

     except Exception as e:
         exception(f'Error cropping {row.image_path} {e}')
diff --git a/sdcat/config/config.ini b/sdcat/config/config.ini
index ef3a1a6..5b4959e 100644
--- a/sdcat/config/config.ini
+++ b/sdcat/config/config.ini
@@ -28,18 +28,18 @@ min_cluster_size = 2
 min_samples = 1
 max_area = 4375000
 min_area = 100
+# Detections not assigned with hdbscan are assigned to the nearest cluster with a similarity > min_similarity
+# This is useful for merging examples not assigned to clusters; set to 0 to disable
+# A value of .9 is very conservative, merging only nearly identical detections; a value of .5 is very aggressive, also merging detections that are only somewhat similar
 # min_similarity must be in the range [0, 1]
-# Clusters not assigned with hdbscan are assigned to the nearest cluster with a similarity > min_similarity
 min_similarity = 0.70

-# Examples: dinov2_vits14, dino_vits8, dino_vits16
-# dinov2 models were pretrained 
on a dataset of 142 M images without any labels
-# dino models were pretrained on ImageNet which contains 1.3 M images with labels
-# dino_vits8 has block_size=8 which can be good for very small objects
-# dino_vits14 has block_size=14
+# google/vit-base-patch16-224 was pretrained on ImageNet-21k (14 M images, 21k classes) and is a good general-purpose model
+# dino models were pretrained self-supervised on ImageNet (1.3 M images, 1000 classes); the labels are not used during pretraining
 # Smaller block_size means more patches and more accurate fine-grained clustering on smaller objects
-model = dino_vits8
-;model = dinov2_vits14
-;model = dinov2_vitb14
+# Larger block_size means fewer patches and faster processing
+model = google/vit-base-patch16-224
+;model = facebook/dino-vits8
+;model = facebook/dino-vits16

 [detect]
 ########################################################################
diff --git a/sdcat/detect/commands.py b/sdcat/detect/commands.py
index fb8bf35..97b20a9 100644
--- a/sdcat/detect/commands.py
+++ b/sdcat/detect/commands.py
@@ -8,8 +8,6 @@ import cv2
 import pandas as pd
 import torch

-from huggingface_hub import hf_hub_download
-from sahi import AutoDetectionModel
 from sahi.postprocess.combine import nms

 from sdcat import common_args
@@ -67,6 +65,7 @@ def run_detect(show: bool, image_dir: str, save_dir: str, model: str,
     create_logger_file('detect')

     if not skip_sahi:
+        from sahi import AutoDetectionModel
         if model == 'yolov8s':
             detection_model = AutoDetectionModel.from_pretrained(
                 model_type='yolov8',
@@ -101,6 +100,7 @@ def run_detect(show: bool, image_dir: str, save_dir: str, model: str,
             )
         elif model == 'MBARI/megamidwater':
             # Download model path
+            from huggingface_hub import hf_hub_download
             model_path = hf_hub_download(repo_id="MBARI-org/megamidwater", filename="best.pt")
             detection_model = AutoDetectionModel.from_pretrained(
                 model_type='yolov5',
@@ -111,6 +111,7 @@ def run_detect(show: bool, image_dir: str, save_dir: str, model: str,
             )
         elif model == 'MBARI/uav-yolov5':
             # Download model path
+            from huggingface_hub import hf_hub_download
             model_path = hf_hub_download(repo_id="MBARI-org/uav-yolov5", filename="best.pt")
             detection_model = AutoDetectionModel.from_pretrained(
                 model_type='yolov5',
@@ -121,6 +122,7 @@ def run_detect(show: bool, image_dir: str, save_dir: str, model: str,
             )
         elif model == 'FathomNet/MBARI-315k-yolov5':
             # Download model path
+            from huggingface_hub import hf_hub_download
             model_path = hf_hub_download(repo_id="FathomNet/MBARI-315k-yolov5", filename="mbari_315k_yolov5.pt")
             detection_model = AutoDetectionModel.from_pretrained(
                 model_type='yolov5',
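
Note for reviewers: the following is a minimal sketch of the embedding path that compute_embedding_vits in sdcat/cluster/embedding.py now implements for each batch — preprocess with ViTImageProcessor, forward through ViTModel, keep the CLS-token vector, and cache it under the machine-friendly model name. It is not part of the diff; the model id matches the new config.ini default, and 'crop.png' is a hypothetical input file.

    import numpy as np
    import torch
    from PIL import Image
    from transformers import ViTImageProcessor, ViTModel

    model_name = "google/vit-base-patch16-224"   # new default in config.ini
    processor = ViTImageProcessor.from_pretrained(model_name)
    model = ViTModel.from_pretrained(model_name)

    # 'crop.png' is a hypothetical square crop produced by crop_square_image
    image = Image.open("crop.png").convert("RGB")
    inputs = processor(images=[image], return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)

    # CLS-token embedding, 768-dim for this model (ViTWrapper.VECTOR_DIMENSIONS)
    emb = outputs.last_hidden_state[:, 0, :].squeeze(0).numpy().astype(np.float32)

    # Cache under the same naming scheme as cache_embedding()
    np.save(f"crop.png_{model_name.replace('/', '_')}.npy", emb)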