diff --git a/lm_eval/models/hf_vlms.py b/lm_eval/models/hf_vlms.py
index c94c76b26f..dd7d64d7c7 100644
--- a/lm_eval/models/hf_vlms.py
+++ b/lm_eval/models/hf_vlms.py
@@ -17,6 +17,7 @@
     replace_placeholders,
     stop_sequences_criteria,
 )
+from lm_eval.utils import add_padding_if_needed
 
 
 DEFAULT_IMAGE_PLACEHOLDER = "<image>"
@@ -266,7 +267,9 @@ def chat_template(self, chat_template: Union[bool, str] = False) -> Optional[str]:
     def tok_batch_multimodal_encode(
         self,
         strings: List[str],  # note that input signature of this fn is different
-        images: List[List],  # TODO: images are pil.Image at the moment, update typehint
+        images: List[
+            List["PIL.Image.Image"]  # noqa: F821
+        ],  # TODO: images are pil.Image at the moment, update typehint
         padding_side: str = "left",
         left_truncate_len: int = None,
         truncation: bool = False,
@@ -292,15 +295,25 @@ def tok_batch_multimodal_encode(
         images = [img[: self.max_images] for img in images]
         if self.rgb:
             images = [[img.convert("RGB") for img in sublist] for sublist in images]
-
-        encoding = self.processor(
-            images=images,
-            text=strings,
-            truncation=truncation,
-            padding="longest",
-            return_tensors="pt",
-            # **add_special_tokens, # TODO: at least some Processors error out when passing this. How do we control whether text gets BOS added?
-        )
+        try:
+            encoding = self.processor(
+                images=images,
+                text=strings,
+                truncation=truncation,
+                padding="longest",
+                return_tensors="pt",
+                # **add_special_tokens, # TODO: at least some Processors error out when passing this. How do we control whether text gets BOS added?
+            )
+        # The Qwen processor errors out if an image dimension is too small (do_resize defaults to True, which requires a minimum size)
+        except Exception:
+            encoding = self.processor(
+                images=[add_padding_if_needed(sublist) for sublist in images],
+                text=strings,
+                truncation=truncation,
+                padding="longest",
+                return_tensors="pt",
+                # **add_special_tokens, # TODO: at least some Processors error out when passing this. How do we control whether text gets BOS added?
+            )
         encoding.to(  # TODO: our other tokenization methods in HFLM don't typically move to device. this breaks convention
             self.device, self.model.dtype
         )
diff --git a/lm_eval/utils.py b/lm_eval/utils.py
index 7166e24d07..7a6de73cc7 100644
--- a/lm_eval/utils.py
+++ b/lm_eval/utils.py
@@ -499,3 +499,41 @@ def weighted_f1_score(items):
     preds = unzipped_list[1]
     fscore = f1_score(golds, preds, average="weighted")
     return fscore
+
+
+def add_padding_if_needed(
+    images: List["PIL.Image.Image"],  # noqa: F821
+    min_width: int = 50,
+    min_height: int = 50,
+    color=(255, 255, 255),
+) -> List["PIL.Image.Image"]:  # noqa: F821
+    """Adds (default white) padding to images so each is at least min_width x min_height."""
+    from PIL import ImageOps
+
+    res = []
+    for image in images:
+        width, height = image.size
+
+        if width >= min_width and height >= min_height:
+            # Image already meets the minimum size; keep it unchanged.
+            res.append(image)
+            continue
+        image = image.convert("RGB")
+        new_width = max(width, min_width)
+        new_height = max(height, min_height)
+
+        delta_width = new_width - width
+        delta_height = new_height - height
+
+        padding_left = delta_width // 2
+        padding_right = delta_width - padding_left
+        padding_top = delta_height // 2
+        padding_bottom = delta_height - padding_top
+        res.append(
+            ImageOps.expand(
+                image,
+                (padding_left, padding_top, padding_right, padding_bottom),
+                fill=color,
+            )
+        )
+
+    return res