diff --git a/lm_eval/models/hf_vlms.py b/lm_eval/models/hf_vlms.py
index c94c76b26f..dd7d64d7c7 100644
--- a/lm_eval/models/hf_vlms.py
+++ b/lm_eval/models/hf_vlms.py
@@ -17,6 +17,7 @@
     replace_placeholders,
     stop_sequences_criteria,
 )
+from lm_eval.utils import add_padding_if_needed
 
 
 DEFAULT_IMAGE_PLACEHOLDER = "<image>"
@@ -266,7 +267,9 @@ def chat_template(self, chat_template: Union[bool, str] = False) -> Optional[str]:
     def tok_batch_multimodal_encode(
         self,
         strings: List[str],  # note that input signature of this fn is different
-        images: List[List],  # TODO: images are pil.Image at the moment, update typehint
+        images: List[
+            List["PIL.Image.Image"]  # noqa: F821
+        ],  # TODO: images are pil.Image at the moment, update typehint
         padding_side: str = "left",
         left_truncate_len: int = None,
         truncation: bool = False,
@@ -292,15 +295,25 @@ def tok_batch_multimodal_encode(
         images = [img[: self.max_images] for img in images]
         if self.rgb:
             images = [[img.convert("RGB") for img in sublist] for sublist in images]
-
-        encoding = self.processor(
-            images=images,
-            text=strings,
-            truncation=truncation,
-            padding="longest",
-            return_tensors="pt",
-            # **add_special_tokens, # TODO: at least some Processors error out when passing this. How do we control whether text gets BOS added?
-        )
+        try:
+            encoding = self.processor(
+                images=images,
+                text=strings,
+                truncation=truncation,
+                padding="longest",
+                return_tensors="pt",
+                # **add_special_tokens, # TODO: at least some Processors error out when passing this. How do we control whether text gets BOS added?
+            )
+        # The Qwen processor errors out if an image dimension is too small (do_resize defaults to True, which requires a minimum size)
+        except Exception:
+            encoding = self.processor(
+                images=[add_padding_if_needed(sublist) for sublist in images],
+                text=strings,
+                truncation=truncation,
+                padding="longest",
+                return_tensors="pt",
+                # **add_special_tokens, # TODO: at least some Processors error out when passing this. How do we control whether text gets BOS added?
+            )
         encoding.to(  # TODO: our other tokenization methods in HFLM don't typically move to device. this breaks convention
             self.device, self.model.dtype
         )
diff --git a/lm_eval/utils.py b/lm_eval/utils.py
index 7166e24d07..7a6de73cc7 100644
--- a/lm_eval/utils.py
+++ b/lm_eval/utils.py
@@ -499,3 +499,41 @@ def weighted_f1_score(items):
     preds = unzipped_list[1]
     fscore = f1_score(golds, preds, average="weighted")
     return fscore
+
+
+def add_padding_if_needed(
+    images: List["PIL.Image.Image"],  # noqa: F821
+    min_width: int = 50,
+    min_height: int = 50,
+    color=(255, 255, 255),
+) -> List["PIL.Image.Image"]:  # noqa: F821
+    """Adds (default white) padding to images so each is at least min_width x min_height."""
+    from PIL import ImageOps
+
+    res = []
+    for image in images:
+        width, height = image.size
+
+        if width >= min_width and height >= min_height:
+            # Image already meets the minimum size; keep it unchanged.
+            res.append(image)
+            continue
+        image = image.convert("RGB")
+        new_width = max(width, min_width)
+        new_height = max(height, min_height)
+
+        delta_width = new_width - width
+        delta_height = new_height - height
+
+        padding_left = delta_width // 2
+        padding_right = delta_width - padding_left
+        padding_top = delta_height // 2
+        padding_bottom = delta_height - padding_top
+        res.append(
+            ImageOps.expand(
+                image,
+                (padding_left, padding_top, padding_right, padding_bottom),
+                fill=color,
+            )
+        )
+
+    return res