path llava-next

ArthurZucker · ArthurZucker · commit a2a95164b829 · 2024-03-20T20:37:15.000-04:00
diff --git a/src/transformers/image_processing_utils.py b/src/transformers/image_processing_utils.py
@@ -748,6 +748,43 @@ def get_size_dict(
     return size_dict
 
 
+def select_best_resolution(original_size: tuple, possible_resolutions: list) -> tuple:
+    """
+    Selects the best resolution from a list of possible resolutions based on the original size.
+
+    Each candidate is scored by fitting the original image inside it while keeping the aspect ratio. The effective
+    resolution is the fitted area (capped at the original area) and the wasted resolution is the candidate area
+    minus that. The highest effective resolution wins; ties are broken by the lowest wasted resolution.
+
+    Args:
+        original_size (tuple):
+            The original size of the image in the format (height, width). Both values must be non-zero.
+        possible_resolutions (list):
+            A list of possible resolutions in the format [(height1, width1), (height2, width2), ...].
+
+    Returns:
+        tuple: The best fit resolution in the format (height, width), or `None` if `possible_resolutions` is empty.
+    """
+    original_height, original_width = original_size
+    best_fit = None
+    max_effective_resolution = 0
+    min_wasted_resolution = float("inf")
+    # Score every candidate: fit the image inside it at the largest scale that keeps the aspect ratio.
+    for height, width in possible_resolutions:
+        scale = min(width / original_width, height / original_height)
+        downscaled_width, downscaled_height = int(original_width * scale), int(original_height * scale)
+        effective_resolution = min(downscaled_width * downscaled_height, original_width * original_height)
+        wasted_resolution = (width * height) - effective_resolution
+        # Keep the candidate with the most effective pixels; break ties by the least wasted pixels.
+        if effective_resolution > max_effective_resolution or (
+            effective_resolution == max_effective_resolution and wasted_resolution < min_wasted_resolution
+        ):
+            max_effective_resolution = effective_resolution
+            min_wasted_resolution = wasted_resolution
+            best_fit = (height, width)
+
+    return best_fit
+
 ImageProcessingMixin.push_to_hub = copy_func(ImageProcessingMixin.push_to_hub)
 if ImageProcessingMixin.push_to_hub.__doc__ is not None:
     ImageProcessingMixin.push_to_hub.__doc__ = ImageProcessingMixin.push_to_hub.__doc__.format(
diff --git a/src/transformers/models/auto/image_processing_auto.py b/src/transformers/models/auto/image_processing_auto.py
@@ -77,7 +77,7 @@
         ("layoutlmv3", "LayoutLMv3ImageProcessor"),
         ("levit", "LevitImageProcessor"),
         ("llava", "CLIPImageProcessor"),
-        ("llava_next", "CLIPImageProcessor"),
+        ("llava_next", "LlavaNextImageProcessor"),
         ("mask2former", "Mask2FormerImageProcessor"),
         ("maskformer", "MaskFormerImageProcessor"),
         ("mgp-str", "ViTImageProcessor"),
diff --git a/src/transformers/models/llava_next/image_processing_llava_next.py b/src/transformers/models/llava_next/image_processing_llava_next.py
@@ -19,7 +19,7 @@
 
 import numpy as np
 
-from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
+from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict, select_best_resolution
 from ...image_transforms import (
     convert_to_rgb,
     get_resize_output_image_size,
@@ -39,7 +39,7 @@
     make_list_of_images,
     to_numpy_array,
     valid_images,
-    validate_preprocess_arguments,
+    validate_preprocess_arguments
 )
 from ...utils import TensorType, is_vision_available, logging
 
@@ -51,43 +51,6 @@
     from PIL import Image
 
 
-def select_best_resolution(original_size: tuple, possible_resolutions: list) -> tuple:
-    """
-    Selects the best resolution from a list of possible resolutions based on the original size.
-
-    This is done by calculating the effective and wasted resolution for each possible resolution.
-
-    The best fit resolution is the one that maximizes the effective resolution and minimizes the wasted resolution.
-
-    Args:
-        original_size (tuple):
-            The original size of the image in the format (height, width).
-        possible_resolutions (list):
-            A list of possible resolutions in the format [(height1, width1), (height2, width2), ...].
-
-    Returns:
-        tuple: The best fit resolution in the format (height, width).
-    """
-    original_height, original_width = original_size
-    best_fit = None
-    max_effective_resolution = 0
-    min_wasted_resolution = float("inf")
-
-    for height, width in possible_resolutions:
-        scale = min(width / original_width, height / original_height)
-        downscaled_width, downscaled_height = int(original_width * scale), int(original_height * scale)
-        effective_resolution = min(downscaled_width * downscaled_height, original_width * original_height)
-        wasted_resolution = (width * height) - effective_resolution
-
-        if effective_resolution > max_effective_resolution or (
-            effective_resolution == max_effective_resolution and wasted_resolution < min_wasted_resolution
-        ):
-            max_effective_resolution = effective_resolution
-            min_wasted_resolution = wasted_resolution
-            best_fit = (height, width)
-
-    return best_fit
-
 
 def divide_to_patches(image: np.array, patch_size: int, input_data_format) -> List[np.array]:
     """
diff --git a/src/transformers/models/llava_next/modeling_llava_next.py b/src/transformers/models/llava_next/modeling_llava_next.py
@@ -33,7 +33,7 @@
 )
 from ..auto import AutoModel, AutoModelForCausalLM
 from .configuration_llava_next import LlavaNextConfig
-from .image_processing_llava_next import select_best_resolution
+from ...image_processing_utils import select_best_resolution
 
 
 logger = logging.get_logger(__name__)

Original file line number	Diff line number	Diff line change
`@@ -33,7 +33,7 @@`
`33`	`33`	`)`
`34`	`34`	`from ..auto import AutoModel, AutoModelForCausalLM`
`35`	`35`	`from .configuration_llava_next import LlavaNextConfig`
`36`		`-from .image_processing_llava_next import select_best_resolution`
	`36`	`+from ...image_processing_utils import select_best_resolution`
`37`	`37`
`38`	`38`
`39`	`39`	`logger = logging.get_logger(__name__)`