Skip to content

Commit a2a9516

Browse files
committed
path llava-next
1 parent 0788481 commit a2a9516

File tree

4 files changed

+41
-41
lines changed

4 files changed

+41
-41
lines changed

src/transformers/image_processing_utils.py

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -748,6 +748,43 @@ def get_size_dict(
748748
return size_dict
749749

750750

751+
def select_best_resolution(original_size: tuple, possible_resolutions: list) -> tuple:
    """
    Selects the best resolution from a list of possible resolutions based on the original size.

    Each candidate is scored by fitting the image inside it while preserving the aspect ratio. The *effective*
    resolution is the number of pixels the fitted image covers, capped at the original pixel count (upscaling adds
    no information). The *wasted* resolution is the part of the candidate area not covered by effective pixels.

    The best fit maximizes the effective resolution; ties are broken by the smallest wasted resolution, and
    remaining ties by the earliest candidate in `possible_resolutions`.

    Args:
        original_size (tuple):
            The original size of the image in the format (height, width).
        possible_resolutions (list):
            A list of possible resolutions in the format [(height1, width1), (height2, width2), ...].

    Returns:
        tuple: The best fit resolution in the format (height, width).

    Raises:
        ValueError: If `original_size` has a non-positive dimension or `possible_resolutions` is empty.
    """
    original_height, original_width = original_size
    # Guard explicitly: a zero dimension would otherwise surface as a bare ZeroDivisionError below.
    if original_height <= 0 or original_width <= 0:
        raise ValueError(f"original_size must have positive dimensions, got {tuple(original_size)}.")

    # Loop-invariant: the pixel count of the original image caps the effective resolution.
    original_area = original_width * original_height

    best_fit = None
    max_effective_resolution = 0
    min_wasted_resolution = float("inf")

    for height, width in possible_resolutions:
        # Largest uniform scale at which the image still fits inside the candidate.
        scale = min(width / original_width, height / original_height)
        scaled_width, scaled_height = int(original_width * scale), int(original_height * scale)
        effective_resolution = min(scaled_width * scaled_height, original_area)
        wasted_resolution = (width * height) - effective_resolution

        if effective_resolution > max_effective_resolution or (
            effective_resolution == max_effective_resolution and wasted_resolution < min_wasted_resolution
        ):
            max_effective_resolution = effective_resolution
            min_wasted_resolution = wasted_resolution
            best_fit = (height, width)

    # No candidate was visited: fail loudly here instead of returning None, which would break
    # the declared return type and only fail later when the caller unpacks the result.
    if best_fit is None:
        raise ValueError("possible_resolutions must contain at least one (height, width) candidate.")

    return best_fit
787+
751788
ImageProcessingMixin.push_to_hub = copy_func(ImageProcessingMixin.push_to_hub)
752789
if ImageProcessingMixin.push_to_hub.__doc__ is not None:
753790
ImageProcessingMixin.push_to_hub.__doc__ = ImageProcessingMixin.push_to_hub.__doc__.format(

src/transformers/models/auto/image_processing_auto.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,7 @@
7777
("layoutlmv3", "LayoutLMv3ImageProcessor"),
7878
("levit", "LevitImageProcessor"),
7979
("llava", "CLIPImageProcessor"),
80-
("llava_next", "CLIPImageProcessor"),
80+
("llava_next", "LlavaNextImageProcessor"),
8181
("mask2former", "Mask2FormerImageProcessor"),
8282
("maskformer", "MaskFormerImageProcessor"),
8383
("mgp-str", "ViTImageProcessor"),

src/transformers/models/llava_next/image_processing_llava_next.py

Lines changed: 2 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@
1919

2020
import numpy as np
2121

22-
from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
22+
from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict, select_best_resolution
2323
from ...image_transforms import (
2424
convert_to_rgb,
2525
get_resize_output_image_size,
@@ -39,7 +39,7 @@
3939
make_list_of_images,
4040
to_numpy_array,
4141
valid_images,
42-
validate_preprocess_arguments,
42+
validate_preprocess_arguments
4343
)
4444
from ...utils import TensorType, is_vision_available, logging
4545

@@ -51,43 +51,6 @@
5151
from PIL import Image
5252

5353

54-
def select_best_resolution(original_size: tuple, possible_resolutions: list) -> tuple:
    """
    Selects the best resolution from a list of possible resolutions based on the original size.

    Each candidate is scored by fitting the image inside it while preserving the aspect ratio. The *effective*
    resolution is the number of pixels the fitted image covers, capped at the original pixel count (upscaling adds
    no information). The *wasted* resolution is the part of the candidate area not covered by effective pixels.

    The best fit maximizes the effective resolution; ties are broken by the smallest wasted resolution, and
    remaining ties by the earliest candidate in `possible_resolutions`.

    Args:
        original_size (tuple):
            The original size of the image in the format (height, width).
        possible_resolutions (list):
            A list of possible resolutions in the format [(height1, width1), (height2, width2), ...].

    Returns:
        tuple: The best fit resolution in the format (height, width).

    Raises:
        ValueError: If `original_size` has a non-positive dimension or `possible_resolutions` is empty.
    """
    original_height, original_width = original_size
    # Guard explicitly: a zero dimension would otherwise surface as a bare ZeroDivisionError below.
    if original_height <= 0 or original_width <= 0:
        raise ValueError(f"original_size must have positive dimensions, got {tuple(original_size)}.")

    # Loop-invariant: the pixel count of the original image caps the effective resolution.
    original_area = original_width * original_height

    best_fit = None
    max_effective_resolution = 0
    min_wasted_resolution = float("inf")

    for height, width in possible_resolutions:
        # Largest uniform scale at which the image still fits inside the candidate.
        scale = min(width / original_width, height / original_height)
        scaled_width, scaled_height = int(original_width * scale), int(original_height * scale)
        effective_resolution = min(scaled_width * scaled_height, original_area)
        wasted_resolution = (width * height) - effective_resolution

        if effective_resolution > max_effective_resolution or (
            effective_resolution == max_effective_resolution and wasted_resolution < min_wasted_resolution
        ):
            max_effective_resolution = effective_resolution
            min_wasted_resolution = wasted_resolution
            best_fit = (height, width)

    # No candidate was visited: fail loudly here instead of returning None, which would break
    # the declared return type and only fail later when the caller unpacks the result.
    if best_fit is None:
        raise ValueError("possible_resolutions must contain at least one (height, width) candidate.")

    return best_fit
90-
9154

9255
def divide_to_patches(image: np.array, patch_size: int, input_data_format) -> List[np.array]:
9356
"""

src/transformers/models/llava_next/modeling_llava_next.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -33,7 +33,7 @@
3333
)
3434
from ..auto import AutoModel, AutoModelForCausalLM
3535
from .configuration_llava_next import LlavaNextConfig
36-
from .image_processing_llava_next import select_best_resolution
36+
from ...image_processing_utils import select_best_resolution
3737

3838

3939
logger = logging.get_logger(__name__)

0 commit comments

Comments
 (0)