Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions docs/source/en/package_reference/inference_types.md
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,18 @@ This part of the lib is still under development and will be improved in future r



## image_to_video

[[autodoc]] huggingface_hub.ImageToVideoInput

[[autodoc]] huggingface_hub.ImageToVideoOutput

[[autodoc]] huggingface_hub.ImageToVideoParameters

[[autodoc]] huggingface_hub.ImageToVideoTargetSize



## object_detection

[[autodoc]] huggingface_hub.ObjectDetectionBoundingBox
Expand Down
12 changes: 12 additions & 0 deletions docs/source/ko/package_reference/inference_types.md
Original file line number Diff line number Diff line change
Expand Up @@ -196,6 +196,18 @@ rendered properly in your Markdown viewer.



## image_to_video[[huggingface_hub.ImageToVideoInput]]

[[autodoc]] huggingface_hub.ImageToVideoInput

[[autodoc]] huggingface_hub.ImageToVideoOutput

[[autodoc]] huggingface_hub.ImageToVideoParameters

[[autodoc]] huggingface_hub.ImageToVideoTargetSize



## object_detection[[huggingface_hub.ObjectDetectionBoundingBox]]

[[autodoc]] huggingface_hub.ObjectDetectionBoundingBox
Expand Down
12 changes: 12 additions & 0 deletions src/huggingface_hub/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -372,6 +372,10 @@
"ImageToTextInput",
"ImageToTextOutput",
"ImageToTextParameters",
"ImageToVideoInput",
"ImageToVideoOutput",
"ImageToVideoParameters",
"ImageToVideoTargetSize",
"ObjectDetectionBoundingBox",
"ObjectDetectionInput",
"ObjectDetectionOutputElement",
Expand Down Expand Up @@ -660,6 +664,10 @@
"ImageToTextInput",
"ImageToTextOutput",
"ImageToTextParameters",
"ImageToVideoInput",
"ImageToVideoOutput",
"ImageToVideoParameters",
"ImageToVideoTargetSize",
"InferenceApi",
"InferenceClient",
"InferenceEndpoint",
Expand Down Expand Up @@ -1370,6 +1378,10 @@ def __dir__():
ImageToTextInput, # noqa: F401
ImageToTextOutput, # noqa: F401
ImageToTextParameters, # noqa: F401
ImageToVideoInput, # noqa: F401
ImageToVideoOutput, # noqa: F401
ImageToVideoParameters, # noqa: F401
ImageToVideoTargetSize, # noqa: F401
ObjectDetectionBoundingBox, # noqa: F401
ObjectDetectionInput, # noqa: F401
ObjectDetectionOutputElement, # noqa: F401
Expand Down
1 change: 1 addition & 0 deletions src/huggingface_hub/inference/_generated/types/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@
ImageToTextOutput,
ImageToTextParameters,
)
from .image_to_video import ImageToVideoInput, ImageToVideoOutput, ImageToVideoParameters, ImageToVideoTargetSize
from .object_detection import (
ObjectDetectionBoundingBox,
ObjectDetectionInput,
Expand Down
60 changes: 60 additions & 0 deletions src/huggingface_hub/inference/_generated/types/image_to_video.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
# Inference code generated from the JSON schema spec in @huggingface/tasks.
#
# See:
# - script: https://github.com/huggingface/huggingface.js/blob/main/packages/tasks/scripts/inference-codegen.ts
# - specs: https://github.com/huggingface/huggingface.js/tree/main/packages/tasks/src/tasks.
from typing import Any, Optional

from .base import BaseInferenceType, dataclass_with_extra


@dataclass_with_extra
class ImageToVideoTargetSize(BaseInferenceType):
    """The size in pixel of the output video frames."""

    # NOTE: generated from the @huggingface/tasks JSON schema (see file header) — do not edit by hand.
    height: int  # output frame height, in pixels
    width: int  # output frame width, in pixels


@dataclass_with_extra
class ImageToVideoParameters(BaseInferenceType):
    """Additional inference parameters for Image To Video"""

    # NOTE: generated from the @huggingface/tasks JSON schema (see file header) — do not edit by hand.
    # All fields are optional; `None` means "let the inference backend use its default".
    guidance_scale: Optional[float] = None
    """For diffusion models. A higher guidance scale value encourages the model to generate
    videos closely linked to the text prompt at the expense of lower image quality.
    """
    negative_prompt: Optional[str] = None
    """One prompt to guide what NOT to include in video generation."""
    # NOTE(review): typed `float` because the upstream JSON schema declares a "number";
    # presumably an integral frame count in practice — confirm against the task spec.
    num_frames: Optional[float] = None
    """The num_frames parameter determines how many video frames are generated."""
    num_inference_steps: Optional[int] = None
    """The number of denoising steps. More denoising steps usually lead to a higher quality
    video at the expense of slower inference.
    """
    prompt: Optional[str] = None
    """The text prompt to guide the video generation."""
    seed: Optional[int] = None
    """Seed for the random number generator."""
    target_size: Optional[ImageToVideoTargetSize] = None
    """The size in pixel of the output video frames."""


@dataclass_with_extra
class ImageToVideoInput(BaseInferenceType):
    """Inputs for Image To Video inference"""

    # NOTE: generated from the @huggingface/tasks JSON schema (see file header) — do not edit by hand.
    # Required field: the source image to animate.
    inputs: str
    """The input image data as a base64-encoded string. If no `parameters` are provided, you can
    also provide the image data as a raw bytes payload.
    """
    parameters: Optional[ImageToVideoParameters] = None
    """Additional inference parameters for Image To Video"""


@dataclass_with_extra
class ImageToVideoOutput(BaseInferenceType):
    """Outputs of inference for the Image To Video task"""

    # NOTE: generated from the @huggingface/tasks JSON schema (see file header) — do not edit by hand.
    # Typed `Any` by the generator; per the docstring below this carries the raw video bytes —
    # the concrete runtime type depends on the serving backend.
    video: Any
    """The generated video returned as raw bytes in the payload."""
Loading