From 4533a19ec7ab8b14f395e7bb21dc268c0823e6d7 Mon Sep 17 00:00:00 2001 From: Mo King Date: Mon, 15 Sep 2025 09:34:37 -0400 Subject: [PATCH 1/6] Add timeout options for run_sync --- serverless/endpoints/send-requests.mdx | 27 +++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/serverless/endpoints/send-requests.mdx b/serverless/endpoints/send-requests.mdx index e95760c6..b26664ba 100644 --- a/serverless/endpoints/send-requests.mdx +++ b/serverless/endpoints/send-requests.mdx @@ -112,16 +112,21 @@ export ENDPOINT_ID="YOUR_ENDPOINT_ID" ### `/runsync` -Synchronous jobs wait for completion and return the complete result in a single response. This approach works best for shorter tasks where you need immediate results, interactive applications, and simpler client code without status polling. +Synchronous jobs wait for completion and return the complete result in a single response. This approach works best for shorter tasks where you need immediate results, interactive applications, and simpler client code without status polling. -* **Payload limit**: 20 MB -* **Job availability**: Results are available for 60 seconds after completion +* **Payload limit**: 20 MB +* **Result availability**: 1 minute by default, up to 5 minutes with `?wait=m` or `timeout=s` (depending on the SDK) + +Results are available for 1 minute by default, but you can append `?wait=m` to the request URL to extend the timeout up to 5 minutes, where `m` is the number of milliseconds to store the results, from 1000 (1 second) to 300000 (5 minutes). + +For example, appending `?wait=120000` will keep your results available for 2 minutes. + ```sh curl --request POST \ - --url https://api.runpod.ai/v2/$ENDPOINT_ID/runsync \ + --url https://api.runpod.ai/v2/$ENDPOINT_ID/runsync?wait=120000 \ -H "accept: application/json" \ -H "authorization: $RUNPOD_API_KEY" \ -H "content-type: application/json" \ @@ -130,6 +135,9 @@ curl --request POST \ + +Use the `timeout` parameter to set the duration for results to be available in seconds. For example, `timeout=120` will keep the results for 2 minutes. + ```python import runpod import os @@ -140,7 +148,7 @@ endpoint = runpod.Endpoint(os.getenv("ENDPOINT_ID")) try: run_request = endpoint.run_sync( {"prompt": "Hello, world!"}, - timeout=60, # Timeout in seconds + timeout=120, # Results will be available for 120 seconds (2 minutes), up to 300 (5 minutes) ) print(run_request) except TimeoutError: @@ -149,6 +157,7 @@ except TimeoutError: + ```javascript const { RUNPOD_API_KEY, ENDPOINT_ID } = process.env; import runpodSdk from "runpod-sdk"; @@ -160,6 +169,7 @@ const result = await endpoint.runSync({ "input": { "prompt": "Hello, World!", }, +}, 60000); // Results will be available for 60000 milliseconds (1 minute), up to 300000 (5 minutes) }); console.log(result); @@ -167,6 +177,9 @@ console.log(result); + +Use the `Timeout` parameter to set the duration for results to be available in seconds. For example, `Timeout=60` will keep the results for 60 seconds. + ```go package main @@ -199,7 +212,7 @@ func main() { "prompt": "Hello World", }, }, - Timeout: sdk.Int(120), + Timeout: sdk.Int(60), // Results will be available for 60 seconds, up to 3000 (5 minutes) } output, err := endpoint.RunSync(&jobInput) @@ -826,7 +839,7 @@ You'll see the job status updated to `IN_QUEUE` when the job is retried: ``` -Job results expire after a set period. 
Asynchronous jobs (`/run`) results are available for 30 minutes, while synchronous jobs (`/runsync`) results are available for 1 minute. Once expired, jobs cannot be retried. +Job results expire after a set period. Asynchronous jobs (`/run`) results are available for 30 minutes, while synchronous jobs (`/runsync`) results are available for 1 minute (up to 5 minutes with `?wait=t`). Once expired, jobs cannot be retried. ### `/purge-queue` From 4a59ea03d19adc75e667d0792e321bfe004ad5e5 Mon Sep 17 00:00:00 2001 From: Mo King Date: Mon, 15 Sep 2025 09:36:04 -0400 Subject: [PATCH 2/6] Update --- serverless/endpoints/send-requests.mdx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/serverless/endpoints/send-requests.mdx b/serverless/endpoints/send-requests.mdx index b26664ba..b8f82e1f 100644 --- a/serverless/endpoints/send-requests.mdx +++ b/serverless/endpoints/send-requests.mdx @@ -115,7 +115,7 @@ export ENDPOINT_ID="YOUR_ENDPOINT_ID" Synchronous jobs wait for completion and return the complete result in a single response. This approach works best for shorter tasks where you need immediate results, interactive applications, and simpler client code without status polling. * **Payload limit**: 20 MB -* **Result availability**: 1 minute by default, up to 5 minutes with `?wait=m` or `timeout=s` (depending on the SDK) +* **Result availability**: 1 minute by default, up to 5 minutes with `?wait=m` (for cURL) or `timeout=s` (for SDKs) From 09144f63523dae2b921752aa5b5fe7f1c6bf6399 Mon Sep 17 00:00:00 2001 From: Mo King Date: Tue, 16 Sep 2025 08:57:47 -0400 Subject: [PATCH 3/6] Address comments, improve "how requests work" --- serverless/endpoints/send-requests.mdx | 92 ++++++++++++++---------- serverless/workers/handler-functions.mdx | 24 ++++--- 2 files changed, 68 insertions(+), 48 deletions(-) diff --git a/serverless/endpoints/send-requests.mdx b/serverless/endpoints/send-requests.mdx index b8f82e1f..a4f06968 100644 --- a/serverless/endpoints/send-requests.mdx +++ b/serverless/endpoints/send-requests.mdx @@ -12,26 +12,39 @@ Serverless endpoints provide synchronous and asynchronous job processing with au ## How requests work -After creating a Serverless [endpoint](/serverless/endpoints/overview), you can start sending it **requests** to submit jobs and retrieve results. A request can include parameters, payloads, and headers that define what the endpoint should process. For example, you can send a `POST` request to submit a job, or a `GET` request to check status of a job, retrieve results, or check endpoint health. +After creating a Serverless [endpoint](/serverless/endpoints/overview), you can start sending it **requests** to submit jobs and retrieve results. -A **job** is a unit of work containing the input data from the request, packaged for processing by your [workers](/serverless/workers/overview). If no worker is immediately available, the job is queued. Once a worker is available, the job is processed by the worker using your [handler function](/serverless/workers/handler-functions). +A request can include parameters, payloads, and headers that define what the endpoint should process. For example, you can send a `POST` request to submit a job, or a `GET` request to check status of a job, retrieve results, or check endpoint health. 
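To make this flow concrete, here's a minimal sketch of submitting a job and retrieving its result over plain HTTP. It's illustrative only: it assumes the Python `requests` library, the `RUNPOD_API_KEY` and `ENDPOINT_ID` environment variables used elsewhere on this page, and a worker that accepts a `prompt` input. Each operation it uses is covered in detail in the reference below.

```python
import os
import time
import requests

API_KEY = os.environ["RUNPOD_API_KEY"]
ENDPOINT_ID = os.environ["ENDPOINT_ID"]
BASE_URL = f"https://api.runpod.ai/v2/{ENDPOINT_ID}"
HEADERS = {"authorization": API_KEY, "content-type": "application/json"}

# Submit a job asynchronously with a POST request to /run
job = requests.post(
    f"{BASE_URL}/run",
    headers=HEADERS,
    json={"input": {"prompt": "Hello, world!"}},
    timeout=30,
).json()

# Poll /status with GET requests until the job reaches a terminal state
while True:
    status = requests.get(f"{BASE_URL}/status/{job['id']}", headers=HEADERS, timeout=30).json()
    if status["status"] in ("COMPLETED", "FAILED", "CANCELLED"):
        break
    time.sleep(2)

print(status.get("output"))
```

For shorter tasks, you can replace the separate `/run` and `/status` calls with a single `/runsync` request, as shown later on this page.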
-When you submit a job request, it can be either synchronous or asynchronous depending on the operation you use: +A **job** is a unit of work containing the input data from the request, packaged for processing by your [workers](/serverless/workers/overview). -- `/runsync` submits a synchronous job. A response is returned as soon as the job is complete. -- `/run` submits an asynchronous job. The job is processed in the background, and you can retrieve the result by sending a `GET` request to the `/status` endpoint. +If no worker is immediately available, the job is queued. Once a worker is available, the job is processed by the worker using your [handler function](/serverless/workers/handler-functions). Queue-based endpoints provide a fixed set of operations for submitting and managing jobs. You can find a full list of operations and examples in the [sections below](/serverless/endpoints/send-requests#operation-overview). +## Sync vs. async + +When you submit a job request, it can be either synchronous or asynchronous depending on the operation you use: + +- `/runsync` submits a synchronous job. + - Client waits for the job to complete before returning the result. + - A response is returned as soon as the job is complete. + - Results are available for 1 minute by default (5 minutes max). + - Ideal for quick responses and interactive applications. +- `/run` submits an asynchronous job. + - The job is processed in the background. + - Retrieve the result by sending a `GET` request to the `/status` endpoint. + - Results are available for 30 minutes after completion. + - Ideal for long-running tasks and batch processing. + + If you need to create an endpoint that supports custom API paths, use [load balancing endpoints](/serverless/load-balancing/overview). ## Request input structure -When submitting a job with `/runsync` or `/run`, your request must include a JSON object the the key `input`, containing the parameters required by your worker's [handler function](/serverless/workers/handler-functions). - -For example: +When submitting a job with `/runsync` or `/run`, your request must include a JSON object with the key `input` containing the parameters required by your worker's [handler function](/serverless/workers/handler-functions). For example: ```json { @@ -41,7 +54,7 @@ For example: } ``` -The exact parameters inside the `input` object depend on your specific worker implementation. Check your worker's documentation for required and optional parameters. +The exact parameters required in the `input` object depend on your specific worker implementation (e.g. `prompt` commonly used for endpoints serving LLMs, but not all workers accept it). Check your worker's documentation for a list of required and optional parameters. ## Send requests from the console @@ -112,17 +125,24 @@ export ENDPOINT_ID="YOUR_ENDPOINT_ID" ### `/runsync` -Synchronous jobs wait for completion and return the complete result in a single response. This approach works best for shorter tasks where you need immediate results, interactive applications, and simpler client code without status polling. +Synchronous jobs wait for completion and return the complete result in a single response. This approach works best for shorter tasks where you need immediate results, interactive applications, and simpler client code without status polling. 
-* **Payload limit**: 20 MB -* **Result availability**: 1 minute by default, up to 5 minutes with `?wait=m` (for cURL) or `timeout=s` (for SDKs) +`/runsync` requests have a maximum payload size of 20 MB. - - +Results are available for 1 minute by default, but you can append `?wait=x` to the request URL to extend this up to 5 minutes, where `x` is the number of milliseconds to store the results, from 1000 (1 second) to 300000 (5 minutes). + +For example, `?wait=120000` will keep your results available for 2 minutes: -Results are available for 1 minute by default, but you can append `?wait=m` to the request URL to extend the timeout up to 5 minutes, where `m` is the number of milliseconds to store the results, from 1000 (1 second) to 300000 (5 minutes). +```sh +https://api.runpod.ai/v2/$ENDPOINT_ID/runsync?wait=120000 +``` -For example, appending `?wait=120000` will keep your results available for 2 minutes. + +This is only available for `cURL` and standard HTTP request APIs/libraries. + + + + ```sh curl --request POST \ @@ -136,8 +156,6 @@ curl --request POST \ -Use the `timeout` parameter to set the duration for results to be available in seconds. For example, `timeout=120` will keep the results for 2 minutes. - ```python import runpod import os @@ -148,7 +166,7 @@ endpoint = runpod.Endpoint(os.getenv("ENDPOINT_ID")) try: run_request = endpoint.run_sync( {"prompt": "Hello, world!"}, - timeout=120, # Results will be available for 120 seconds (2 minutes), up to 300 (5 minutes) + timeout=60, # Client timeout in seconds ) print(run_request) except TimeoutError: @@ -169,7 +187,8 @@ const result = await endpoint.runSync({ "input": { "prompt": "Hello, World!", }, -}, 60000); // Results will be available for 60000 milliseconds (1 minute), up to 300000 (5 minutes) + timeout: 60000, // Client timeout in milliseconds +}); }); console.log(result); @@ -178,8 +197,6 @@ console.log(result); -Use the `Timeout` parameter to set the duration for results to be available in seconds. For example, `Timeout=60` will keep the results for 60 seconds. - ```go package main @@ -212,7 +229,7 @@ func main() { "prompt": "Hello World", }, }, - Timeout: sdk.Int(60), // Results will be available for 60 seconds, up to 3000 (5 minutes) + Timeout: sdk.Int(60), // Client timeout in seconds } output, err := endpoint.RunSync(&jobInput) @@ -251,8 +268,9 @@ func main() { Asynchronous jobs process in the background and return immediately with a job ID. This approach works best for longer-running tasks that don't require immediate results, operations requiring significant processing time, and managing multiple concurrent jobs. -* **Payload limit**: 10 MB -* **Job availability**: Results are available for 30 minutes after completion +`/run` requests have a maximum payload size of 10 MB. + +Job results are available for 30 minutes after completion. @@ -355,14 +373,22 @@ func main() { + +TODO + + + + +`/run` requests return a response with the job ID and status: + ```json { "id": "eaebd6e7-6a92-4bb8-a911-f996ac5ea99d", "status": "IN_QUEUE" } ``` - - + +Further results must be retrieved using the `/status` endpoint. ### `/status` @@ -1110,14 +1136,4 @@ Here are some common issues and suggested solutions: | Rate limiting | Too many requests in short time | Implement backoff strategy, batch requests when possible | | Missing results | Results expired | Retrieve results within expiration window (30 min for async, 1 min for sync) | -Implementing proper error handling and retry logic will make your integrations more robust and reliable. 
-
-## Related resources
-
-* [Endpoint configurations](/serverless/endpoints/endpoint-configurations)
-* [Python SDK for endpoints](/sdks/python/endpoints)
-* [JavaScript SDK for endpoints](/sdks/javascript/endpoints)
-* [Go SDK for endpoints](/sdks/go/endpoints)
-* [Handler functions](/serverless/workers/handler-functions)
-* [Local testing](/serverless/development/local-testing)
-* [GitHub integration](/serverless/workers/github-integration)
+Implementing proper error handling and retry logic will make your integrations more robust and reliable.
\ No newline at end of file
diff --git a/serverless/workers/handler-functions.mdx b/serverless/workers/handler-functions.mdx
index 150d8197..278b77d7 100644
--- a/serverless/workers/handler-functions.mdx
+++ b/serverless/workers/handler-functions.mdx
@@ -15,14 +15,16 @@ Before building a handler function, you should understand the structure of job r

```json
{
-  "id": "A_RANDOM_JOB_IDENTIFIER",
-  "input": { "key": "value" }
+  "id": "eaebd6e7-6a92-4bb8-a911-f996ac5ea99d",
+  "input": {
+    "key": "value"
+  }
}
```

-Your handler will access the `input` field to process the request data.
+`id` is a randomly generated unique identifier for the job, while `input` contains the data for your worker to process.

-To learn more about endpoint requests, see [Send requests](/serverless/endpoints/send-requests).
+To learn more about endpoint requests, see [Send API requests](/serverless/endpoints/send-requests).

## Basic handler implementation

@@ -33,13 +35,15 @@
import runpod

def handler(job):
    job_input = job["input"]  # Access the input from the request

+    # Add your custom code here
+
    return "Your job results"

runpod.serverless.start({"handler": handler})  # Required
```

-The handler takes a request, extracts the input, processes it, and returns a result. The `runpod.serverless.start()` function launches your serverless application with the specified handler.
+The handler extracts the input from the job request, processes it, and returns a result. The `runpod.serverless.start()` function launches your serverless application with the specified handler.

## Local testing

@@ -73,7 +77,7 @@ You can create several types of handler functions depending on the needs of your

### Standard handlers

-The simplest handler type, standard handlers process inputs synchronously and return results directly.
+The simplest handler type, standard handlers process inputs synchronously and return them when the job is complete.

```python
import runpod

@@ -110,7 +114,7 @@ runpod.serverless.start({
})
```

-By default, outputs from streaming handlers are only available at the `/stream` endpoint. Set `return_aggregate_stream` to `True` to make outputs available from the `/run` and `/runsync` endpoints as well.
+By default, outputs from streaming handlers are only available using the `/stream` operation. Set `return_aggregate_stream` to `True` to make outputs available from the `/run` and `/runsync` operations as well.
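For context on how clients consume these partial results, here's a rough sketch that polls the `/stream` operation for a job that was submitted to a generator handler with `/run`. This is a client-side example rather than part of the handler, and it makes a few assumptions: the Python `requests` library, the `RUNPOD_API_KEY` and `ENDPOINT_ID` environment variables, and a placeholder job ID. See [Send API requests](/serverless/endpoints/send-requests) for the full operation reference.

```python
import os
import time
import requests

API_KEY = os.environ["RUNPOD_API_KEY"]
ENDPOINT_ID = os.environ["ENDPOINT_ID"]
JOB_ID = "YOUR_JOB_ID"  # Returned by an earlier /run request
BASE_URL = f"https://api.runpod.ai/v2/{ENDPOINT_ID}"
HEADERS = {"authorization": API_KEY}

# Poll /stream for partial generator output until the job reaches a terminal state
while True:
    chunks = requests.get(f"{BASE_URL}/stream/{JOB_ID}", headers=HEADERS, timeout=30).json()
    print(chunks)  # Partial results yielded by the handler so far

    status = requests.get(f"{BASE_URL}/status/{JOB_ID}", headers=HEADERS, timeout=30).json()
    if status["status"] in ("COMPLETED", "FAILED", "CANCELLED"):
        break
    time.sleep(1)
```

With `return_aggregate_stream` enabled, the same combined output is also returned by `/run` and `/runsync`, so clients that don't need partial results can skip `/stream` entirely.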
### Asynchronous handlers @@ -228,7 +232,7 @@ def handler(job): runpod.serverless.start( { "handler": handler, # Required: Specify the sync handler - "return_aggregate_stream": True, # Optional: Aggregate results are accessible via /run endpoint + "return_aggregate_stream": True, # Optional: Aggregate results are accessible via /run operation } ) ``` @@ -269,8 +273,8 @@ A short list of best practices to keep in mind as you build your handler functio Be aware of payload size limits when designing your handler: -* `/run` endpoint: 10 MB -* `/runsync` endpoint: 20 MB +* `/run` operation: 10 MB +* `/runsync` operation: 20 MB If your results exceed these limits, consider stashing them in cloud storage and returning links instead. From 1647e0d692683a0c6887de15b6d45e994ab15ead Mon Sep 17 00:00:00 2001 From: Mo King Date: Tue, 16 Sep 2025 09:06:15 -0400 Subject: [PATCH 4/6] Remove responses from tabs --- serverless/endpoints/send-requests.mdx | 53 +++++++++++--------------- 1 file changed, 22 insertions(+), 31 deletions(-) diff --git a/serverless/endpoints/send-requests.mdx b/serverless/endpoints/send-requests.mdx index a4f06968..81a5da24 100644 --- a/serverless/endpoints/send-requests.mdx +++ b/serverless/endpoints/send-requests.mdx @@ -18,9 +18,9 @@ A request can include parameters, payloads, and headers that define what the end A **job** is a unit of work containing the input data from the request, packaged for processing by your [workers](/serverless/workers/overview). -If no worker is immediately available, the job is queued. Once a worker is available, the job is processed by the worker using your [handler function](/serverless/workers/handler-functions). +If no worker is immediately available, the job is queued. Once a worker is available, the job is processed using your worker's [handler function](/serverless/workers/handler-functions). -Queue-based endpoints provide a fixed set of operations for submitting and managing jobs. You can find a full list of operations and examples in the [sections below](/serverless/endpoints/send-requests#operation-overview). +Queue-based endpoints provide a fixed set of operations for submitting and managing jobs. You can find a full list of operations and sample code in the [sections below](/serverless/endpoints/send-requests#operation-overview). ## Sync vs. async @@ -37,11 +37,6 @@ When you submit a job request, it can be either synchronous or asynchronous depe - Results are available for 30 minutes after completion. - Ideal for long-running tasks and batch processing. - - -If you need to create an endpoint that supports custom API paths, use [load balancing endpoints](/serverless/load-balancing/overview). - - ## Request input structure When submitting a job with `/runsync` or `/run`, your request must include a JSON object with the key `input` containing the parameters required by your worker's [handler function](/serverless/workers/handler-functions). For example: @@ -94,6 +89,10 @@ Here's a quick overview of the operations available for queue-based endpoints: | `/purge-queue` | POST | Clear all pending jobs from the queue without affecting jobs already in progress. | | `/health` | GET | Monitor the operational status of your endpoint, including worker and job statistics. | + +If you need to create an endpoint that supports custom API paths, use [load balancing endpoints](/serverless/load-balancing/overview). + + ## Operation reference Below you'll find detailed explanations and examples for each operation using `cURL` and the Runpod SDK. 
@@ -138,7 +137,7 @@ https://api.runpod.ai/v2/$ENDPOINT_ID/runsync?wait=120000 ``` -This is only available for `cURL` and standard HTTP request APIs/libraries. +`?wait` is only available for `cURL` and standard HTTP request libraries. @@ -146,7 +145,7 @@ This is only available for `cURL` and standard HTTP request APIs/libraries. ```sh curl --request POST \ - --url https://api.runpod.ai/v2/$ENDPOINT_ID/runsync?wait=120000 \ + --url https://api.runpod.ai/v2/$ENDPOINT_ID/runsync \ -H "accept: application/json" \ -H "authorization: $RUNPOD_API_KEY" \ -H "content-type: application/json" \ @@ -242,10 +241,9 @@ func main() { } ``` + - - -`/runsync` requests return a response as soon as the job is complete: +`/runsync` returns a response as soon as the job is complete: ```json { @@ -261,8 +259,6 @@ func main() { "status": "COMPLETED" } ``` - - ### `/run` @@ -372,14 +368,9 @@ func main() { ``` - - -TODO - - -`/run` requests return a response with the job ID and status: +`/run` returns a response with the job ID and status: ```json { @@ -388,15 +379,21 @@ TODO } ``` -Further results must be retrieved using the `/status` endpoint. +Further results must be retrieved using the `/status` operation. ### `/status` -Check the current state, execution statistics, and results of previously submitted jobs. The status endpoint provides the current job state, execution statistics like queue delay and processing time, and job output if completed. +Check the current state, execution statistics, and results of previously submitted jobs. The status operation provides the current job state, execution statistics like queue delay and processing time, and job output if completed. + + +You can configure time-to-live (TTL) for individual jobs by appending a TTL parameter to the request URL. + +For example, `https://api.runpod.ai/v2/$ENDPOINT_ID/status/YOUR_JOB_ID?ttl=6000` sets the TTL to 6 seconds. + -Replace `YOUR_JOB_ID` with the actual job ID you received in the response to the `/run` request. +Replace `YOUR_JOB_ID` with the actual job ID you received in the response to the `/run` operation. ```sh curl --request GET \ @@ -515,9 +512,9 @@ func main() { ``` - + -`/status` requests return a JSON response with the job status (e.g. `IN_QUEUE`, `IN_PROGRESS`, `COMPLETED`, `FAILED`), and an optional `output` field if the job is completed: +`/status` returns a JSON response with the job status (e.g. `IN_QUEUE`, `IN_PROGRESS`, `COMPLETED`, `FAILED`), and an optional `output` field if the job is completed: ```json { @@ -532,12 +529,6 @@ func main() { "status": "COMPLETED" } ``` - - - - -You can configure time-to-live (TTL) for individual jobs by appending a TTL parameter: `https://api.runpod.ai/v2/$ENDPOINT_ID/status/YOUR_JOB_ID?ttl=6000` sets the TTL to 6 seconds. - ### `/stream` From 08bf56175fa43bf11ff327a48170934eb04977eb Mon Sep 17 00:00:00 2001 From: Mo King Date: Tue, 16 Sep 2025 12:24:07 -0400 Subject: [PATCH 5/6] Remove responses from tabs --- serverless/endpoints/send-requests.mdx | 40 +++---- serverless/storage/model-repo.mdx | 145 +++++++++++++++++++++++++ 2 files changed, 165 insertions(+), 20 deletions(-) create mode 100644 serverless/storage/model-repo.mdx diff --git a/serverless/endpoints/send-requests.mdx b/serverless/endpoints/send-requests.mdx index 81a5da24..60878917 100644 --- a/serverless/endpoints/send-requests.mdx +++ b/serverless/endpoints/send-requests.mdx @@ -659,7 +659,14 @@ func main() { ``` - + + + +The maximum size for a single streamed payload chunk is 1 MB. 
Larger outputs will be split across multiple chunks. + + +Streaming response format: + ```json [ { @@ -684,12 +691,6 @@ func main() { } ] ``` - - - - -The maximum size for a single streamed payload chunk is 1 MB. Larger outputs will be split across multiple chunks. - ### `/cancel` @@ -824,15 +825,18 @@ func main() { ``` - + + + +`/cancel` requests return a JSON response with the status of the cancel operation: + ```json { "id": "724907fe-7bcc-4e42-998d-52cb93e1421f-u1", "status": "CANCELLED" } ``` - - + ### `/retry` @@ -911,7 +915,11 @@ main(); ``` - + + + +`/purge-queue` operation only affects jobs waiting in the queue. Jobs already in progress will continue to run. + `/purge-queue` requests return a JSON response with the number of jobs removed from the queue and the status of the purge operation: @@ -921,12 +929,6 @@ main(); "status": "completed" } ``` - - - - -`/purge-queue` operation only affects jobs waiting in the queue. Jobs already in progress will continue to run. - ### `/health` @@ -970,7 +972,7 @@ console.log(health); ``` - + `/health` requests return a JSON response with the current status of the endpoint, including the number of jobs completed, failed, in progress, in queue, and retried, as well as the status of workers. @@ -989,8 +991,6 @@ console.log(health); } } ``` - - ## vLLM and OpenAI requests diff --git a/serverless/storage/model-repo.mdx b/serverless/storage/model-repo.mdx new file mode 100644 index 00000000..c67f212b --- /dev/null +++ b/serverless/storage/model-repo.mdx @@ -0,0 +1,145 @@ +--- +title: "Model repository" +sidebarTitle: "Model repository" +description: "Upload models to Runpod to speed up worker starts and reduce costs for your Serverless endpoints." +tag: "BETA" +--- + + + +The private model repository feature is currently in beta. Please [join our Discord](https://discord.gg/runpod) if you'd like to provide feedback. + + + +This guide provides an overview of how to use cached models with your Serverless endpoints, and instructions for managing private models with the [Runpod CLI](/runpodctl/overview). + +The Runpod model repository allows you to upload your models directly to the Runpod ecosystem. By pre-caching models on our infrastructure, you can significantly reduce worker start times, lower costs, and improve the reliability of your Serverless endpoints. + +## Overview + +Using cached models provides several key advantages: + +- **Faster cold start times:** Public models or private models stored in the repository are pre-cached on Runpod's infrastructure, eliminating the need for workers to download them from external sources like Hugging Face. +- **Reduced costs:** You aren't billed for worker time while your model is being downloaded. This is especially impactful for large models that can take several minutes to load. +- **Centralized model management:** Manage all your models directly within Runpod without needing to switch between platforms like Hugging Face or other model repositories. +- **Accelerated deployment:** Deploy pre-cached models instantly without waiting for external downloads or transfers. +- **Version control:** Store and manage different versions of the same model, allowing you to easily switch between versions for testing or production deployments. +- **Smaller container images:** By decoupling models from your container image, you can create smaller, more focused images that contain only your serving logic. + + +## Public vs. 
private models + +There are two types of cached models: + +**Public models** are popular models that Runpod has pre-cached for all users. These models appear automatically in your model selection dropdown and don't require any upload process. You can start using them immediately when creating or updating endpoints. + +**Private models** are models you upload to the repository using the Runpod CLI (`runpodctl`). Once uploaded, these models appear in the model selection dropdown alongside public models, giving you the same performance benefits while maintaining control over your proprietary or customized models. + +## How it works + +When you select a model during [Serverless endpoint creation](#use-models-in-serverless), Runpod automatically tries to start your workers on hosts that already contain your selected model. + +If no pre-cached host machines are available, the system delays starting your workers until the model download completes, ensuring you still won't be charged for the download time. + +The private model repository feature is available at **no additional cost** during the beta launch period. + +## Manage private models + +Make sure you've [installed the CLI](/runpodctl/install-runpodctl) and configured it with your API key. + +### Upload a model + +You can upload any model from the [Hugging Face Model Hub](https://huggingface.co/models) to the Runpod repository using the model identifier. + +To upload a model from Hugging Face, run the following command: + +```bash +runpodctl create model \ + --provider huggingface \ + --name YOUR_MODEL_NAME +``` + +Replace `YOUR_MODEL_NAME` with the model identifier from Hugging Face. + +For example, to upload the `stable-diffusion-xl-refiner-1.0` model, run: + +```bash +runpodctl create model \ + --provider huggingface \ + --name stabilityai/stable-diffusion-xl-refiner-1.0 + +``` + +### List your models + +To see a list of all models you've uploaded to the repository, run the following command: + +```bash +runpodctl get models +``` + +This will display all the models in your repository, allowing you to confirm successful uploads and check for duplicates. + +You should see output similar to the following: + +```bash +ID NAME SOURCE STATUS SIZE(GB) VERSION(SHORT) +mdl_123 custom-llama-v1 HUGGING_FACE READY 24.7 9f1c2ab +mdl_456 llama31-8b HUGGING_FACE DOWNLOADING - - +``` + +### Get model details + +To get detailed information about a specific model, run: + +```bash +runpodctl get model YOUR_MODEL_ID +``` + +Replace `YOUR_MODEL_ID` with the ID of your uploaded model. + +For example, running `runpodctl get model 4oqrsweux0fkcp` on the example output above would return: + +```shell +provider: huggingface +name: stabilityai/stable-diffusion-xl-refiner-1.0 +createdDate: 2023-08-03T22:31:36.289Z +storagePath: /stabilityai-stable-diffusion-xl-refiner-1.0/ +id: 4oqrsweux0fkcp +bucketId: pllmb-staging-cloud +regionSpecs: +- regionName: Staging + bucketName: pllmb-staging-cloud + multiplier: 8 + maxQuantity: 30 + maxIncrement: 5 + amount: 22 +``` +### Remove a model + +When you no longer need a model uploaded to the private repository, you can remove it using `runpodctl`. This cleans up your repository list and frees up storage space. + +To remove a model, run the following command: + +```bash +runpodctl remove model \ + --provider huggingface \ + --name lodestones/Chroma +``` + + +Before removing a model, ensure that none of your active endpoints are using it. 
+ + +## Use models in Serverless + +When creating a new Serverless endpoint or updating an existing one, you can select models from your private model repository. + +To select a model from the repository, follow these steps: + +1. Navigate to the [Serverless section](https://www.console.runpod.io/serverless) of the Runpod console. +2. Click **New Endpoint**, or edit an existing endpoint. +3. In the **Endpoint Configuration** step, scroll down to **Model (optional)** and click the dropdown. Your uploaded models will be listed under **Organization Repository**. +4. Select your model from the list. +5. Enter a Hugging Face token if you're using a gated model. +6. Complete your endpoint configuration and click **Deploy Endpoint**. \ No newline at end of file From 13def76018487af1ec8159e1154eaa7e472e39e6 Mon Sep 17 00:00:00 2001 From: Mo King Date: Fri, 3 Oct 2025 10:08:14 -0400 Subject: [PATCH 6/6] Delete serverless/storage/model-repo.mdx --- serverless/storage/model-repo.mdx | 145 ------------------------------ 1 file changed, 145 deletions(-) delete mode 100644 serverless/storage/model-repo.mdx diff --git a/serverless/storage/model-repo.mdx b/serverless/storage/model-repo.mdx deleted file mode 100644 index c67f212b..00000000 --- a/serverless/storage/model-repo.mdx +++ /dev/null @@ -1,145 +0,0 @@ ---- -title: "Model repository" -sidebarTitle: "Model repository" -description: "Upload models to Runpod to speed up worker starts and reduce costs for your Serverless endpoints." -tag: "BETA" ---- - - - -The private model repository feature is currently in beta. Please [join our Discord](https://discord.gg/runpod) if you'd like to provide feedback. - - - -This guide provides an overview of how to use cached models with your Serverless endpoints, and instructions for managing private models with the [Runpod CLI](/runpodctl/overview). - -The Runpod model repository allows you to upload your models directly to the Runpod ecosystem. By pre-caching models on our infrastructure, you can significantly reduce worker start times, lower costs, and improve the reliability of your Serverless endpoints. - -## Overview - -Using cached models provides several key advantages: - -- **Faster cold start times:** Public models or private models stored in the repository are pre-cached on Runpod's infrastructure, eliminating the need for workers to download them from external sources like Hugging Face. -- **Reduced costs:** You aren't billed for worker time while your model is being downloaded. This is especially impactful for large models that can take several minutes to load. -- **Centralized model management:** Manage all your models directly within Runpod without needing to switch between platforms like Hugging Face or other model repositories. -- **Accelerated deployment:** Deploy pre-cached models instantly without waiting for external downloads or transfers. -- **Version control:** Store and manage different versions of the same model, allowing you to easily switch between versions for testing or production deployments. -- **Smaller container images:** By decoupling models from your container image, you can create smaller, more focused images that contain only your serving logic. - - -## Public vs. private models - -There are two types of cached models: - -**Public models** are popular models that Runpod has pre-cached for all users. These models appear automatically in your model selection dropdown and don't require any upload process. You can start using them immediately when creating or updating endpoints. 
- -**Private models** are models you upload to the repository using the Runpod CLI (`runpodctl`). Once uploaded, these models appear in the model selection dropdown alongside public models, giving you the same performance benefits while maintaining control over your proprietary or customized models. - -## How it works - -When you select a model during [Serverless endpoint creation](#use-models-in-serverless), Runpod automatically tries to start your workers on hosts that already contain your selected model. - -If no pre-cached host machines are available, the system delays starting your workers until the model download completes, ensuring you still won't be charged for the download time. - -The private model repository feature is available at **no additional cost** during the beta launch period. - -## Manage private models - -Make sure you've [installed the CLI](/runpodctl/install-runpodctl) and configured it with your API key. - -### Upload a model - -You can upload any model from the [Hugging Face Model Hub](https://huggingface.co/models) to the Runpod repository using the model identifier. - -To upload a model from Hugging Face, run the following command: - -```bash -runpodctl create model \ - --provider huggingface \ - --name YOUR_MODEL_NAME -``` - -Replace `YOUR_MODEL_NAME` with the model identifier from Hugging Face. - -For example, to upload the `stable-diffusion-xl-refiner-1.0` model, run: - -```bash -runpodctl create model \ - --provider huggingface \ - --name stabilityai/stable-diffusion-xl-refiner-1.0 - -``` - -### List your models - -To see a list of all models you've uploaded to the repository, run the following command: - -```bash -runpodctl get models -``` - -This will display all the models in your repository, allowing you to confirm successful uploads and check for duplicates. - -You should see output similar to the following: - -```bash -ID NAME SOURCE STATUS SIZE(GB) VERSION(SHORT) -mdl_123 custom-llama-v1 HUGGING_FACE READY 24.7 9f1c2ab -mdl_456 llama31-8b HUGGING_FACE DOWNLOADING - - -``` - -### Get model details - -To get detailed information about a specific model, run: - -```bash -runpodctl get model YOUR_MODEL_ID -``` - -Replace `YOUR_MODEL_ID` with the ID of your uploaded model. - -For example, running `runpodctl get model 4oqrsweux0fkcp` on the example output above would return: - -```shell -provider: huggingface -name: stabilityai/stable-diffusion-xl-refiner-1.0 -createdDate: 2023-08-03T22:31:36.289Z -storagePath: /stabilityai-stable-diffusion-xl-refiner-1.0/ -id: 4oqrsweux0fkcp -bucketId: pllmb-staging-cloud -regionSpecs: -- regionName: Staging - bucketName: pllmb-staging-cloud - multiplier: 8 - maxQuantity: 30 - maxIncrement: 5 - amount: 22 -``` -### Remove a model - -When you no longer need a model uploaded to the private repository, you can remove it using `runpodctl`. This cleans up your repository list and frees up storage space. - -To remove a model, run the following command: - -```bash -runpodctl remove model \ - --provider huggingface \ - --name lodestones/Chroma -``` - - -Before removing a model, ensure that none of your active endpoints are using it. - - -## Use models in Serverless - -When creating a new Serverless endpoint or updating an existing one, you can select models from your private model repository. - -To select a model from the repository, follow these steps: - -1. Navigate to the [Serverless section](https://www.console.runpod.io/serverless) of the Runpod console. -2. Click **New Endpoint**, or edit an existing endpoint. -3. 
In the **Endpoint Configuration** step, scroll down to **Model (optional)** and click the dropdown. Your uploaded models will be listed under **Organization Repository**. -4. Select your model from the list. -5. Enter a Hugging Face token if you're using a gated model. -6. Complete your endpoint configuration and click **Deploy Endpoint**. \ No newline at end of file