From 3b6424f98f579cb5061652cff7cd63bfd3a5f9cf Mon Sep 17 00:00:00 2001
From: Mo King
Date: Mon, 25 Aug 2025 09:54:25 -0400
Subject: [PATCH 1/4] Add model store details to manage-endpoints and endpoint-configurations

---
 serverless/endpoints/endpoint-configurations.mdx | 6 ++++++
 serverless/endpoints/manage-endpoints.mdx        | 3 ++-
 2 files changed, 8 insertions(+), 1 deletion(-)

diff --git a/serverless/endpoints/endpoint-configurations.mdx b/serverless/endpoints/endpoint-configurations.mdx
index 74ba468f..98455b8a 100644
--- a/serverless/endpoints/endpoint-configurations.mdx
+++ b/serverless/endpoints/endpoint-configurations.mdx
@@ -143,6 +143,12 @@ The effectiveness of FlashBoot increases exponentially with higher request volume
 
+### Model
+
+You can select from a list of pre-cached ML models using the **Model (optional)** field. Selecting a model signals the system to place your workers on host machines that contain the selected model, resulting in faster cold starts and cost savings (since you won't be charged while the model is downloading).
+
+If the requested model isn't cached on the assigned host, the system will defer starting your workers until after the model finishes downloading, ensuring that you aren't charged during the download process.
+
 ## Advanced endpoint configuration
 
 When configuring advanced settings, remember that each constraint (data center, storage, CUDA version, GPU type) may limit resource availability. For maximum availability and reliability, select all data centers and CUDA versions, and avoid network volumes unless your workload specifically requires them.
diff --git a/serverless/endpoints/manage-endpoints.mdx b/serverless/endpoints/manage-endpoints.mdx
index 4f4a26bc..3274566c 100644
--- a/serverless/endpoints/manage-endpoints.mdx
+++ b/serverless/endpoints/manage-endpoints.mdx
@@ -19,7 +19,8 @@ Create a new Serverless endpoint through the Runpod web interface:
 5. Configure your endpoint settings:
    * **Endpoint Name**: The display name for your endpoint in the console.
    * **Endpoint Type**: Select **Queue** for traditional queue-based processing or **Load balancer** for direct HTTP access (see [Load balancing endpoints](/serverless/load-balancing/overview) for details).
-   * **GPU Configuration**: select the appropriate GPU types and configure worker settings.
+   * **GPU Configuration**: Select the appropriate GPU types and configure worker settings.
+   * **Model (optional)**: Select a model from Hugging Face to optimize worker startup times. When you specify a model, Runpod attempts to place your workers on host machines that already have the model cached locally, resulting in faster cold starts and cost savings (since you won't be charged while the model is downloading). You can either select from the dropdown list of pre-cached models or enter a custom Hugging Face model URL.
    * **Container Configuration**: Edit the container start command, specify the [container disk size](/serverless/storage/overview), and expose HTTP/TCP ports.
    * **Environment Variables**: Add [environment variables](/serverless/development/environment-variables) for your worker containers.
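The **Model** field described in this patch pairs with the `--model` flag that the next patch adds to `runpodctl create pod`. Below is a minimal sketch of requesting a cached model from the command line, assuming the flag takes a model identifier as its argument; the model name is a hypothetical placeholder, not a guaranteed cache entry:

```bash
# Hedged sketch: ask for placement on a host that already caches the model.
# "stable-diffusion-v1-5" is a placeholder; substitute a model from the
# Model (optional) dropdown in the console.
runpodctl create pod \
  --volumeSize 20 \
  --model "stable-diffusion-v1-5"
```

As with endpoints, placement on a pre-cached host is best effort; these patches don't state whether pods defer billing while an uncached model downloads.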
From fe4cc6fe80b8594a326be368a48dec2efe08357b Mon Sep 17 00:00:00 2001
From: Mo King
Date: Tue, 16 Sep 2025 08:09:39 -0400
Subject: [PATCH 2/4] Add new CLI pages

---
 runpodctl/reference/runpodctl-create-model.mdx   | 0
 runpodctl/reference/runpodctl-create-pod.mdx     | 1 +
 runpodctl/reference/runpodctl-describe-model.mdx | 0
 runpodctl/reference/runpodctl-get-models.mdx     | 0
 runpodctl/reference/runpodctl-remove-model.mdx   | 0
 5 files changed, 1 insertion(+)
 create mode 100644 runpodctl/reference/runpodctl-create-model.mdx
 create mode 100644 runpodctl/reference/runpodctl-describe-model.mdx
 create mode 100644 runpodctl/reference/runpodctl-get-models.mdx
 create mode 100644 runpodctl/reference/runpodctl-remove-model.mdx

diff --git a/runpodctl/reference/runpodctl-create-model.mdx b/runpodctl/reference/runpodctl-create-model.mdx
new file mode 100644
index 00000000..e69de29b
diff --git a/runpodctl/reference/runpodctl-create-pod.mdx b/runpodctl/reference/runpodctl-create-pod.mdx
index 0f33cdf1..7f5c3d7f 100644
--- a/runpodctl/reference/runpodctl-create-pod.mdx
+++ b/runpodctl/reference/runpodctl-create-pod.mdx
@@ -36,6 +36,7 @@ runpodctl create pod [flags]
       --volumePath string        container volume path (default "/runpod")
       --volumeSize int           persistent volume disk size in GB (default 1)
       --networkVolumeId string   network volume id
+      --model string             model to use with the pod
 ```
 
 ### SEE ALSO
diff --git a/runpodctl/reference/runpodctl-describe-model.mdx b/runpodctl/reference/runpodctl-describe-model.mdx
new file mode 100644
index 00000000..e69de29b
diff --git a/runpodctl/reference/runpodctl-get-models.mdx b/runpodctl/reference/runpodctl-get-models.mdx
new file mode 100644
index 00000000..e69de29b
diff --git a/runpodctl/reference/runpodctl-remove-model.mdx b/runpodctl/reference/runpodctl-remove-model.mdx
new file mode 100644
index 00000000..e69de29b

From b54956b2f16e235cea8a83330bd2eabc0d555a64 Mon Sep 17 00:00:00 2001
From: Mo King
Date: Tue, 16 Sep 2025 11:33:50 -0400
Subject: [PATCH 3/4] Update "how it works"

---
 serverless/storage/model-repo.mdx | 61 +++++++++++--------------------
 1 file changed, 22 insertions(+), 39 deletions(-)

diff --git a/serverless/storage/model-repo.mdx b/serverless/storage/model-repo.mdx
index 36e34086..d86f5726 100644
--- a/serverless/storage/model-repo.mdx
+++ b/serverless/storage/model-repo.mdx
@@ -19,14 +19,26 @@ The Runpod private model repository allows you to upload your models directly to the Runpod ecosystem.
 
 Using the private model repository provides several key advantages:
 
-- **Faster cold start times:** Models stored in the repository are pre-cached on Runpod's infrastructure, eliminating the need for workers to download them from external sources like Hugging Face. Our smart scheduler also prioritizes placing your jobs on machines that already have the required model, leading to near-instantaneous starts.
+- **Faster cold start times:** Models stored in the repository are pre-cached on Runpod's infrastructure, eliminating the need for workers to download them from external sources like Hugging Face.
 - **Reduced costs:** You aren't billed for worker time while your model is being downloaded. This is especially impactful for large models that can take several minutes to load.
 - **Improved reliability:** Storing models on Runpod reduces your dependency on external services, which can experience downtime or rate-limiting.
 - **Smaller container images:** By decoupling models from your container image, you can create smaller, more focused images that contain only your serving logic.
+## How it works
+
+You can upload and manage models using the Runpod CLI (`runpodctl`).
+
+When you [select a model](#use-models-in-serverless-endpoints) during Serverless endpoint creation, Runpod will automatically try to start your workers on hosts that already contain your selected model.
+
+If no pre-cached host machines are available, the system will delay starting your workers until the model download completes, so you still won't be charged for the download time.
+
+## Pricing
+
+The private model repository feature is available at no additional cost during the beta launch period.
+
 ## Manage private models
 
-Models can be uploaded and managed using the Runpod CLI (`runpodctl`). Make sure you've [installed runpodctl](/runpodctl/install-runpodctl) and [configured your API key](/runpodctl/overview#configure-your-api-key).
+Make sure you've [installed runpodctl](/runpodctl/install-runpodctl) and [configured your API key](/runpodctl/overview#configure-your-api-key).
 
 ### Upload a model
 
@@ -51,7 +63,7 @@ runpodctl create model \
 ```
 
-### List your uploaded models
+### List your models
 
 To see a list of all models you've uploaded to the repository, run the following command:
 
@@ -98,7 +110,7 @@ regionSpecs:
 ```
 
 ### Remove a model
 
-When you no longer need a model uploaded to the global repository, you can remove it using `runpodctl`. This cleans up your repository list and frees up storage space.
+When you no longer need a model uploaded to the private repository, you can remove it using `runpodctl`. This cleans up your repository list and frees up storage space.
 
 To remove a model, run the following command:
 
@@ -114,42 +126,13 @@ Before removing a model, ensure that none of your active endpoints are using it.
 
 ## Use models in Serverless endpoints
 
-Once you've uploaded a model to the repository, you can use it in your Serverless endpoints to improve initialization performance.
-
-### Select private repository models when creating endpoints
+When creating a new Serverless endpoint or updating an existing one, you can select models from your private model repository.
 
-When creating a new Serverless endpoint or updating an existing one, you can select models from your organization repository.
+To select a model from the repository, follow these steps:
 
-To select a model from the repository, perform the following steps:
-
-1. In the [Runpod console](https://www.console.runpod.io/serverless), go to **Serverless** > **Endpoints**.
+1. Navigate to the [Serverless section](https://www.console.runpod.io/serverless) of the Runpod console.
 2. Click **New Endpoint**, or edit an existing endpoint.
-3. In the **Endpoint Configuration** screen, scroll down to **Model (optional)** and click the dropdown. Your uploaded models will be listed under **Organization Repository**.
+3. In the **Endpoint Configuration** step, scroll down to **Model (optional)** and click the dropdown. Your uploaded models will be listed under **Organization Repository**.
 4. Select your model from the list.
-5. Complete your endpoint configuration and click **Create Endpoint**.
-
-## How it works
-
-### Smart placement
-
-When you use a cached model at endpoint creation time, Runpod automatically checks throughout the infrastructure for hosts that already contain your selected model.
-
-If no pre-cached host workers are available, the system still gracefully provides fallbacks and will download the model to unoccupied workers, prioritizing local access.
-
-### Graceful fallback
-
-For scenarios where pre-cached models aren't immediately available, Runpod implements intelligent fallback handling:
-
-For uncached models:
-- Workers are started immediately during endpoint creation when the specs are selected.
-- Model download happens concurrently in the background, then the download process passes the right weights and config to the container code after extraction.
-
-For cached models:
-- Smart placement: Will check if it exists on machine.
-- Placement with pre-waiting is essentially identical to a fully downloaded model.
-
-The model caching process is largely transparent—you simply specify the desired cached model for your endpoint with deployment, and Runpod handles recognizing and optimizing both initial downloads and subsequent cached access.
-
-## Pricing
-
-The private model repository feature is available at no additional cost during the beta launch period.
\ No newline at end of file
+5. Enter a Hugging Face token if you're using a gated model.
+6. Complete your endpoint configuration and click **Deploy Endpoint**.
\ No newline at end of file

From 5941aa7bb50690d058396b750c25f100c040626d Mon Sep 17 00:00:00 2001
From: Mo King
Date: Tue, 16 Sep 2025 12:08:09 -0400
Subject: [PATCH 4/4] Explain public vs. private

---
 serverless/storage/model-repo.mdx | 35 ++++++++++++++++++-------------
 1 file changed, 21 insertions(+), 14 deletions(-)

diff --git a/serverless/storage/model-repo.mdx b/serverless/storage/model-repo.mdx
index d86f5726..c67f212b 100644
--- a/serverless/storage/model-repo.mdx
+++ b/serverless/storage/model-repo.mdx
@@ -1,5 +1,5 @@
 ---
-title: "Private model repository"
+title: "Model repository"
 sidebarTitle: "Model repository"
 description: "Upload models to Runpod to speed up worker starts and reduce costs for your Serverless endpoints."
 tag: "BETA"
@@ -11,34 +11,41 @@ The private model repository feature is currently in beta. Please [join our Disc
 
-This guide an overview of how the private model repository works, and instructions for managing your models with the [Runpod CLI](/runpodctl/overview).
+This guide provides an overview of how to use cached models with your Serverless endpoints, and instructions for managing private models with the [Runpod CLI](/runpodctl/overview).
 
-The Runpod private model repository allows you to upload your models directly to the Runpod ecosystem. By pre-caching models on our infrastructure, you can significantly reduce worker start times, lower costs, and improve the reliability of your Serverless endpoints.
+The Runpod model repository allows you to upload your models directly to the Runpod ecosystem. By pre-caching models on our infrastructure, you can significantly reduce worker start times, lower costs, and improve the reliability of your Serverless endpoints.
 
 ## Overview
 
-Using the private model repository provides several key advantages:
+Using cached models provides several key advantages:
 
-- **Faster cold start times:** Models stored in the repository are pre-cached on Runpod's infrastructure, eliminating the need for workers to download them from external sources like Hugging Face.
+- **Faster cold start times:** Public models or private models stored in the repository are pre-cached on Runpod's infrastructure, eliminating the need for workers to download them from external sources like Hugging Face.
 - **Reduced costs:** You aren't billed for worker time while your model is being downloaded. This is especially impactful for large models that can take several minutes to load.
-- **Improved reliability:** Storing models on Runpod reduces your dependency on external services, which can experience downtime or rate-limiting.
+- **Centralized model management:** Manage all your models directly within Runpod without needing to switch between platforms like Hugging Face or other model repositories.
+- **Accelerated deployment:** Deploy pre-cached models instantly without waiting for external downloads or transfers.
+- **Version control:** Store and manage different versions of the same model, allowing you to easily switch between versions for testing or production deployments.
 - **Smaller container images:** By decoupling models from your container image, you can create smaller, more focused images that contain only your serving logic.
 
-## How it works
+## Public vs. private models
+
+There are two types of cached models:
+
+**Public models** are popular models that Runpod has pre-cached for all users. These models appear automatically in your model selection dropdown and don't require any upload process. You can start using them immediately when creating or updating endpoints.
 
-You can upload and manage models using the Runpod CLI (`runpodctl`).
+**Private models** are models you upload to the repository using the Runpod CLI (`runpodctl`). Once uploaded, these models appear in the model selection dropdown alongside public models, giving you the same performance benefits while maintaining control over your proprietary or customized models.
+
+## How it works
 
-When you [select a model](#use-models-in-serverless-endpoints) during Serverless endpoint creation, Runpod will automatically try to start your workers on hosts that already contain your selected model.
+When you select a model during [Serverless endpoint creation](#use-models-in-serverless), Runpod automatically tries to start your workers on hosts that already contain your selected model.
 
-If no pre-cached host machines are available, the system will delay starting your workers until the model download completes, so you still won't be charged for the download time.
+If no pre-cached host machines are available, the system delays starting your workers until the model download completes, ensuring you still won't be charged for the download time.
 
-## Pricing
+The private model repository feature is available at **no additional cost** during the beta launch period.
 
-The private model repository feature is available at no additional cost during the beta launch period.
 
 ## Manage private models
 
-Make sure you've [installed runpodctl](/runpodctl/install-runpodctl) and [configured your API key](/runpodctl/overview#configure-your-api-key).
+Make sure you've [installed the CLI](/runpodctl/install-runpodctl) and configured it with your API key.
 
 ### Upload a model
 
@@ -124,7 +131,7 @@ runpodctl remove model \
 
 Before removing a model, ensure that none of your active endpoints are using it.
 
-## Use models in Serverless endpoints
+## Use models in Serverless
 
 When creating a new Serverless endpoint or updating an existing one, you can select models from your private model repository.
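Taken together, the reference pages added in patch 2 and the sections above imply the following lifecycle for a private model. This is a hedged end-to-end sketch: the command names match the new CLI reference pages, but the flags and argument forms are assumptions for illustration, and the model name is a placeholder:

```bash
# Upload a private model to the repository.
# (The --name flag is assumed; see runpodctl-create-model.mdx for actual flags.)
runpodctl create model \
  --name "my-finetuned-llama"

# List every model you've uploaded (see runpodctl-get-models.mdx).
runpodctl get models

# Inspect a single model's details, such as its regionSpecs
# (argument form assumed; see runpodctl-describe-model.mdx).
runpodctl describe model "my-finetuned-llama"

# Remove the model once no active endpoint references it
# (see runpodctl-remove-model.mdx).
runpodctl remove model \
  --name "my-finetuned-llama"
```

Once the upload completes, the model appears under **Organization Repository** in the **Model (optional)** dropdown covered in the steps above.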