diff --git a/docs.json b/docs.json
index 009fcc9b..5ee7a47a 100644
--- a/docs.json
+++ b/docs.json
@@ -75,7 +75,8 @@
         "pages": [
           "serverless/storage/overview",
           "serverless/storage/network-volumes",
-          "serverless/storage/s3-api"
+          "serverless/storage/s3-api",
+          "serverless/storage/model-repo"
         ]
       },
       {
diff --git a/runpodctl/reference/runpodctl-create-model.mdx b/runpodctl/reference/runpodctl-create-model.mdx
new file mode 100644
index 00000000..d4bb2b9c
--- /dev/null
+++ b/runpodctl/reference/runpodctl-create-model.mdx
@@ -0,0 +1,24 @@
+---
+title: "Create Model"
+sidebarTitle: "Create Model"
+---
+
+## runpodctl create model
+
+Upload a model to the [private model repository](/serverless/storage/model-repo).
+
+```sh
+runpodctl create model [flags]
+```
+
+### Options
+
+```text
+      --provider string   provider of the model
+      --name string       name of the model
+  -h, --help              help for create model
+```
+
+### See also
+
+* [runpodctl create](/runpodctl/reference/runpodctl-create) - create a resource
diff --git a/runpodctl/reference/runpodctl-create-pod.mdx b/runpodctl/reference/runpodctl-create-pod.mdx
index 0f33cdf1..42cf4eee 100644
--- a/runpodctl/reference/runpodctl-create-pod.mdx
+++ b/runpodctl/reference/runpodctl-create-pod.mdx
@@ -25,7 +25,7 @@ runpodctl create pod [flags]
       --env strings        container arguments
       --gpuCount int       number of GPUs for the pod (default 1)
       --gpuType string     gpu type id, e.g. 'NVIDIA GeForce RTX 3090'
--h, --help                 help for pod
+  -h, --help               help for pod
       --imageName string   container image name
       --mem int            minimum system memory needed (default 20)
       --name string        any pod name for easy reference
diff --git a/runpodctl/reference/runpodctl-describe-model.mdx b/runpodctl/reference/runpodctl-describe-model.mdx
new file mode 100644
index 00000000..a3ae0e2d
--- /dev/null
+++ b/runpodctl/reference/runpodctl-describe-model.mdx
@@ -0,0 +1,22 @@
+---
+title: "Describe Model"
+sidebarTitle: "Describe Model"
+---
+
+## runpodctl describe model
+
+Get detailed information about a specific model in your [private model repository](/serverless/storage/model-repo).
+
+```sh
+runpodctl describe model [flags]
+```
+
+### Options
+
+```text
+  -h, --help   help for describe model
+```
+
+### See also
+
+* [runpodctl describe](/runpodctl/reference/runpodctl-describe) - describe a resource
diff --git a/runpodctl/reference/runpodctl-get-models.mdx b/runpodctl/reference/runpodctl-get-models.mdx
new file mode 100644
index 00000000..4c560568
--- /dev/null
+++ b/runpodctl/reference/runpodctl-get-models.mdx
@@ -0,0 +1,22 @@
+---
+title: "Get Models"
+sidebarTitle: "Get Models"
+---
+
+## runpodctl get models
+
+Get a list of all models in your [private model repository](/serverless/storage/model-repo).
+
+```sh
+runpodctl get models [flags]
+```
+
+### Options
+
+```text
+  -h, --help   help for get models
+```
+
+### See also
+
+* [runpodctl get](/runpodctl/reference/runpodctl-get) - get a resource
diff --git a/runpodctl/reference/runpodctl-remove-model.mdx b/runpodctl/reference/runpodctl-remove-model.mdx
new file mode 100644
index 00000000..7faa89ee
--- /dev/null
+++ b/runpodctl/reference/runpodctl-remove-model.mdx
@@ -0,0 +1,22 @@
+---
+title: "Remove Model"
+sidebarTitle: "Remove Model"
+---
+
+## runpodctl remove model
+
+Remove a model from your [private model repository](/serverless/storage/model-repo).
+
+```sh
+runpodctl remove model [flags]
+```
+
+### Options
+
+```text
+  -h, --help   help for remove model
+```
+
+### See also
+
+* [runpodctl remove](/runpodctl/reference/runpodctl-remove) - remove a resource
diff --git a/serverless/endpoints/endpoint-configurations.mdx b/serverless/endpoints/endpoint-configurations.mdx
index d4293b66..59a4d55c 100644
--- a/serverless/endpoints/endpoint-configurations.mdx
+++ b/serverless/endpoints/endpoint-configurations.mdx
@@ -144,9 +144,11 @@ The effectiveness of FlashBoot increases exponentially with higher request volum
 
 ## Model
 
-You can select from a list of pre-cached ML models using the **Model (optional)** field. Selecting a model signals the system to place your workers on host machines that contain the selected model, resulting in faster cold starts and cost savings (since you won't be charged while the model is downloading).
+To optimize worker initialization times and reduce costs, you can select from a list of pre-cached ML models, or from your [private model repository](/serverless/storage/model-repo), using the **Model (optional)** field. Selecting a model signals the system to place your workers on host machines that already contain it, resulting in faster cold starts and cost savings (since you won't be charged while the model is downloading).
 
-If the requested model isn't cached on the assigned host, the system will defer starting your workers until after the model finishes downloading, ensuring that you aren't charged during the download process.
+If no pre-cached hosts are available, the system defers starting your workers until the model download completes, so you still aren't charged during the download period.
+
+To use a private model, first upload it to the repository using the [Runpod CLI](/runpodctl/reference/runpodctl-create-model), then select it from the **Model (optional)** dropdown when creating or editing your endpoint.
 
 ## Advanced settings
 
diff --git a/serverless/endpoints/manage-endpoints.mdx b/serverless/endpoints/manage-endpoints.mdx
index c8e68d9c..e798effb 100644
--- a/serverless/endpoints/manage-endpoints.mdx
+++ b/serverless/endpoints/manage-endpoints.mdx
@@ -22,7 +22,7 @@ To create a new Serverless endpoint through the Runpod web interface:
    * **Endpoint Name**: The display name for your endpoint in the console.
    * **Endpoint Type**: Select **Queue** for traditional queue-based processing or **Load balancer** for direct HTTP access (see [Load balancing endpoints](/serverless/load-balancing/overview) for details).
    * **GPU Configuration**: Select the appropriate GPU types and configure worker settings.
-   * **Model (optional)**: Select a model from Hugging Face to optimize worker startup times. When you specify a model, Runpod attempts to place your workers on host machines that already have the model cached locally, resulting in faster cold starts and cost savings (since you won't be charged while the model is downloading). Enter a Hugging Face model URL.
+   * **Model (optional)**: Select a model to optimize worker startup times. When you specify a model, Runpod attempts to place your workers on host machines that already have the model cached locally, resulting in faster cold starts and cost savings (since you won't be charged while the model is downloading). You can either select from the dropdown list of pre-cached models or enter a custom Hugging Face model URL.
    * **Container Configuration**: Edit the container start command, specify the [container disk size](/serverless/storage/overview), and expose HTTP/TCP ports.
    * **Environment Variables**: Add [environment variables](/serverless/development/environment-variables) for your worker containers.
 6. Click **Create Endpoint** to deploy.
diff --git a/serverless/storage/model-repo.mdx b/serverless/storage/model-repo.mdx
new file mode 100644
index 00000000..c67f212b
--- /dev/null
+++ b/serverless/storage/model-repo.mdx
@@ -0,0 +1,145 @@
+---
+title: "Model repository"
+sidebarTitle: "Model repository"
+description: "Upload models to Runpod to speed up worker starts and reduce costs for your Serverless endpoints."
+tag: "BETA"
+---
+
+The private model repository feature is currently in beta. Please [join our Discord](https://discord.gg/runpod) if you'd like to provide feedback.
+
+This guide provides an overview of how to use cached models with your Serverless endpoints, and instructions for managing private models with the [Runpod CLI](/runpodctl/overview).
+
+The Runpod model repository allows you to upload your models directly to the Runpod ecosystem. By pre-caching models on our infrastructure, you can significantly reduce worker start times, lower costs, and improve the reliability of your Serverless endpoints.
+
+## Overview
+
+Using cached models provides several key advantages:
+
+- **Faster cold start times:** Public models or private models stored in the repository are pre-cached on Runpod's infrastructure, eliminating the need for workers to download them from external sources like Hugging Face.
+- **Reduced costs:** You aren't billed for worker time while your model is being downloaded. This is especially impactful for large models that can take several minutes to load.
+- **Centralized model management:** Manage all your models directly within Runpod without needing to switch between platforms like Hugging Face or other model repositories.
+- **Accelerated deployment:** Deploy pre-cached models instantly without waiting for external downloads or transfers.
+- **Version control:** Store and manage different versions of the same model, allowing you to easily switch between versions for testing or production deployments.
+- **Smaller container images:** By decoupling models from your container image, you can create smaller, more focused images that contain only your serving logic.
+
+## Public vs. private models
+
+There are two types of cached models:
+
+**Public models** are popular models that Runpod has pre-cached for all users. These models appear automatically in your model selection dropdown and don't require any upload process. You can start using them immediately when creating or updating endpoints.
+
+**Private models** are models you upload to the repository using the Runpod CLI (`runpodctl`). Once uploaded, these models appear in the model selection dropdown alongside public models, giving you the same performance benefits while maintaining control over your proprietary or customized models.
+
+## How it works
+
+When you select a model during [Serverless endpoint creation](#use-models-in-serverless), Runpod automatically tries to start your workers on hosts that already contain your selected model.
+
+If no pre-cached host machines are available, the system delays starting your workers until the model download completes, ensuring you still won't be charged for the download time.
+
+The private model repository feature is available at **no additional cost** during the beta launch period.
+
+## Manage private models
+
+Make sure you've [installed the CLI](/runpodctl/install-runpodctl) and configured it with your API key.
+
+### Upload a model
+
+You can upload any model from the [Hugging Face Model Hub](https://huggingface.co/models) to the Runpod repository using its model identifier.
+
+To upload a model from Hugging Face, run the following command:
+
+```bash
+runpodctl create model \
+  --provider huggingface \
+  --name YOUR_MODEL_NAME
+```
+
+Replace `YOUR_MODEL_NAME` with the model identifier from Hugging Face.
+
+For example, to upload the `stable-diffusion-xl-refiner-1.0` model, run:
+
+```bash
+runpodctl create model \
+  --provider huggingface \
+  --name stabilityai/stable-diffusion-xl-refiner-1.0
+```
+
+### List your models
+
+To see a list of all models you've uploaded to the repository, run the following command:
+
+```bash
+runpodctl get models
+```
+
+This displays all the models in your repository, allowing you to confirm successful uploads and check for duplicates.
+
+You should see output similar to the following:
+
+```bash
+ID       NAME              SOURCE         STATUS        SIZE(GB)   VERSION(SHORT)
+mdl_123  custom-llama-v1   HUGGING_FACE   READY         24.7       9f1c2ab
+mdl_456  llama31-8b        HUGGING_FACE   DOWNLOADING   -          -
+```
+
+### Get model details
+
+To get detailed information about a specific model, run:
+
+```bash
+runpodctl get model YOUR_MODEL_ID
+```
+
+Replace `YOUR_MODEL_ID` with the ID of your uploaded model.
+
+For example, running `runpodctl get model 4oqrsweux0fkcp` returns output similar to the following:
+
+```shell
+provider: huggingface
+name: stabilityai/stable-diffusion-xl-refiner-1.0
+createdDate: 2023-08-03T22:31:36.289Z
+storagePath: /stabilityai-stable-diffusion-xl-refiner-1.0/
+id: 4oqrsweux0fkcp
+bucketId: pllmb-staging-cloud
+regionSpecs:
+- regionName: Staging
+  bucketName: pllmb-staging-cloud
+  multiplier: 8
+  maxQuantity: 30
+  maxIncrement: 5
+  amount: 22
+```
+
+### Remove a model
+
+When you no longer need a model uploaded to the private repository, you can remove it using `runpodctl`. This cleans up your repository list and frees up storage space.
+
+To remove a model, run the following command:
+
+```bash
+runpodctl remove model \
+  --provider huggingface \
+  --name lodestones/Chroma
+```
+
+Before removing a model, ensure that none of your active endpoints are using it.
+
+## Use models in Serverless
+
+When creating a new Serverless endpoint or updating an existing one, you can select models from your private model repository.
+
+To select a model from the repository, follow these steps:
+
+1. Navigate to the [Serverless section](https://www.console.runpod.io/serverless) of the Runpod console.
+2. Click **New Endpoint**, or edit an existing endpoint.
+3. In the **Endpoint Configuration** step, scroll down to **Model (optional)** and click the dropdown. Your uploaded models are listed under **Organization Repository** (if a model is missing, see the tip after these steps).
+4. Select your model from the list.
+5. Enter a Hugging Face token if you're using a gated model.
+6. Complete your endpoint configuration and click **Deploy Endpoint**.
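+
+If a model you've uploaded doesn't appear in the dropdown yet, it may still be downloading. As a quick check, you can list your repository models with the CLI (described in [List your models](#list-your-models) above) and confirm that the **STATUS** column shows `READY` rather than `DOWNLOADING`:
+
+```bash
+# List all models in your private repository; the STATUS column shows
+# whether each model is READY or still DOWNLOADING.
+runpodctl get models
+```
\ No newline at end of file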