From 68f6ce3d4174d8f54087520d39e68711f152c44e Mon Sep 17 00:00:00 2001 From: grantneumanoracle Date: Fri, 18 Jul 2025 16:33:51 -0700 Subject: [PATCH 01/13] add more blueprint categories and fix links --- INSTALLING_ONTO_EXISTING_CLUSTER_README.md | 6 ++-- README.md | 20 ++++++------ consolidated_bluperints_2.json | 1 + docs/about.md | 10 +++--- docs/api_documentation.md | 20 ++++++------ .../README.md | 2 +- .../working_with_large_models/README.md | 2 +- docs/sample_blueprints/README.md | 30 +++++++++--------- .../lora-benchmarking/README.md | 0 ...ns_lora_finetune_nvidia_sample_recipe.json | 0 .../gpu-health-check/README.md | 2 +- .../healthcheck_fp16_a10.json | 0 .../healthcheck_fp16_h100.json | 0 .../healthcheck_fp32_a10.json | 0 .../lora-fine-tuning/README.md | 0 ...int_bucket_model_open_dataset.backend.json | 0 .../bucket_model_open_dataset.backend.json | 0 .../bucket_par_open_dataset.backend.json | 0 .../closed_model_open_dataset_hf.backend.json | 0 .../open_model_open_dataset_hf.backend.json | 0 .../auto_scaling/README.md | 2 +- .../auto_scaling/autoscaling_blueprint.json | 0 .../cpu-inference/README.md | 0 .../cpu-inference/cpu-inference-gemma.json | 0 .../cpu-inference-mistral-bm.json | 0 .../cpu-inference-mistral-vm.json | 0 .../llm_inference_with_vllm/README.md | 0 .../vllm-closed-hf-model.json | 0 .../vllm-model-from-obj-storage.json | 0 ...m-open-hf-model-api-key-functionality.json | 0 .../vllm-open-hf-model.json | 0 .../mig_multi_instance_gpu/README.md | 2 +- .../mig_enabled_shared_node_pool.json | 0 .../mig_inference_multiple_replicas.json | 0 .../mig_inference_single_replica.json | 0 .../mig_inference_single_replica_10gb.json | 0 .../mig_multi_instance_gpu/mig_slices.png | Bin .../mig_update_node_with_node_name.json | 0 ...pdate_shared_pool_with_node_pool_name.json | 0 .../multi-node-inference/README.md | 10 +++--- .../multinode_inference_BM_A10.json | 0 .../multinode_inference_VM_A10.json | 0 .../exisiting_cluster_installation/README.md | 0 .../add_node_to_control_plane.json | 0 .../llama-stack/README.md | 2 +- .../llama-stack}/llama_stack_basic.json | 0 .../model_storage/README.md | 0 ...oad_closed_hf_model_to_object_storage.json | 0 ...nload_open_hf_model_to_object_storage.json | 0 .../using_rdma_enabled_node_pools/README.md | 2 +- .../rdma_distributed_inference.json | 0 .../rdma_shared_node_pool.json | 0 .../rdma_update_nodes.json | 0 .../other}/whisper_transcription/README.md | 0 .../docs/Whisper_Architecture.pdf | Bin .../examples/test1/test.wav | Bin .../test_all_transcripts_20250601_201349.txt | 0 .../transcription_log_20250601_201340.log | 0 .../transcription_log_20250601_203611.log | 0 .../examples/test2/video1591686795.mp4 | Bin ...86795_all_transcripts_20250601_203730.json | 0 ...686795_all_transcripts_20250601_203730.txt | 0 .../examples/test3/audio1788670787.m4a | Bin ...70787_all_transcripts_20250601_191710.json | 0 ...670787_all_transcripts_20250601_191710.txt | 0 .../transcription_log_20250601_191325.log | 0 .../whisper-transcription-A10.json | 0 .../whisper-transcription-A100.json | 0 .../whisper-transcription-H100.json | 0 .../deployment_groups/README.md | 0 .../deployment_groups}/llama_stack_basic.json | 0 .../shared_node_pools/README.md | 2 +- .../shared_node_pool_A10_BM.json | 0 .../shared_node_pool_A10_VM.json | 0 .../shared_node_pool_B200_BM.json | 0 ...nference_sample_shared_pool_blueprint.json | 0 .../README.md | 0 .../autoscale_with_fss.json | 0 .../teams/README.md | 0 .../teams/create_job_with_team.json | 0 .../teams/create_team.json | 0 81 files 
changed, 57 insertions(+), 56 deletions(-) create mode 100644 consolidated_bluperints_2.json rename docs/sample_blueprints/{workload_blueprints => gpu_benchmarking}/lora-benchmarking/README.md (100%) rename docs/sample_blueprints/{workload_blueprints => gpu_benchmarking}/lora-benchmarking/mlcommons_lora_finetune_nvidia_sample_recipe.json (100%) rename docs/sample_blueprints/{workload_blueprints => gpu_health_check}/gpu-health-check/README.md (99%) rename docs/sample_blueprints/{workload_blueprints => gpu_health_check}/gpu-health-check/healthcheck_fp16_a10.json (100%) rename docs/sample_blueprints/{workload_blueprints => gpu_health_check}/gpu-health-check/healthcheck_fp16_h100.json (100%) rename docs/sample_blueprints/{workload_blueprints => gpu_health_check}/gpu-health-check/healthcheck_fp32_a10.json (100%) rename docs/sample_blueprints/{workload_blueprints => model_fine_tuning}/lora-fine-tuning/README.md (100%) rename docs/sample_blueprints/{workload_blueprints => model_fine_tuning}/lora-fine-tuning/bucket_checkpoint_bucket_model_open_dataset.backend.json (100%) rename docs/sample_blueprints/{workload_blueprints => model_fine_tuning}/lora-fine-tuning/bucket_model_open_dataset.backend.json (100%) rename docs/sample_blueprints/{workload_blueprints => model_fine_tuning}/lora-fine-tuning/bucket_par_open_dataset.backend.json (100%) rename docs/sample_blueprints/{workload_blueprints => model_fine_tuning}/lora-fine-tuning/closed_model_open_dataset_hf.backend.json (100%) rename docs/sample_blueprints/{workload_blueprints => model_fine_tuning}/lora-fine-tuning/open_model_open_dataset_hf.backend.json (100%) rename docs/sample_blueprints/{platform_feature_blueprints => model_serving}/auto_scaling/README.md (99%) rename docs/sample_blueprints/{platform_feature_blueprints => model_serving}/auto_scaling/autoscaling_blueprint.json (100%) rename docs/sample_blueprints/{workload_blueprints => model_serving}/cpu-inference/README.md (100%) rename docs/sample_blueprints/{workload_blueprints => model_serving}/cpu-inference/cpu-inference-gemma.json (100%) rename docs/sample_blueprints/{workload_blueprints => model_serving}/cpu-inference/cpu-inference-mistral-bm.json (100%) rename docs/sample_blueprints/{workload_blueprints => model_serving}/cpu-inference/cpu-inference-mistral-vm.json (100%) rename docs/sample_blueprints/{workload_blueprints => model_serving}/llm_inference_with_vllm/README.md (100%) rename docs/sample_blueprints/{workload_blueprints => model_serving}/llm_inference_with_vllm/vllm-closed-hf-model.json (100%) rename docs/sample_blueprints/{workload_blueprints => model_serving}/llm_inference_with_vllm/vllm-model-from-obj-storage.json (100%) rename docs/sample_blueprints/{workload_blueprints => model_serving}/llm_inference_with_vllm/vllm-open-hf-model-api-key-functionality.json (100%) rename docs/sample_blueprints/{workload_blueprints => model_serving}/llm_inference_with_vllm/vllm-open-hf-model.json (100%) rename docs/sample_blueprints/{platform_feature_blueprints => model_serving}/mig_multi_instance_gpu/README.md (99%) rename docs/sample_blueprints/{platform_feature_blueprints => model_serving}/mig_multi_instance_gpu/mig_enabled_shared_node_pool.json (100%) rename docs/sample_blueprints/{platform_feature_blueprints => model_serving}/mig_multi_instance_gpu/mig_inference_multiple_replicas.json (100%) rename docs/sample_blueprints/{platform_feature_blueprints => model_serving}/mig_multi_instance_gpu/mig_inference_single_replica.json (100%) rename docs/sample_blueprints/{platform_feature_blueprints => 
model_serving}/mig_multi_instance_gpu/mig_inference_single_replica_10gb.json (100%) rename docs/sample_blueprints/{platform_feature_blueprints => model_serving}/mig_multi_instance_gpu/mig_slices.png (100%) rename docs/sample_blueprints/{platform_feature_blueprints => model_serving}/mig_multi_instance_gpu/mig_update_node_with_node_name.json (100%) rename docs/sample_blueprints/{platform_feature_blueprints => model_serving}/mig_multi_instance_gpu/mig_update_shared_pool_with_node_pool_name.json (100%) rename docs/sample_blueprints/{workload_blueprints => model_serving}/multi-node-inference/README.md (95%) rename docs/sample_blueprints/{workload_blueprints => model_serving}/multi-node-inference/multinode_inference_BM_A10.json (100%) rename docs/sample_blueprints/{workload_blueprints => model_serving}/multi-node-inference/multinode_inference_VM_A10.json (100%) rename docs/sample_blueprints/{platform_feature_blueprints => other}/exisiting_cluster_installation/README.md (100%) rename docs/sample_blueprints/{platform_feature_blueprints => other}/exisiting_cluster_installation/add_node_to_control_plane.json (100%) rename docs/sample_blueprints/{workload_blueprints => other}/llama-stack/README.md (98%) rename docs/sample_blueprints/{platform_feature_blueprints/deployment_groups => other/llama-stack}/llama_stack_basic.json (100%) rename docs/sample_blueprints/{platform_feature_blueprints => other}/model_storage/README.md (100%) rename docs/sample_blueprints/{platform_feature_blueprints => other}/model_storage/download_closed_hf_model_to_object_storage.json (100%) rename docs/sample_blueprints/{platform_feature_blueprints => other}/model_storage/download_open_hf_model_to_object_storage.json (100%) rename docs/sample_blueprints/{platform_feature_blueprints => other}/using_rdma_enabled_node_pools/README.md (99%) rename docs/sample_blueprints/{platform_feature_blueprints => other}/using_rdma_enabled_node_pools/rdma_distributed_inference.json (100%) rename docs/sample_blueprints/{platform_feature_blueprints => other}/using_rdma_enabled_node_pools/rdma_shared_node_pool.json (100%) rename docs/sample_blueprints/{platform_feature_blueprints => other}/using_rdma_enabled_node_pools/rdma_update_nodes.json (100%) rename docs/{ => sample_blueprints/other}/whisper_transcription/README.md (100%) rename docs/{ => sample_blueprints/other}/whisper_transcription/docs/Whisper_Architecture.pdf (100%) rename docs/{ => sample_blueprints/other}/whisper_transcription/examples/test1/test.wav (100%) rename docs/{ => sample_blueprints/other}/whisper_transcription/examples/test1/test_all_transcripts_20250601_201349.txt (100%) rename docs/{ => sample_blueprints/other}/whisper_transcription/examples/test1/transcription_log_20250601_201340.log (100%) rename docs/{ => sample_blueprints/other}/whisper_transcription/examples/test2/transcription_log_20250601_203611.log (100%) rename docs/{ => sample_blueprints/other}/whisper_transcription/examples/test2/video1591686795.mp4 (100%) rename docs/{ => sample_blueprints/other}/whisper_transcription/examples/test2/video1591686795_all_transcripts_20250601_203730.json (100%) rename docs/{ => sample_blueprints/other}/whisper_transcription/examples/test2/video1591686795_all_transcripts_20250601_203730.txt (100%) rename docs/{ => sample_blueprints/other}/whisper_transcription/examples/test3/audio1788670787.m4a (100%) rename docs/{ => sample_blueprints/other}/whisper_transcription/examples/test3/audio1788670787_all_transcripts_20250601_191710.json (100%) rename docs/{ => 
sample_blueprints/other}/whisper_transcription/examples/test3/audio1788670787_all_transcripts_20250601_191710.txt (100%) rename docs/{ => sample_blueprints/other}/whisper_transcription/examples/test3/transcription_log_20250601_191325.log (100%) rename docs/{ => sample_blueprints/other}/whisper_transcription/whisper-transcription-A10.json (100%) rename docs/{ => sample_blueprints/other}/whisper_transcription/whisper-transcription-A100.json (100%) rename docs/{ => sample_blueprints/other}/whisper_transcription/whisper-transcription-H100.json (100%) rename docs/sample_blueprints/{platform_feature_blueprints => platform_features}/deployment_groups/README.md (100%) rename docs/sample_blueprints/{workload_blueprints/llama-stack => platform_features/deployment_groups}/llama_stack_basic.json (100%) rename docs/sample_blueprints/{platform_feature_blueprints => platform_features}/shared_node_pools/README.md (97%) rename docs/sample_blueprints/{platform_feature_blueprints => platform_features}/shared_node_pools/shared_node_pool_A10_BM.json (100%) rename docs/sample_blueprints/{platform_feature_blueprints => platform_features}/shared_node_pools/shared_node_pool_A10_VM.json (100%) rename docs/sample_blueprints/{platform_feature_blueprints => platform_features}/shared_node_pools/shared_node_pool_B200_BM.json (100%) rename docs/sample_blueprints/{platform_feature_blueprints => platform_features}/shared_node_pools/vllm_inference_sample_shared_pool_blueprint.json (100%) rename docs/sample_blueprints/{platform_feature_blueprints => platform_features}/startup_liveness_readiness_probes/README.md (100%) rename docs/sample_blueprints/{platform_feature_blueprints => platform_features}/startup_liveness_readiness_probes/autoscale_with_fss.json (100%) rename docs/sample_blueprints/{platform_feature_blueprints => platform_features}/teams/README.md (100%) rename docs/sample_blueprints/{platform_feature_blueprints => platform_features}/teams/create_job_with_team.json (100%) rename docs/sample_blueprints/{platform_feature_blueprints => platform_features}/teams/create_team.json (100%) diff --git a/INSTALLING_ONTO_EXISTING_CLUSTER_README.md b/INSTALLING_ONTO_EXISTING_CLUSTER_README.md index e89158e..3c26107 100644 --- a/INSTALLING_ONTO_EXISTING_CLUSTER_README.md +++ b/INSTALLING_ONTO_EXISTING_CLUSTER_README.md @@ -83,7 +83,7 @@ If you have existing node pools in your original OKE cluster that you'd like Blu - If you get a warning about security, sometimes it takes a bit for the certificates to get signed. This will go away once that process completes on the OKE side. 3. Login with the `Admin Username` and `Admin Password` in the Application information tab. 4. Click the link next to "deployment" which will take you to a page with "Deployment List", and a content box. -5. Paste in the sample blueprint json found [here](docs/sample_blueprints/platform_feature_blueprints/exisiting_cluster_installation/add_node_to_control_plane.json). +5. Paste in the sample blueprint json found [here](docs/sample_blueprints/other/exisiting_cluster_installation/add_node_to_control_plane.json). 6. Modify the "recipe_node_name" field to the private IP address you found in step 1 above. 7. Click "POST". This is a fast operation. 8. Wait about 20 seconds and refresh the page. It should look like: @@ -108,10 +108,10 @@ If you have existing node pools in your original OKE cluster that you'd like Blu - If you get a warning about security, sometimes it takes a bit for the certificates to get signed. 
This will go away once that process completes on the OKE side. 3. Login with the `Admin Username` and `Admin Password` in the Application information tab. 4. Click the link next to "deployment" which will take you to a page with "Deployment List", and a content box. -5. If you added a node from [Step 4](./INSTALLING_ONTO_EXISTING_CLUSTER_README.md#step-4-add-existing-nodes-to-cluster-optional), use the following shared node pool [blueprint](docs/sample_blueprints/platform_feature_blueprints/shared_node_pools/vllm_inference_sample_shared_pool_blueprint.json). +5. If you added a node from [Step 4](./INSTALLING_ONTO_EXISTING_CLUSTER_README.md#step-4-add-existing-nodes-to-cluster-optional), use the following shared node pool [blueprint](docs/sample_blueprints/platform_features/shared_node_pools/vllm_inference_sample_shared_pool_blueprint.json). - Depending on the node shape, you will need to change: `"recipe_node_shape": "BM.GPU.A10.4"` to match your shape. -6. If you did not add a node, or just want to deploy a fresh node, use the following [blueprint](docs/sample_blueprints/workload_blueprints/llm_inference_with_vllm/vllm-open-hf-model.json). +6. If you did not add a node, or just want to deploy a fresh node, use the following [blueprint](docs/sample_blueprints/model_serving/llm_inference_with_vllm/vllm-open-hf-model.json). 7. Paste the blueprint you selected into context box on the deployment page and click "POST" 8. To monitor the deployment, go back to "Api Root" and click "deployment_logs". - If you are deploying without a shared node pool, it can take 10-30 minutes to bring up a node, depending on shape and whether it is bare-metal or virtual. diff --git a/README.md b/README.md index 10a5bd9..f8409c7 100644 --- a/README.md +++ b/README.md @@ -52,16 +52,16 @@ After you install OCI AI Blueprints to an OKE cluster in your tenancy, you can d | Blueprint | Description | | --------------------------------------------------------------------------------------------- | ----------------------------------------------------------------------------------------------------------------------------------------------- | -| [**LLM & VLM Inference with vLLM**](docs/sample_blueprints/workload_blueprints/llm_inference_with_vllm/README.md) | Deploy Llama 2/3/3.1 7B/8B models using NVIDIA GPU shapes and the vLLM inference engine with auto-scaling. | -| [**Llama Stack**](docs/sample_blueprints/workload_blueprints/llama-stack/README.md) | Complete GenAI runtime with vLLM, ChromaDB, Postgres, and Jaeger for production deployments with unified API for inference, RAG, and telemetry. | -| [**Fine-Tuning Benchmarking**](docs/sample_blueprints/workload_blueprints/lora-benchmarking/README.md) | Run MLCommons quantized Llama-2 70B LoRA finetuning on A100 for performance benchmarking. | -| [**LoRA Fine-Tuning**](docs/sample_blueprints/workload_blueprints/lora-fine-tuning/README.md) | LoRA fine-tuning of custom or HuggingFace models using any dataset. Includes flexible hyperparameter tuning. | -| [**GPU Performance Benchmarking**](docs/sample_blueprints/workload_blueprints/gpu-health-check/README.md) | Comprehensive evaluation of GPU performance to ensure optimal hardware readiness before initiating any intensive computational workload. | -| [**CPU Inference**](docs/sample_blueprints/workload_blueprints/cpu-inference/README.md) | Leverage Ollama to test CPU-based inference with models like Mistral, Gemma, and more. 
| -| [**Multi-node Inference with RDMA and vLLM**](docs/sample_blueprints/workload_blueprints/multi-node-inference/README.md) | Deploy Llama-405B sized LLMs across multiple nodes with RDMA using H100 nodes with vLLM and LeaderWorkerSet. | -| [**Autoscaling Inference with vLLM**](docs/sample_blueprints/platform_feature_blueprints/auto_scaling/README.md) | Serve LLMs with auto-scaling using KEDA, which scales to multiple GPUs and nodes using application metrics like inference latency. | -| [**LLM Inference with MIG**](docs/sample_blueprints/platform_feature_blueprints/mig_multi_instance_gpu/README.md) | Deploy LLMs to a fraction of a GPU with Nvidia’s multi-instance GPUs and serve them with vLLM. | -| [**Job Queuing**](docs/sample_blueprints/platform_feature_blueprints/teams/README.md) | Take advantage of job queuing and enforce resource quotas and fair sharing between teams. | +| [**LLM & VLM Inference with vLLM**](docs/sample_blueprints/model_serving/llm_inference_with_vllm/README.md) | Deploy Llama 2/3/3.1 7B/8B models using NVIDIA GPU shapes and the vLLM inference engine with auto-scaling. | +| [**Llama Stack**](docs/sample_blueprints/other/llama-stack/README.md) | Complete GenAI runtime with vLLM, ChromaDB, Postgres, and Jaeger for production deployments with unified API for inference, RAG, and telemetry. | +| [**Fine-Tuning Benchmarking**](docs/sample_blueprints/gpu_benchmarking/lora-benchmarking/README.md) | Run MLCommons quantized Llama-2 70B LoRA finetuning on A100 for performance benchmarking. | +| [**LoRA Fine-Tuning**](docs/sample_blueprints/model_fine_tuning/lora-fine-tuning/README.md) | LoRA fine-tuning of custom or HuggingFace models using any dataset. Includes flexible hyperparameter tuning. | +| [**GPU Performance Benchmarking**](docs/sample_blueprints/gpu_health_check/gpu-health-check/README.md) | Comprehensive evaluation of GPU performance to ensure optimal hardware readiness before initiating any intensive computational workload. | +| [**CPU Inference**](docs/sample_blueprints/model_serving/cpu-inference/README.md) | Leverage Ollama to test CPU-based inference with models like Mistral, Gemma, and more. | +| [**Multi-node Inference with RDMA and vLLM**](docs/sample_blueprints/model_serving/multi-node-inference/README.md) | Deploy Llama-405B sized LLMs across multiple nodes with RDMA using H100 nodes with vLLM and LeaderWorkerSet. | +| [**Autoscaling Inference with vLLM**](docs/sample_blueprints/model_serving/auto_scaling/README.md) | Serve LLMs with auto-scaling using KEDA, which scales to multiple GPUs and nodes using application metrics like inference latency. | +| [**LLM Inference with MIG**](docs/sample_blueprints/model_serving/mig_multi_instance_gpu/README.md) | Deploy LLMs to a fraction of a GPU with Nvidia’s multi-instance GPUs and serve them with vLLM. | +| [**Job Queuing**](docs/sample_blueprints/platform_features/teams/README.md) | Take advantage of job queuing and enforce resource quotas and fair sharing between teams. 
| ## Support & Contact diff --git a/consolidated_bluperints_2.json b/consolidated_bluperints_2.json new file mode 100644 index 0000000..251b764 --- /dev/null +++ b/consolidated_bluperints_2.json @@ -0,0 +1 @@ +[{"blueprint_type": "oci_blueprint", "blueprint_category": "workload_blueprint", "blueprint_title": "Fine-Tuning Benchmarking", "blueprint_short_description": "Fine-tune quantized Llama-2-70B model using MLCommons methodology for infrastructure benchmarking", "blueprint_long_description": "The fine-tuning benchmarking blueprint streamlines infrastructure benchmarking for fine-tuning using the MLCommons methodology. It fine-tunes a quantized Llama-2-70B model and a standard dataset.\n\nOnce complete, benchmarking results, such as training time and resource utilization, are available in MLFlow and Grafana for easy tracking. This blueprint enables data-driven infrastructure decisions for your fine-tuning jobs.", "pre_filled_samples": [{"pre_filled_sample_name": "LoRA fine-tuning of quantized Llama-2-70B model on A100 node using MLCommons methodology", "recipe_id": "mlcommons_lora_finetune_nvidia", "deployment_name": "MLCommons Finetune LORA/PEFT", "recipe_mode": "job", "recipe_node_shape": "BM.GPU.A100.8", "recipe_use_shared_node_pool": false, "recipe_nvidia_gpu_count": 8, "recipe_ephemeral_storage_size": 50, "recipe_replica_count": 1, "recipe_node_pool_size": 1, "recipe_node_boot_volume_size_in_gbs": 200, "recipe_shared_memory_volume_size_limit_in_mb": 100, "input_object_storage": [{"bucket_name": "corrino_mlcommons_llama2_70b_qkv", "mount_location": "/models", "volume_size_in_gbs": 500}, {"bucket_name": "corrino_ml_commons_scrolls_dataset", "mount_location": "/dataset", "volume_size_in_gbs": 100}], "output_object_storage": [{"bucket_name": "corrino_ml_commons_output", "mount_location": "/mlcommons_output", "volume_size_in_gbs": 200}], "recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:corrino-recipe-mlcommons", "recipe_container_env": [{"key": "model_name", "value": "regisss/llama2-70b-fused-qkv-mlperf"}, {"key": "Model_Path", "value": "/models"}, {"key": "Dataset_Path", "value": "/dataset"}, {"key": "Lora_R", "value": "16"}, {"key": "Lora_Alpha", "value": "32"}, {"key": "Lora_Dropout", "value": "0.1"}, {"key": "Max_Seq_Len", "value": "8192"}, {"key": "bf16", "value": "true"}, {"key": "Logging_Steps", "value": "24"}, {"key": "Eval_Steps", "value": "48"}, {"key": "Per_Device_Train_Batch_Size", "value": "1"}, {"key": "Gradient_Accumulation_Steps", "value": "1"}, {"key": "Lr_Scheduler_Type", "value": "cosine"}, {"key": "Learning_Rate", "value": "0.0004"}, {"key": "Weight_Decay", "value": "0.0001"}, {"key": "Warmup_Ratio", "value": "0"}, {"key": "Max_Grad_Norm", "value": "0.3"}, {"key": "Use_Gradient_Checkpointing", "value": "true"}, {"key": "Target_Eval_Loss", "value": "0.925"}, {"key": "Use_Peft_Lora", "value": "true"}, {"key": "Max_Steps", "value": "1024"}, {"key": "Use_Flash_Attn", "value": "true"}, {"key": "Seed", "value": "1234"}, {"key": "Lora_Target_Modules", "value": "qkv_proj,o_proj"}, {"key": "Mlflow_Exp_Name", "value": "oci_ai_blueprints_nvidia_recipe"}, {"key": "Output_Dir", "value": "/mlcommons_output"}]}]}, {"blueprint_type": "oci_blueprint", "blueprint_category": "workload_blueprint", "blueprint_title": "GPU Health Check", "blueprint_short_description": "Comprehensive GPU health validation and diagnostics for production readiness", "blueprint_long_description": "This repository offers a robust, pre-check recipe for thorough GPU health validation
prior to deploying production or research workloads. Designed to operate seamlessly across both single-node and multi-node environments, this diagnostic toolset enables you to verify that your GPU infrastructure is primed for high-demand experiments. By systematically assessing key performance metrics—such as thermal behavior, power stability, and overall hardware reliability—you can proactively detect and address issues like thermal throttling, power irregularities, and GPU instability. This early-warning system minimizes the risk of unexpected downtime and performance degradation, ensuring that your system consistently operates at peak efficiency and reliability during critical computational tasks.", "pre_filled_samples": [{"pre_filled_sample_name": "2 A10 GPUs with dtype 16", "recipe_id": "healthcheck", "recipe_mode": "job", "deployment_name": "healthcheck_fp16_a10", "recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:healthcheck_v0.3", "recipe_node_shape": "VM.GPU.A10.2", "output_object_storage": [{"bucket_name": "healthcheck2", "mount_location": "/healthcheck_results", "volume_size_in_gbs": 20}], "recipe_container_command_args": ["--dtype", "float16", "--output_dir", "/healthcheck_results", "--expected_gpus", "A10:2,A100:0,H100:0"], "recipe_replica_count": 1, "recipe_nvidia_gpu_count": 2, "recipe_node_pool_size": 1, "recipe_node_boot_volume_size_in_gbs": 200, "recipe_ephemeral_storage_size": 100, "recipe_shared_memory_volume_size_limit_in_mb": 1000}, {"pre_filled_sample_name": "2 A10 GPUs with dtype 32", "recipe_id": "healthcheck", "recipe_mode": "job", "deployment_name": "healthcheck_fp32_a10", "recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:healthcheck_v0.3", "recipe_node_shape": "VM.GPU.A10.2", "output_object_storage": [{"bucket_name": "healthcheck2", "mount_location": "/healthcheck_results", "volume_size_in_gbs": 20}], "recipe_container_command_args": ["--dtype", "float32", "--output_dir", "/healthcheck_results", "--expected_gpus", "A10:2,A100:0,H100:0"], "recipe_replica_count": 1, "recipe_nvidia_gpu_count": 2, "recipe_node_pool_size": 1, "recipe_node_boot_volume_size_in_gbs": 200, "recipe_ephemeral_storage_size": 100, "recipe_shared_memory_volume_size_limit_in_mb": 1000}, {"pre_filled_sample_name": "8 H100 GPUs with dtype 16", "recipe_id": "healthcheck", "recipe_mode": "job", "deployment_name": "healthcheck_fp16_h100", "recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:healthcheck_v0.3", "recipe_node_shape": "BM.GPU.H100.8", "output_object_storage": [{"bucket_name": "healthcheck2", "mount_location": "/healthcheck_results", "volume_size_in_gbs": 20}], "recipe_container_command_args": ["--dtype", "float16", "--output_dir", "/healthcheck_results", "--expected_gpus", "A10:0,A100:0,H100:8"], "recipe_replica_count": 1, "recipe_nvidia_gpu_count": 8, "recipe_node_pool_size": 1, "recipe_node_boot_volume_size_in_gbs": 200, "recipe_ephemeral_storage_size": 100, "recipe_shared_memory_volume_size_limit_in_mb": 1000}]}, {"blueprint_type": "oci_blueprint", "blueprint_category": "workload_blueprint", "blueprint_title": "LoRA Fine-Tuning", "blueprint_short_description": "Efficiently fine-tune large language models using Low-Rank Adaptation", "blueprint_long_description": "This blueprint enables efficient model tuning using Low-Rank Adaptation (LoRA), a highly efficient method of LLM tuning. You can fine-tune a custom LLM or most open-source LLMs from Hugging Face. 
You can also use a custom dataset or any publicly available dataset from Hugging Face. Once the job is complete, results such as training metrics are logged in MLFlow for analysis. The fine-tuned model is then stored in an object storage bucket, ready for deployment.", "pre_filled_samples": [{"pre_filled_sample_name": "Fine-Tune NousResearch/Meta-Llama-3.1-8B from Object Storage with Dataset from Hugging Face and Checkpoints saved in Object Storage (A10 VM)", "recipe_id": "lora_finetune_nvidia", "deployment_name": "dk_with_checkpoint", "recipe_mode": "job", "recipe_node_shape": "VM.GPU.A10.2", "recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:finetune_lora_dev", "recipe_nvidia_gpu_count": 2, "recipe_ephemeral_storage_size": 300, "recipe_replica_count": 1, "recipe_node_pool_size": 1, "recipe_node_boot_volume_size_in_gbs": 500, "recipe_shared_memory_volume_size_limit_in_mb": 100, "recipe_container_env": [{"key": "Mlflow_Endpoint", "value": "http://mlflow.cluster-tools.svc.cluster.local:5000"}, {"key": "Mlflow_Exp_Name", "value": "oci_ai_blueprints_nvidia_recipe"}, {"key": "Mlflow_Run_Name", "value": "Meta-Llama-3.1-8B-local-quotes-checkpoint"}, {"key": "Hf_Token", "value": "None"}, {"key": "Download_Dataset_From_Hf", "value": "true"}, {"key": "Dataset_Name", "value": "Abirate/english_quotes"}, {"key": "Dataset_Sub_Name", "value": "None"}, {"key": "Dataset_Column_To_Use", "value": "quote"}, {"key": "Dataset_Path", "value": "/workspace/datasets"}, {"key": "Download_Model_From_Hf", "value": "false"}, {"key": "Model_Name", "value": "NousResearch/Meta-Llama-3.1-8B"}, {"key": "Model_Path", "value": "/models/NousResearch/Meta-Llama-3.1-8B"}, {"key": "Max_Model_Length", "value": "4096"}, {"key": "Resume_From_Checkpoint", "value": "true"}, {"key": "Checkpoint_Path", "value": "/checkpoint/Bucket-Llama-3.1-8B-english_quotes/checkpoint-1400"}, {"key": "Lora_R", "value": "8"}, {"key": "Lora_Alpha", "value": "16"}, {"key": "Lora_Dropout", "value": "0.1"}, {"key": "Lora_Target_Modules", "value": "q_proj,up_proj,o_proj,k_proj,down_proj,gate_proj,v_proj"}, {"key": "Bias", "value": "none"}, {"key": "Task_Type", "value": "CAUSAL_LM"}, {"key": "Per_Device_Train_Batch_Size", "value": "1"}, {"key": "Gradient_Accumulation_Steps", "value": "1"}, {"key": "Warmup_Steps", "value": "2"}, {"key": "Save_Steps", "value": "100"}, {"key": "Learning_Rate", "value": "0.0002"}, {"key": "Fp16", "value": "true"}, {"key": "Logging_Steps", "value": "1"}, {"key": "Output_Dir", "value": "/tunedmodels/Bucket-Llama-3.1-8B-english_quotes"}, {"key": "Optim", "value": "paged_adamw_8bit"}, {"key": "Number_of_Training_Epochs", "value": "2"}, {"key": "Require_Persistent_Output_Dir", "value": "true"}], "input_object_storage": [{"bucket_name": "corrino_hf_oss_models", "mount_location": "/models", "volume_size_in_gbs": 500}, {"bucket_name": "corrino_tuned_hf_oss_models", "mount_location": "/checkpoint", "volume_size_in_gbs": 500}], "output_object_storage": [{"bucket_name": "corrino_tuned_hf_oss_models", "mount_location": "/tunedmodels", "volume_size_in_gbs": 500}]}, {"pre_filled_sample_name": "Fine-Tune NousResearch/Meta-Llama-3.1-8B from Object Storage with Dataset from Hugging Face on A10 VM", "recipe_id": "lora_finetune_nvidia", "deployment_name": "dk_bucket_model_open_dataset", "recipe_mode": "job", "recipe_node_shape": "VM.GPU.A10.2", "recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:finetune_lora_dev", "recipe_nvidia_gpu_count": 2, "recipe_ephemeral_storage_size": 300, "recipe_replica_count":
1, "recipe_node_pool_size": 1, "recipe_node_boot_volume_size_in_gbs": 500, "recipe_shared_memory_volume_size_limit_in_mb": 100, "recipe_container_env": [{"key": "Mlflow_Endpoint", "value": "http://mlflow.cluster-tools.svc.cluster.local:5000"}, {"key": "Mlflow_Exp_Name", "value": "oci_ai_blueprints_nvidia_recipe"}, {"key": "Mlflow_Run_Name", "value": "Meta-Llama-3.1-8B-local-quotes"}, {"key": "Hf_Token", "value": "None"}, {"key": "Download_Dataset_From_Hf", "value": "true"}, {"key": "Dataset_Name", "value": "Abirate/english_quotes"}, {"key": "Dataset_Sub_Name", "value": "None"}, {"key": "Dataset_Column_To_Use", "value": "None"}, {"key": "Dataset_Path", "value": "/workspace/datasets"}, {"key": "Download_Model_From_Hf", "value": "false"}, {"key": "Model_Name", "value": "NousResearch/Meta-Llama-3.1-8B"}, {"key": "Model_Path", "value": "/models/NousResearch/Meta-Llama-3.1-8B"}, {"key": "Max_Model_Length", "value": "8192"}, {"key": "Resume_From_Checkpoint", "value": "false"}, {"key": "Checkpoint_Path", "value": "/checkpoint"}, {"key": "Lora_R", "value": "8"}, {"key": "Lora_Alpha", "value": "32"}, {"key": "Lora_Dropout", "value": "0.1"}, {"key": "Lora_Target_Modules", "value": "q_proj,up_proj,o_proj,k_proj,down_proj,gate_proj,v_proj"}, {"key": "Bias", "value": "none"}, {"key": "Task_Type", "value": "CAUSAL_LM"}, {"key": "Per_Device_Train_Batch_Size", "value": "1"}, {"key": "Gradient_Accumulation_Steps", "value": "1"}, {"key": "Warmup_Steps", "value": "2"}, {"key": "Save_Steps", "value": "100"}, {"key": "Learning_Rate", "value": "0.0002"}, {"key": "Fp16", "value": "true"}, {"key": "Logging_Steps", "value": "1"}, {"key": "Output_Dir", "value": "/tunedmodels/Bucket-Llama-3.1-8B-english_quotes"}, {"key": "Optim", "value": "paged_adamw_8bit"}, {"key": "Number_of_Training_Epochs", "value": "2"}, {"key": "Require_Persistent_Output_Dir", "value": "true"}], "input_object_storage": [{"bucket_name": "corrino_hf_oss_models", "mount_location": "/models", "volume_size_in_gbs": 500}], "output_object_storage": [{"bucket_name": "corrino_tuned_hf_oss_models", "mount_location": "/tunedmodels", "volume_size_in_gbs": 500}]}, {"pre_filled_sample_name": "Fine-Tune NousResearch/Meta-Llama-3.1-8B from Object Storage (PAR link) with Dataset from Hugging Face on A10 VM", "recipe_id": "lora_finetune_nvidia", "deployment_name": "dk_bucket_model_open_dataset", "recipe_mode": "job", "recipe_node_shape": "VM.GPU.A10.2", "recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:finetune_lora_dev", "recipe_nvidia_gpu_count": 2, "recipe_ephemeral_storage_size": 300, "recipe_replica_count": 1, "recipe_node_pool_size": 1, "recipe_node_boot_volume_size_in_gbs": 500, "recipe_shared_memory_volume_size_limit_in_mb": 100, "recipe_container_env": [{"key": "Mlflow_Endpoint", "value": "http://mlflow.cluster-tools.svc.cluster.local:5000"}, {"key": "Mlflow_Exp_Name", "value": "oci_ai_blueprints_nvidia_recipe"}, {"key": "Mlflow_Run_Name", "value": "Meta-Llama-3.1-8B-local-quotes"}, {"key": "Hf_Token", "value": "None"}, {"key": "Download_Dataset_From_Hf", "value": "true"}, {"key": "Dataset_Name", "value": "Abirate/english_quotes"}, {"key": "Dataset_Sub_Name", "value": "None"}, {"key": "Dataset_Column_To_Use", "value": "None"}, {"key": "Dataset_Path", "value": "/workspace/datasets"}, {"key": "Download_Model_From_Hf", "value": "false"}, {"key": "Model_Name", "value": "NousResearch/Meta-Llama-3.1-8B"}, {"key": "Model_Path", "value": "/models/NousResearch/Meta-Llama-3.1-8B"}, {"key": "Max_Model_Length", "value": "8192"}, {"key": 
"Resume_From_Checkpoint", "value": "false"}, {"key": "Checkpoint_Path", "value": "/checkpoint"}, {"key": "Lora_R", "value": "8"}, {"key": "Lora_Alpha", "value": "32"}, {"key": "Lora_Dropout", "value": "0.1"}, {"key": "Lora_Target_Modules", "value": "q_proj,up_proj,o_proj,k_proj,down_proj,gate_proj,v_proj"}, {"key": "Bias", "value": "none"}, {"key": "Task_Type", "value": "CAUSAL_LM"}, {"key": "Per_Device_Train_Batch_Size", "value": "1"}, {"key": "Gradient_Accumulation_Steps", "value": "1"}, {"key": "Warmup_Steps", "value": "2"}, {"key": "Save_Steps", "value": "100"}, {"key": "Learning_Rate", "value": "0.0002"}, {"key": "Fp16", "value": "true"}, {"key": "Logging_Steps", "value": "1"}, {"key": "Output_Dir", "value": "/tunedmodels/Bucket-Llama-3.1-8B-english_quotes"}, {"key": "Optim", "value": "paged_adamw_8bit"}, {"key": "Number_of_Training_Epochs", "value": "2"}, {"key": "Require_Persistent_Output_Dir", "value": "true"}], "input_object_storage": [{"par": "https://objectstorage.us-phoenix-1.oraclecloud.com/p/iv-8F3oSRJ8nsbVaq9ev9kjfkZ3zXItSOCSDWKfRa7zT3aPmNf4MijL_4nw_hvvY/n/iduyx1qnmway/b/corrino_hf_oss_models/o/", "mount_location": "/models", "volume_size_in_gbs": 500, "include": ["NousResearch/Meta-Llama-3.1-8B"]}], "output_object_storage": [{"bucket_name": "corrino_tuned_hf_oss_models", "mount_location": "/tunedmodels", "volume_size_in_gbs": 500}]}, {"pre_filled_sample_name": "Fine-Tune meta-llama/Llama-3.2-1B-Instruct (Closed Model) from Hugging Face with Dataset from Hugging Face on A10 VM", "recipe_id": "lora_finetune_nvidia", "deployment_name": "dk_closed_model_open_dataset", "recipe_mode": "job", "recipe_node_shape": "VM.GPU.A10.2", "recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:finetune_lora_dev", "recipe_nvidia_gpu_count": 2, "recipe_ephemeral_storage_size": 300, "recipe_replica_count": 1, "recipe_node_pool_size": 1, "recipe_node_boot_volume_size_in_gbs": 500, "recipe_shared_memory_volume_size_limit_in_mb": 100, "recipe_container_env": [{"key": "Mlflow_Endpoint", "value": "http://mlflow.cluster-tools.svc.cluster.local:5000"}, {"key": "Mlflow_Exp_Name", "value": "oci_ai_blueprints_nvidia_recipe"}, {"key": "Mlflow_Run_Name", "value": "llama-3.2-1B-Instruct-scrolls-gov_report"}, {"key": "Hf_Token", "value": ""}, {"key": "Download_Dataset_From_Hf", "value": "true"}, {"key": "Dataset_Name", "value": "tau/scrolls"}, {"key": "Dataset_Sub_Name", "value": "gov_report"}, {"key": "Dataset_Column_To_Use", "value": "None"}, {"key": "Dataset_Path", "value": "/workspace/datasets"}, {"key": "Download_Model_From_Hf", "value": "true"}, {"key": "Model_Name", "value": "meta-llama/Llama-3.2-1B-Instruct"}, {"key": "Model_Path", "value": "/workspace/models"}, {"key": "Max_Model_Length", "value": "8192"}, {"key": "Resume_From_Checkpoint", "value": "false"}, {"key": "Checkpoint_Path", "value": "/checkpoint"}, {"key": "Lora_R", "value": "8"}, {"key": "Lora_Alpha", "value": "32"}, {"key": "Lora_Dropout", "value": "0.1"}, {"key": "Lora_Target_Modules", "value": "q_proj,up_proj,o_proj,k_proj,down_proj,gate_proj,v_proj"}, {"key": "Bias", "value": "none"}, {"key": "Task_Type", "value": "CAUSAL_LM"}, {"key": "Per_Device_Train_Batch_Size", "value": "1"}, {"key": "Gradient_Accumulation_Steps", "value": "1"}, {"key": "Warmup_Steps", "value": "2"}, {"key": "Save_Steps", "value": "100"}, {"key": "Learning_Rate", "value": "0.0002"}, {"key": "Fp16", "value": "true"}, {"key": "Logging_Steps", "value": "1"}, {"key": "Output_Dir", "value": "/tunedmodels/Llama-3.1-8B-english_quotes"}, {"key": 
"Optim", "value": "paged_adamw_8bit"}, {"key": "Number_of_Training_Epochs", "value": "2"}, {"key": "Require_Persistent_Output_Dir", "value": "true"}], "output_object_storage": [{"bucket_name": "corrino_tuned_hf_oss_models", "mount_location": "/tunedmodels", "volume_size_in_gbs": 500}]}, {"pre_filled_sample_name": "Fine-Tune NousResearch/Meta-Llama-3.1-8B (Open Model) from Hugging Face with Dataset from Hugging Face on A10 VM", "recipe_id": "lora_finetune_nvidia", "deployment_name": "dk_open_model_open_dataset", "recipe_mode": "job", "recipe_node_shape": "VM.GPU.A10.2", "recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:finetune_lora_dev", "recipe_nvidia_gpu_count": 2, "recipe_ephemeral_storage_size": 300, "recipe_replica_count": 1, "recipe_node_pool_size": 1, "recipe_node_boot_volume_size_in_gbs": 500, "recipe_shared_memory_volume_size_limit_in_mb": 100, "recipe_container_env": [{"key": "Mlflow_Endpoint", "value": "http://mlflow.cluster-tools.svc.cluster.local:5000"}, {"key": "Mlflow_Exp_Name", "value": "oci_ai_blueprints_nvidia_recipe"}, {"key": "Mlflow_Run_Name", "value": "oci_ai_blueprints_run"}, {"key": "Hf_Token", "value": "None"}, {"key": "Download_Dataset_From_Hf", "value": "true"}, {"key": "Dataset_Name", "value": "Abirate/english_quotes"}, {"key": "Dataset_Sub_Name", "value": "None"}, {"key": "Dataset_Column_To_Use", "value": "None"}, {"key": "Dataset_Path", "value": "/workspace/datasets"}, {"key": "Download_Model_From_Hf", "value": "true"}, {"key": "Model_Name", "value": "NousResearch/Meta-Llama-3.1-8B"}, {"key": "Model_Path", "value": "/workspace/models"}, {"key": "Max_Model_Length", "value": "8192"}, {"key": "Resume_From_Checkpoint", "value": "false"}, {"key": "Checkpoint_Path", "value": "/checkpoint"}, {"key": "Lora_R", "value": "8"}, {"key": "Lora_Alpha", "value": "32"}, {"key": "Lora_Dropout", "value": "0.1"}, {"key": "Lora_Target_Modules", "value": "q_proj,up_proj,o_proj,k_proj,down_proj,gate_proj,v_proj"}, {"key": "Bias", "value": "none"}, {"key": "Task_Type", "value": "CAUSAL_LM"}, {"key": "Per_Device_Train_Batch_Size", "value": "1"}, {"key": "Gradient_Accumulation_Steps", "value": "1"}, {"key": "Warmup_Steps", "value": "2"}, {"key": "Save_Steps", "value": "100"}, {"key": "Learning_Rate", "value": "0.0002"}, {"key": "Fp16", "value": "true"}, {"key": "Logging_Steps", "value": "1"}, {"key": "Output_Dir", "value": "/tunedmodels/Llama-3.1-8B-english_quotes"}, {"key": "Optim", "value": "paged_adamw_8bit"}, {"key": "Number_of_Training_Epochs", "value": "2"}, {"key": "Require_Persistent_Output_Dir", "value": "true"}], "output_object_storage": [{"bucket_name": "corrino_tuned_hf_oss_models", "mount_location": "/tunedmodels", "volume_size_in_gbs": 500}]}]}, {"blueprint_type": "oci_blueprint", "blueprint_category": "workload_blueprint", "blueprint_title": "Autoscaling", "blueprint_short_description": "Scale inference workloads based on traffic load", "blueprint_long_description": "OCI AI Blueprints supports automatic scaling (autoscaling) of inference workloads to handle varying traffic loads efficiently. This means that when demand increases, OCI AI Blueprints can spin up more pods (containers running your inference jobs) and, if needed, provision additional GPU nodes. 
When demand decreases, it scales back down to save resources and cost.", "pre_filled_samples": [{"pre_filled_sample_name": "vLLM Inference with Automatic Scaling on VM.GPU.A10.2", "recipe_id": "llm_inference_nvidia", "recipe_mode": "service", "deployment_name": "autoscale_vllm_example", "recipe_image_uri": "docker.io/vllm/vllm-openai:v0.9.1", "recipe_node_shape": "VM.GPU.A10.2", "input_object_storage": [{"par": "https://objectstorage.us-ashburn-1.oraclecloud.com/p/qFv5XzocpOoEXjlxL7Q3ZrrCFkx9GkA1fpg97zmnaNEX9WB_WMXLz2rykGuU1hqQ/n/iduyx1qnmway/b/metallama321binstruct/o/", "mount_location": "/models", "volume_size_in_gbs": 100}], "recipe_container_env": [{"key": "tensor_parallel_size", "value": "1"}, {"key": "model_name", "value": ""}, {"key": "Model_Path", "value": "/models"}], "recipe_prometheus_enabled": true, "recipe_replica_count": 1, "recipe_container_port": "8000", "recipe_nvidia_gpu_count": 1, "recipe_container_command_args": ["--model", "$(Model_Path)", "--tensor-parallel-size", "$(tensor_parallel_size)", "--gpu-memory-utilization", "0.95", "--max-model-len", "1024"], "recipe_ephemeral_storage_size": 200, "recipe_node_boot_volume_size_in_gbs": 300, "recipe_node_pool_size": 1, "recipe_shared_memory_volume_size_limit_in_mb": 200, "recipe_startup_probe_params": {"failure_threshold": 30, "endpoint_path": "/health", "port": 8000, "scheme": "HTTP", "initial_delay_seconds": 60, "period_seconds": 2, "success_threshold": 1, "timeout_seconds": 10}, "recipe_liveness_probe_params": {"failure_threshold": 3, "endpoint_path": "/health", "port": 8000, "scheme": "HTTP", "initial_delay_seconds": 65, "period_seconds": 600, "success_threshold": 1, "timeout_seconds": 10}, "recipe_node_autoscaling_params": {"min_nodes": 1, "max_nodes": 2}, "recipe_pod_autoscaling_params": {"min_replicas": 1, "max_replicas": 4}}]}, {"blueprint_type": "oci_blueprint", "blueprint_category": "workload_blueprint", "blueprint_title": "CPU Inference", "blueprint_short_description": "Deploy CPU-based inference with Ollama for cost-effective and GPU-free model serving", "blueprint_long_description": "This blueprint provides a comprehensive framework for testing inference on CPUs using the Ollama platform with a variety of supported models such as Mistral, Gemma, and others available through Ollama. Unlike GPU-dependent solutions, this blueprint is designed for environments where CPU inference is preferred or required. It offers clear guidelines and configuration settings to deploy a robust CPU inference service, enabling thorough performance evaluations and reliability testing. Ollama's lightweight and efficient architecture makes it an ideal solution for developers looking to benchmark and optimize CPU-based inference workloads.\n\nThis blueprint explains how to use CPU inference for running large language models using Ollama. 
It includes two main deployment strategies:\n\n- Serving pre-saved models directly from Object Storage\n\n- Pulling models from Ollama and saving them to Object Storage", "pre_filled_samples": [{"pre_filled_sample_name": "CPU inference with Mistral and BM.Standard.E4", "recipe_id": "cpu_inference", "recipe_mode": "service", "deployment_name": "cpu Inference mistral BME4", "recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:cpu_inference_service_v0.2", "recipe_node_shape": "BM.Standard.E4.128", "input_object_storage": [{"par": "https://objectstorage.us-ashburn-1.oraclecloud.com/p/0LYMMBRGg_OEm_hzp9BG8BlQx7Ccpy3gY-gRzjQQFZRU6peG0pXyHTRHUGZLp82E/n/iduyx1qnmway/b/ollama-models/o/", "mount_location": "/models", "volume_size_in_gbs": 20}], "recipe_container_env": [{"key": "MODEL_NAME", "value": "mistral"}, {"key": "PROMPT", "value": "What is the capital of France?"}], "recipe_replica_count": 1, "recipe_container_port": "11434", "recipe_node_pool_size": 1, "recipe_node_boot_volume_size_in_gbs": 200, "recipe_container_command_args": ["--input_directory", "/models", "--model_name", "mistral"], "recipe_ephemeral_storage_size": 100}, {"pre_filled_sample_name": "CPU inference with Gemma and BM.Standard.E5.192", "recipe_id": "cpu_inference", "recipe_mode": "service", "deployment_name": "cpu Inference gemma BME5", "recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:cpu_inference_service_v0.2", "recipe_node_shape": "BM.Standard.E5.192", "input_object_storage": [{"par": "https://objectstorage.us-ashburn-1.oraclecloud.com/p/0LYMMBRGg_OEm_hzp9BG8BlQx7Ccpy3gY-gRzjQQFZRU6peG0pXyHTRHUGZLp82E/n/iduyx1qnmway/b/ollama-models/o/", "mount_location": "/models", "volume_size_in_gbs": 20}], "recipe_container_env": [{"key": "MODEL_NAME", "value": "gemma"}, {"key": "PROMPT", "value": "What is the capital of Germany?"}], "recipe_replica_count": 1, "recipe_container_port": "11434", "recipe_node_pool_size": 1, "recipe_node_boot_volume_size_in_gbs": 200, "recipe_container_command_args": ["--input_directory", "/models", "--model_name", "gemma"], "recipe_ephemeral_storage_size": 100}, {"pre_filled_sample_name": "CPU inference with mistral and VM.Standard.E4.Flex", "recipe_id": "cpu_inference", "recipe_mode": "service", "deployment_name": "cpu Inference mistral E4Flex", "recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:cpu_inference_service_v0.2", "recipe_node_shape": "VM.Standard.E4.Flex", "recipe_flex_shape_ocpu_count": 4, "recipe_flex_shape_memory_size_in_gbs": 64, "input_object_storage": [{"par": "https://objectstorage.us-ashburn-1.oraclecloud.com/p/0LYMMBRGg_OEm_hzp9BG8BlQx7Ccpy3gY-gRzjQQFZRU6peG0pXyHTRHUGZLp82E/n/iduyx1qnmway/b/ollama-models/o/", "mount_location": "/models", "volume_size_in_gbs": 20}], "recipe_container_env": [{"key": "MODEL_NAME", "value": "mistral"}, {"key": "PROMPT", "value": "What is the capital of Spain?"}], "recipe_replica_count": 1, "recipe_container_port": "11434", "recipe_node_pool_size": 1, "recipe_node_boot_volume_size_in_gbs": 200, "recipe_container_command_args": ["--input_directory", "/models", "--model_name", "mistral"], "recipe_ephemeral_storage_size": 100}]}, {"blueprint_type": "oci_blueprint", "blueprint_category": "workload_blueprint", "blueprint_title": "LLM Inference with vLLM", "blueprint_short_description": "Deploy open-source LLMs to GPUs for inference with vLLM.", "blueprint_long_description": "This blueprint simplifies the deployment of LLMs using an open-source inference engine called vLLM. 
You can deploy a custom model or select from a variety of open-source models on Hugging Face.\n\nThe blueprint deploys the model from an object storage bucket to a GPU node in an OKE cluster in your tenancy. Once deployed, you receive a ready-to-use API endpoint to start generating responses from the model. For mission-critical workloads, you can also configure auto-scaling driven by application metrics like inference latency. To summarize, this blueprint streamlines inference deployment, making it easy to scale and integrate into your applications without deep, technical expertise.", "pre_filled_samples": [{"pre_filled_sample_name": "Meta-Llama-3.1-8B-Instruct from OCI Object Storage on VM.GPU.A10.2 with vLLM", "recipe_id": "llm_inference_nvidia", "recipe_mode": "service", "deployment_name": "vllm-model-from-obj-storage", "recipe_image_uri": "docker.io/vllm/vllm-openai:v0.9.1", "recipe_node_shape": "VM.GPU.A10.2", "recipe_prometheus_enabled": true, "input_object_storage": [{"par": "https://objectstorage.us-ashburn-1.oraclecloud.com/p/IFknABDAjiiF5LATogUbRCcVQ9KL6aFUC1j-P5NSeUcaB2lntXLaR935rxa-E-u1/n/iduyx1qnmway/b/corrino_hf_oss_models/o/", "mount_location": "/models", "volume_size_in_gbs": 500, "include": ["NousResearch/Meta-Llama-3.1-8B-Instruct"]}], "recipe_container_env": [{"key": "tensor_parallel_size", "value": "2"}, {"key": "model_name", "value": "NousResearch/Meta-Llama-3.1-8B-Instruct"}, {"key": "Model_Path", "value": "/models/NousResearch/Meta-Llama-3.1-8B-Instruct"}], "recipe_replica_count": 1, "recipe_container_port": "8000", "recipe_nvidia_gpu_count": 2, "recipe_node_pool_size": 1, "recipe_node_boot_volume_size_in_gbs": 200, "recipe_container_command_args": ["--model", "$(Model_Path)", "--tensor-parallel-size", "$(tensor_parallel_size)"], "recipe_ephemeral_storage_size": 100, "recipe_shared_memory_volume_size_limit_in_mb": 200}, {"pre_filled_sample_name": "meta-llama/Llama-3.2-11B-Vision (Closed Model) from Hugging Face on VM.GPU.A10.2 with vLLM", "recipe_id": "llm_inference_nvidia", "recipe_mode": "service", "deployment_name": "vllm-closed-hf-model", "recipe_image_uri": "docker.io/vllm/vllm-openai:v0.9.1", "recipe_node_shape": "VM.GPU.A10.2", "recipe_container_env": [{"key": "HF_TOKEN", "value": ""}], "recipe_replica_count": 1, "recipe_container_port": "8000", "recipe_prometheus_enabled": true, "recipe_nvidia_gpu_count": 2, "recipe_node_pool_size": 1, "recipe_node_boot_volume_size_in_gbs": 200, "recipe_container_command_args": ["--model", "meta-llama/Llama-3.2-11B-Vision", "--tensor-parallel-size", "2"], "recipe_ephemeral_storage_size": 100, "recipe_shared_memory_volume_size_limit_in_mb": 200}, {"pre_filled_sample_name": "NousResearch/Meta-Llama-3-8B-Instruct (Open Model) from Hugging Face on VM.GPU.A10.2 with vLLM", "recipe_id": "llm_inference_nvidia", "recipe_mode": "service", "deployment_name": "vllm-open-hf-model", "recipe_image_uri": "docker.io/vllm/vllm-openai:v0.9.1", "recipe_node_shape": "VM.GPU.A10.2", "recipe_prometheus_enabled": true, "recipe_container_env": [{"key": "tensor_parallel_size", "value": "2"}, {"key": "model_name", "value": "NousResearch/Meta-Llama-3-8B-Instruct"}], "recipe_replica_count": 1, "recipe_container_port": "8000", "recipe_nvidia_gpu_count": 2, "recipe_node_pool_size": 1, "recipe_node_boot_volume_size_in_gbs": 200, "recipe_container_command_args": ["--model", "$(model_name)", "--tensor-parallel-size", "$(tensor_parallel_size)"], "recipe_ephemeral_storage_size": 100, "recipe_shared_memory_volume_size_limit_in_mb": 200}, 
{"pre_filled_sample_name": "NousResearch/Meta-Llama-3-8B-Instruct (Open Model) from Hugging Face on VM.GPU.A10.2 with vLLM and Endpoint API Key", "recipe_id": "llm_inference_nvidia", "recipe_mode": "service", "deployment_name": "vllm-open-hf-model-api-key-functionality", "recipe_image_uri": "docker.io/vllm/vllm-openai:v0.9.1", "recipe_node_shape": "VM.GPU.A10.2", "recipe_prometheus_enabled": true, "recipe_container_env": [{"key": "VLLM_API_KEY", "value": ""}], "recipe_replica_count": 1, "recipe_container_port": "8000", "recipe_nvidia_gpu_count": 2, "recipe_node_pool_size": 1, "recipe_node_boot_volume_size_in_gbs": 200, "recipe_container_command_args": ["--model", "NousResearch/Meta-Llama-3-8B-Instruct", "--tensor-parallel-size", "2"], "recipe_ephemeral_storage_size": 100, "recipe_shared_memory_volume_size_limit_in_mb": 200}]}, {"blueprint_type": "oci_blueprint", "blueprint_category": "workload_blueprint", "blueprint_title": "Multi-Instance GPU (MIG)", "blueprint_short_description": "Partition GPUs into multiple isolated instances for efficient resource sharing and concurrent workloads", "blueprint_long_description": "Multi-Instance GPU (MIG) is a feature of NVIDIA GPUs that allows a single physical GPU to be partitioned into multiple isolated instances, each acting as an independent GPU with dedicated compute, memory, and cache resources. This enables multiple users or workloads to run concurrently on a single GPU without interfering with each other and without virtualization overhead.\n\nMIG is particularly useful when running multiple smaller models that do not require an entire GPU, such as hosting multiple smaller LLMs (Llama-7B, Mistral-7B, or Gemma-2B) on an A100 or H100 GPU. It ensures resource allocation is optimized, preventing one model from monopolizing the entire GPU while maintaining high throughput. This approach is incredibly well-suited for autoscaling scenarios because many more pods can be scheduled onto a single node depending on the MIG configuration.\n\nCurrently, OCI AI Blueprints supports MIG for H100, H200, and B200s with various slice configurations ranging from 7 mini GPUs to full instances. 
The system supports creating MIG-enabled shared node pools, deploying inference workloads to specific MIG slices, and updating MIG configurations on existing nodes.\n\nTo see supported configurations and resource requests, go to [Mig Configurations](./README.md#mig-configurations).", "pre_filled_samples": [{"pre_filled_sample_name": "MIG-Enabled H100 Shared Node Pool", "deployment_name": "H100_pool_mig", "recipe_mode": "shared_node_pool", "shared_node_pool_size": 1, "shared_node_pool_shape": "BM.GPU.H100.8", "shared_node_pool_boot_volume_size_in_gbs": 1000, "shared_node_pool_mig_config": "all-1g.20gb"}, {"pre_filled_sample_name": "MIG Inference with Multiple Replicas and Autoscaling", "recipe_id": "llm_inference_nvidia", "recipe_mode": "service", "deployment_name": "autoscale_mig", "recipe_image_uri": "docker.io/vllm/vllm-openai:v0.9.1", "recipe_prometheus_enabled": true, "recipe_node_shape": "BM.GPU.H100.8", "recipe_container_env": [{"key": "tensor_parallel_size", "value": "1"}, {"key": "HF_TOKEN", "value": ""}], "recipe_replica_count": 5, "recipe_container_port": "8000", "recipe_nvidia_gpu_count": 1, "recipe_use_shared_node_pool": true, "mig_resource_request": "1g.10gb", "recipe_container_command_args": ["--model", "meta-llama/Llama-3.2-3B-Instruct", "--dtype", "bfloat16", "--tensor-parallel-size", "$(tensor_parallel_size)", "--gpu-memory-utilization", "0.99", "--max-model-len", "16384"], "recipe_ephemeral_storage_size": 30, "recipe_node_boot_volume_size_in_gbs": 300, "recipe_shared_memory_volume_size_limit_in_mb": 1000, "recipe_startup_probe_params": {"failure_threshold": 30, "endpoint_path": "/health", "port": 8000, "scheme": "HTTP", "initial_delay_seconds": 10, "period_seconds": 2, "success_threshold": 1, "timeout_seconds": 1}, "recipe_liveness_probe_params": {"failure_threshold": 3, "endpoint_path": "/health", "port": 8000, "scheme": "HTTP", "initial_delay_seconds": 65, "period_seconds": 600, "success_threshold": 1, "timeout_seconds": 1}, "recipe_pod_autoscaling_params": {"min_replicas": 5, "max_replicas": 10}}, {"pre_filled_sample_name": "MIG Inference Single Replica (20GB Slice)", "recipe_id": "llm_inference_nvidia", "recipe_mode": "service", "deployment_name": "autoscale_mig", "recipe_image_uri": "docker.io/vllm/vllm-openai:v0.9.1", "recipe_node_shape": "BM.GPU.H100.8", "recipe_prometheus_enabled": true, "recipe_container_env": [{"key": "tensor_parallel_size", "value": "1"}, {"key": "HF_TOKEN", "value": ""}], "recipe_replica_count": 1, "recipe_container_port": "8000", "recipe_nvidia_gpu_count": 1, "recipe_use_shared_node_pool": true, "mig_resource_request": "1g.20gb", "recipe_container_command_args": ["--model", "meta-llama/Llama-3.2-3B-Instruct", "--dtype", "bfloat16", "--tensor-parallel-size", "$(tensor_parallel_size)", "--gpu-memory-utilization", "0.99", "--max-model-len", "16384"], "recipe_ephemeral_storage_size": 30, "recipe_node_boot_volume_size_in_gbs": 300, "recipe_shared_memory_volume_size_limit_in_mb": 1000, "recipe_startup_probe_params": {"failure_threshold": 30, "endpoint_path": "/health", "port": 8000, "scheme": "HTTP", "initial_delay_seconds": 10, "period_seconds": 2, "success_threshold": 1, "timeout_seconds": 1}, "recipe_liveness_probe_params": {"failure_threshold": 3, "endpoint_path": "/health", "port": 8000, "scheme": "HTTP", "initial_delay_seconds": 65, "period_seconds": 600, "success_threshold": 1, "timeout_seconds": 1}, "recipe_pod_autoscaling_params": {"min_replicas": 1, "max_replicas": 50}}, {"pre_filled_sample_name": "MIG Inference Single Replica (10GB 
Slice)", "recipe_id": "llm_inference_nvidia", "recipe_mode": "service", "deployment_name": "autoscale_mig", "recipe_image_uri": "docker.io/vllm/vllm-openai:v0.9.1", "recipe_node_shape": "BM.GPU.H100.8", "recipe_prometheus_enabled": true, "recipe_container_env": [{"key": "tensor_parallel_size", "value": "1"}, {"key": "HF_TOKEN", "value": ""}], "recipe_replica_count": 1, "recipe_container_port": "8000", "recipe_nvidia_gpu_count": 1, "recipe_use_shared_node_pool": true, "mig_resource_request": "1g.10gb", "recipe_container_command_args": ["--model", "meta-llama/Llama-3.2-3B-Instruct", "--dtype", "bfloat16", "--tensor-parallel-size", "$(tensor_parallel_size)", "--gpu-memory-utilization", "0.99", "--max-model-len", "16384"], "recipe_ephemeral_storage_size": 30, "recipe_node_boot_volume_size_in_gbs": 300, "recipe_shared_memory_volume_size_limit_in_mb": 1000, "recipe_startup_probe_params": {"failure_threshold": 30, "endpoint_path": "/health", "port": 8000, "scheme": "HTTP", "initial_delay_seconds": 10, "period_seconds": 2, "success_threshold": 1, "timeout_seconds": 1}, "recipe_liveness_probe_params": {"failure_threshold": 3, "endpoint_path": "/health", "port": 8000, "scheme": "HTTP", "initial_delay_seconds": 65, "period_seconds": 600, "success_threshold": 1, "timeout_seconds": 1}, "recipe_pod_autoscaling_params": {"min_replicas": 1, "max_replicas": 50}}, {"pre_filled_sample_name": "Update MIG Configuration by Node Name", "recipe_mode": "update", "deployment_name": "all-1g10gb", "recipe_node_name": "10.0.10.138", "shared_node_pool_mig_config": "all-1g.10gb"}, {"pre_filled_sample_name": "Update MIG Configuration by Node Pool Name", "recipe_mode": "update", "deployment_name": "all-2g-20gb", "recipe_node_pool_name": "h100migpool", "shared_node_pool_mig_config": "all-2g.20gb"}]}, {"blueprint_type": "oci_blueprint", "blueprint_category": "workload_blueprint", "blueprint_title": "Multi-Node Inference", "blueprint_short_description": "Scale large language model inference across multiple GPU nodes using tensor and pipeline parallelism", "blueprint_long_description": "Multi-node inference enables deploying very large language models that cannot fit within the GPU memory of a single node by distributing the workload across multiple computing nodes. This approach combines tensor parallelism (splitting operations across GPUs within a node) and pipeline parallelism (distributing sequential stages across nodes) to efficiently utilize available hardware resources.\n\nThis blueprint is essential when serving models like Llama-3.3-70B-Instruct that require approximately 150GB of GPU memory, exceeding the capacity of single-node configurations. The system uses vLLM and Ray with LeaderWorkerSet (LWS) to manage distributed state across nodes, creating a cluster with one head node and multiple worker nodes.\n\nThe multi-node approach significantly reduces processing time and improves throughput for both real-time and batch predictions. It requires careful planning to determine the appropriate node shapes and GPU requirements based on model size, precision, and available compute shapes. 
The system supports shared node pools and optional RDMA connectivity for enhanced performance.\n\nKey benefits include the ability to serve models that exceed single-node memory limits, improved inference throughput through parallel processing, and efficient resource utilization across distributed GPU infrastructure.", "pre_filled_samples": [{"pre_filled_sample_name": "Multi-Node Inference on VM.GPU.A10 Cluster", "recipe_id": "vllm_multinode_inference", "recipe_mode": "service", "deployment_name": "multinode_inference", "recipe_node_shape": "VM.GPU.A10.2", "recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:ray2430_vllmv083", "input_object_storage": [{"par": "https://objectstorage.us-ashburn-1.oraclecloud.com/p/IFknABDAjiiF5LATogUbRCcVQ9KL6aFUC1j-P5NSeUcaB2lntXLaR935rxa-E-u1/n/iduyx1qnmway/b/corrino_hf_oss_models/o/", "mount_location": "/models", "volume_size_in_gbs": 500, "include": ["NousResearch/Meta-Llama-3.1-8B-Instruct"]}], "recipe_replica_count": 1, "recipe_nvidia_gpu_count": 2, "recipe_ephemeral_storage_size": 150, "recipe_shared_memory_volume_size_limit_in_mb": 10000, "recipe_container_port": "8000", "recipe_use_shared_node_pool": true, "multinode_num_nodes_to_use_from_shared_pool": 2, "recipe_container_command_args": ["--port", "8000", "--model", "/models", "--tensor-parallel-size", "2", "--pipeline-parallel-size", "2", "--gpu-memory-utilization", "0.90", "--distributed-executor-backend", "ray"], "recipe_readiness_probe_params": {"endpoint_path": "/health", "port": 8000, "initial_delay_seconds": 20, "period_seconds": 10}}, {"pre_filled_sample_name": "Multi-Node Inference on BM.GPU.A10 Cluster", "recipe_id": "vllm_multinode_inference", "recipe_mode": "service", "deployment_name": "multinode_inference", "recipe_node_shape": "BM.GPU.A10.4", "recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:ray2430_vllmv083", "input_object_storage": [{"par": "https://objectstorage.us-ashburn-1.oraclecloud.com/p/IFknABDAjiiF5LATogUbRCcVQ9KL6aFUC1j-P5NSeUcaB2lntXLaR935rxa-E-u1/n/iduyx1qnmway/b/corrino_hf_oss_models/o/", "mount_location": "/models", "volume_size_in_gbs": 500, "include": ["NousResearch/Meta-Llama-3.1-8B-Instruct"]}], "recipe_replica_count": 1, "recipe_nvidia_gpu_count": 4, "recipe_ephemeral_storage_size": 150, "recipe_shared_memory_volume_size_limit_in_mb": 10000, "recipe_container_port": "8000", "recipe_use_shared_node_pool": true, "multinode_num_nodes_to_use_from_shared_pool": 2, "recipe_container_command_args": ["--port", "8000", "--model", "/models", "--tensor-parallel-size", "4", "--pipeline-parallel-size", "2", "--gpu-memory-utilization", "0.90", "--distributed-executor-backend", "ray"], "recipe_readiness_probe_params": {"endpoint_path": "/health", "port": 8000, "initial_delay_seconds": 20, "period_seconds": 10}}]}, {"blueprint_type": "oci_blueprint", "blueprint_category": "workload_blueprint", "blueprint_title": "Install OCI AI Blueprints onto an Existing OKE Cluster", "blueprint_short_description": "Deploy OCI AI Blueprints on your existing OKE cluster without creating new infrastructure", "blueprint_long_description": "This guide helps you install and use **OCI AI Blueprints** on an existing OKE cluster that was created outside of blueprints and already has workflows running on it. 
Rather than installing blueprints onto a new cluster, you can leverage an existing cluster with node pools and tools already installed.\n\nThe installation process involves ensuring you have the correct IAM policies in place, retrieving existing cluster OKE and VCN information from the console, deploying the OCI AI Blueprints application onto the existing cluster, and optionally adding existing nodes to be used by blueprints. You can then deploy sample recipes to test functionality.\n\nKey considerations include managing existing tooling like Prometheus, Grafana, or the GPU operator that may already be installed on your cluster. The blueprint installation process can detect and work around these existing components. Additionally, if you have the nvidia-gpu-operator installed and plan to use Multi-Instance GPUs with H100 nodes, special configuration steps are available.\n\nThis approach allows you to:\n\n- Leverage existing cluster resources and configurations\n\n- Add blueprints capabilities without disrupting current workloads\n\n- Utilize existing node pools for blueprint deployments\n\n- Maintain compatibility with pre-installed cluster tools", "pre_filled_samples": [{"pre_filled_sample_name": "Add Existing Node to Control Plane", "recipe_mode": "update", "deployment_name": "startupaddnode", "recipe_node_name": "10.0.10.164", "recipe_node_labels": {"corrino": "a10pool", "corrino/pool-shared-any": "true"}}]}, {"blueprint_type": "oci_blueprint", "blueprint_category": "workload_blueprint", "blueprint_title": "Llama Stack on OCI", "blueprint_short_description": "Pre-packaged GenAI runtime — vLLM + ChromaDB + Postgres (optional Jaeger) ready for one-click deployment", "blueprint_long_description": "Deploy Llama Stack on OCI via OCI AI Blueprints. For more information on Llama Stack: https://github.com/meta-llama/llama-stack\n\nWe are using Postgres for the backend store, chromaDB for the vector database, Jaeger for tracing and vLLM for inference serving.", "pre_filled_samples": [{"pre_filled_sample_name": "Llama 3.1 8B Model with vLLM", "deployment_group": {"name": "group", "deployments": [{"name": "postgres", "recipe": {"recipe_id": "postgres", "deployment_name": "postgres", "recipe_mode": "service", "recipe_node_pool_size": 1, "recipe_node_shape": "VM.Standard.E4.Flex", "recipe_flex_shape_ocpu_count": 2, "recipe_flex_shape_memory_size_in_gbs": 16, "recipe_node_boot_volume_size_in_gbs": 200, "recipe_ephemeral_storage_size": 100, "recipe_image_uri": "docker.io/library/postgres:latest", "recipe_container_port": "5432", "recipe_host_port": "5432", "recipe_container_env": [{"key": "POSTGRES_USER", "value": "llamastack"}, {"key": "POSTGRES_PASSWORD", "value": "llamastack"}, {"key": "POSTGRES_DB", "value": "llamastack"}], "recipe_replica_count": 1}, "exports": ["internal_dns_name"]}, {"name": "chroma", "recipe": {"recipe_id": "chromadb", "deployment_name": "chroma", "recipe_mode": "service", "recipe_node_pool_size": 1, "recipe_node_shape": "VM.Standard.E4.Flex", "recipe_flex_shape_ocpu_count": 2, "recipe_flex_shape_memory_size_in_gbs": 16, "recipe_node_boot_volume_size_in_gbs": 200, "recipe_ephemeral_storage_size": 100, "recipe_image_uri": "docker.io/chromadb/chroma:latest", "recipe_container_port": "8000", "recipe_host_port": "8000", "recipe_container_env": [{"key": "IS_PERSISTENT", "value": "TRUE"}, {"key": "ANONYMIZED_TELEMETRY", "value": "FALSE"}], "recipe_replica_count": 1, "output_object_storage": [{"bucket_name": "chromadb", "mount_location": "/chroma/chroma", "volume_size_in_gbs": 500}]}, 
"exports": ["internal_dns_name"]}, {"name": "vllm", "recipe": {"recipe_id": "llm_inference_nvidia", "deployment_name": "vllm", "recipe_mode": "service", "recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:vllmv0.6.6.pos1", "recipe_node_shape": "VM.GPU.A10.2", "input_object_storage": [{"par": "https://objectstorage.us-ashburn-1.oraclecloud.com/p/IFknABDAjiiF5LATogUbRCcVQ9KL6aFUC1j-P5NSeUcaB2lntXLaR935rxa-E-u1/n/iduyx1qnmway/b/corrino_hf_oss_models/o/", "mount_location": "/models", "volume_size_in_gbs": 500, "include": ["NousResearch/Meta-Llama-3.1-8B-Instruct"]}], "recipe_container_env": [{"key": "tensor_parallel_size", "value": "2"}, {"key": "model_name", "value": "NousResearch/Meta-Llama-3.1-8B-Instruct"}, {"key": "Model_Path", "value": "/models/NousResearch/Meta-Llama-3.1-8B-Instruct"}], "recipe_replica_count": 1, "recipe_container_port": "8000", "recipe_nvidia_gpu_count": 2, "recipe_node_pool_size": 1, "recipe_node_boot_volume_size_in_gbs": 200, "recipe_container_command_args": ["--model", "$(Model_Path)", "--tensor-parallel-size", "$(tensor_parallel_size)"], "recipe_ephemeral_storage_size": 100, "recipe_shared_memory_volume_size_limit_in_mb": 200}, "exports": ["internal_dns_name"]}, {"name": "jaeger", "recipe": {"recipe_id": "jaeger", "deployment_name": "jaeger", "recipe_mode": "service", "recipe_node_pool_size": 1, "recipe_node_shape": "VM.Standard.E4.Flex", "recipe_flex_shape_ocpu_count": 2, "recipe_flex_shape_memory_size_in_gbs": 16, "recipe_node_boot_volume_size_in_gbs": 200, "recipe_ephemeral_storage_size": 100, "recipe_image_uri": "docker.io/jaegertracing/jaeger:latest", "recipe_container_port": "16686", "recipe_additional_ingress_ports": [{"name": "jaeger", "port": 4318, "path": "/jaeger"}], "recipe_replica_count": 1}, "exports": ["internal_dns_name"]}, {"name": "llamastack_app", "recipe": {"recipe_id": "llamastack_app", "deployment_name": "llamastack_app", "recipe_mode": "service", "recipe_node_pool_size": 1, "recipe_node_shape": "VM.Standard.E4.Flex", "recipe_flex_shape_ocpu_count": 2, "recipe_flex_shape_memory_size_in_gbs": 16, "recipe_node_boot_volume_size_in_gbs": 200, "recipe_ephemeral_storage_size": 100, "recipe_image_uri": "docker.io/llamastack/distribution-postgres-demo:latest", "recipe_container_port": "8321", "recipe_container_env": [{"key": "INFERENCE_MODEL", "value": "/models/NousResearch/Meta-Llama-3.1-8B-Instruct"}, {"key": "VLLM_URL", "value": "http://${vllm.internal_dns_name}/v1"}, {"key": "ENABLE_CHROMADB", "value": "1"}, {"key": "CHROMADB_URL", "value": "http://${chroma.internal_dns_name}:8000"}, {"key": "POSTGRES_HOST", "value": "${postgres.internal_dns_name}"}, {"key": "POSTGRES_PORT", "value": "5432"}, {"key": "POSTGRES_DB", "value": "llamastack"}, {"key": "POSTGRES_USER", "value": "llamastack"}, {"key": "POSTGRES_PASSWORD", "value": "llamastack"}, {"key": "TELEMETRY_SINKS", "value": "console,otel_trace"}, {"key": "OTEL_TRACE_ENDPOINT", "value": "http://${jaeger.internal_dns_name}/jaeger/v1/traces"}], "output_object_storage": [{"bucket_name": "llamastack", "mount_location": "/root/.llama", "volume_size_in_gbs": 100}], "recipe_replica_count": 1}, "depends_on": ["postgres", "chroma", "vllm", "jaeger"]}]}}]}, {"blueprint_type": "oci_blueprint", "blueprint_category": "workload_blueprint", "blueprint_title": "Model Storage", "blueprint_short_description": "Download and store models from HuggingFace to OCI Object Storage for efficient blueprint deployment", "blueprint_long_description": "Model storage is a critical component for AI/ML 
workloads, providing efficient access to large language models and other AI assets. OCI AI Blueprints supports storing models in OCI Object Storage, which offers faster loading times and better resource management compared to downloading models directly from HuggingFace during container startup.\n\nThis blueprint provides automated workflows to download models from HuggingFace (both open and gated models) and store them in OCI Object Storage buckets. Once stored, these models can be efficiently accessed by inference blueprints through pre-authenticated requests (PARs) or direct bucket access, significantly reducing deployment times and improving reliability.\n\nThe system supports both open-source models that require no authentication and closed/gated models that require HuggingFace tokens for access. Models are downloaded using optimized parallel workers and stored with appropriate volume sizing to accommodate large model files.", "pre_filled_samples": [{"pre_filled_sample_name": "Download Closed HuggingFace Model to Object Storage", "recipe_id": "example", "recipe_mode": "job", "deployment_name": "model_to_object", "recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:hf_downloader_v1", "recipe_container_command_args": ["meta-llama/Llama-3.2-90B-Vision-Instruct", "--local-dir", "/models", "--max-workers", "4", "--token", ""], "recipe_container_port": "5678", "recipe_node_shape": "VM.Standard.E4.Flex", "recipe_node_pool_size": 1, "recipe_flex_shape_ocpu_count": 4, "recipe_flex_shape_memory_size_in_gbs": 64, "recipe_node_boot_volume_size_in_gbs": 500, "recipe_ephemeral_storage_size": 450, "output_object_storage": [{"bucket_name": "llama3290Bvisioninstruct", "mount_location": "/models", "volume_size_in_gbs": 450}]}, {"pre_filled_sample_name": "Download Open HuggingFace Model to Object Storage", "recipe_id": "example", "recipe_mode": "job", "deployment_name": "model_to_object", "recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:hf_downloader_v1", "recipe_container_command_args": ["NousResearch/Meta-Llama-3.1-405B-FP8", "--local-dir", "/models", "--max-workers", "16"], "recipe_container_port": "5678", "recipe_node_shape": "VM.Standard.E4.Flex", "recipe_node_pool_size": 1, "recipe_flex_shape_ocpu_count": 16, "recipe_flex_shape_memory_size_in_gbs": 256, "recipe_node_boot_volume_size_in_gbs": 1000, "recipe_ephemeral_storage_size": 900, "output_object_storage": [{"bucket_name": "nousllama31405bfp8", "mount_location": "/models", "volume_size_in_gbs": 800}]}]}, {"blueprint_type": "oci_blueprint", "blueprint_category": "workload_blueprint", "blueprint_title": "Using RDMA Enabled Node Pools", "blueprint_short_description": "Enable high-performance inter-node communication using Remote Direct Memory Access for large-scale AI workloads", "blueprint_long_description": "Remote Direct Memory Access (RDMA) is a protocol that enables one node to read from or write to the memory of another node without involving either machine's CPU or operating system, enabling true zero-copy data transfers and dramatically reducing latency and CPU overhead. In large-scale AI workloads such as multi-node training with AllReduce or disaggregated LLM inference, RDMA can yield tremendous performance gains by significantly reducing communication and copy overhead between nodes.\n\nOCI AI Blueprints uses OCI cluster networks with instance pools to provision RDMA-enabled node pools, supporting high-performance compute shapes including BM.GPU.H100.8, BM.GPU.H200.8, and BM.GPU.B4.8. 
The system requires custom node images with proper drivers and libraries for RDMA connectivity, which must be imported from the oci-hpc-oke quickstart repository.\n\nRDMA-enabled deployments are particularly valuable for distributing very large language models (like Llama-3.1-405B-Instruct) that exceed single-node GPU memory capacity, requiring distributed inference across multiple nodes with high-bandwidth, low-latency communication. The technology enables efficient tensor and pipeline parallelism by eliminating traditional network communication bottlenecks.\n\nThe implementation supports both creating new RDMA-enabled shared node pools and integrating OCI AI Blueprints with existing RDMA-enabled clusters, providing flexibility for various deployment scenarios and infrastructure configurations.", "pre_filled_samples": [{"pre_filled_sample_name": "RDMA-Enabled H100 Shared Node Pool", "deployment_name": "H100_rdma_pool", "recipe_mode": "shared_node_pool", "shared_node_pool_size": 2, "shared_node_pool_shape": "BM.GPU.H100.8", "shared_node_pool_boot_volume_size_in_gbs": 1000, "recipe_availability_domain": "TrcQ:EU-FRANKFURT-1-AD-3", "recipe_node_image_ocid": "ocid1.image.oc1.eu-frankfurt-1.aaaaaaaakhpy5kt3p6gjmeqbasnndemp6aetlnbkm57hohrkgksuh4476llq", "multinode_rdma_enabled_in_shared_pool": true}, {"pre_filled_sample_name": "RDMA Distributed Inference (405B Model)", "recipe_id": "llm_inference_nvidia", "recipe_mode": "service", "deployment_name": "405b", "recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:ray2430_vllmv083", "recipe_node_shape": "BM.GPU.H100.8", "recipe_replica_count": 1, "recipe_container_port": "8000", "recipe_nvidia_gpu_count": 8, "recipe_use_shared_node_pool": true, "multinode_rdma_enabled_in_shared_pool": true, "multinode_num_nodes_to_use_from_shared_pool": 2, "input_object_storage": [{"par": "https://iduyx1qnmway.objectstorage.eu-frankfurt-1.oci.customer-oci.com/p/7N2O5JFirNX_CG70t-HPILzHvlTMP4FC9f_eauJVECosqNafIYxwcDwhItQHvaDK/n/iduyx1qnmway/b/llama31405binstruct/o/", "mount_location": "/models", "volume_size_in_gbs": 500}], "recipe_container_env": [{"key": "NCCL_DEBUG", "value": "INFO"}, {"key": "NCCL_DEBUG_SUBSYS", "value": "INIT,NET,ENV"}], "recipe_readiness_probe_params": {"endpoint_path": "/health", "port": 8000, "initial_delay_seconds": 20, "period_seconds": 10}, "recipe_container_command_args": ["--port", "8000", "--model", "/models", "--tensor-parallel-size", "8", "--gpu-memory-utilization", "0.90", "--pipeline-parallel-size", "2", "--distributed-executor-backend", "ray"], "recipe_ephemeral_storage_size": 100, "recipe_shared_memory_volume_size_limit_in_mb": 10000}, {"pre_filled_sample_name": "Update Nodes for RDMA Support", "recipe_mode": "update", "deployment_name": "startupaddnode1", "recipe_node_name": "10.0.10.164", "recipe_node_labels": {"corrino": "h100pool", "corrino/pool-shared-any": "true", "corrino/rdma": "true"}}]}, {"blueprint_type": "oci_blueprint", "blueprint_category": "workload_blueprint", "blueprint_title": "Whisper Transcription API", "blueprint_short_description": "This blueprint provides a complete solution for running **audio/video transcription**, **speaker diarization**, and **summarization** via a RESTful API. It integrates [Faster-Whisper](https://github.com/guillaumekln/faster-whisper) for efficient transcription, [pyannote.audio](https://github.com/pyannote/pyannote-audio) for diarization, and Hugging Face instruction-tuned LLMs (e.g., Mistral-7B) for summarization. 
It supports multi-GPU acceleration, real-time streaming logs, and JSON/text output formats.", "blueprint_long_description": "---", "pre_filled_samples": [{"pre_filled_sample_name": "Deploy Whisper transcription on A10 GPU for real-time speech-to-text", "recipe_id": "whisper transcription", "recipe_mode": "service", "deployment_name": "whisper-transcription-a10", "recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:whisper_transcription_v8", "recipe_node_shape": "VM.GPU.A10.2", "recipe_replica_count": 1, "recipe_container_port": "8000", "recipe_nvidia_gpu_count": 2, "recipe_node_pool_size": 1, "recipe_node_boot_volume_size_in_gbs": 200, "recipe_ephemeral_storage_size": 100, "recipe_shared_memory_volume_size_limit_in_mb": 1000}, {"pre_filled_sample_name": "Deploy Whisper transcription on A100 GPU for high-speed processing", "recipe_id": "whisper transcription", "recipe_mode": "service", "deployment_name": "whisper-transcription-a100", "recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:whisper_transcription_v8", "recipe_node_shape": "BM.GPU.A100.8", "recipe_replica_count": 1, "recipe_container_port": "8000", "recipe_nvidia_gpu_count": 8, "recipe_node_pool_size": 1, "recipe_node_boot_volume_size_in_gbs": 200, "recipe_ephemeral_storage_size": 100, "recipe_shared_memory_volume_size_limit_in_mb": 1000}, {"pre_filled_sample_name": "Deploy Whisper transcription on H100 GPU for next-gen AI workloads", "recipe_id": "whisper transcription", "recipe_mode": "service", "deployment_name": "whisper-transcription-h100", "recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:whisper_transcription_v8", "recipe_node_shape": "BM.GPU.H100.8", "recipe_replica_count": 1, "recipe_container_port": "8000", "recipe_nvidia_gpu_count": 8, "recipe_node_pool_size": 1, "recipe_node_boot_volume_size_in_gbs": 200, "recipe_ephemeral_storage_size": 100, "recipe_shared_memory_volume_size_limit_in_mb": 1000}]}, {"blueprint_type": "oci_blueprint", "blueprint_category": "workload_blueprint", "blueprint_title": "Deployment Groups", "blueprint_short_description": "Connected multi-container deployments in a single blueprint", "blueprint_long_description": "Deployment Groups let you spin up several deployments — each derived from its own blueprint — in a single `POST /deployment` request and treat them as one cohesive application. OCI AI Blueprints automatically sequences those member deployments according to the depends_on relationships you declare, publishes each deployment’s outputs (such as service URLs or internal dns name) for easy discovery, and then injects those outputs wherever you reference the placeholder `${deployment_name.export_key}` inside downstream blueprints. 
What once required a series of separate API calls stitched together with hard-coded endpoints can now be expressed declaratively in one step, with OCI AI Blueprints resolving every cross-service connection at runtime.", "pre_filled_samples": [{"pre_filled_sample_name": "Deployment Groups Showcase: Llama Stack", "deployment_group": {"name": "group", "deployments": [{"name": "postgres", "recipe": {"recipe_id": "postgres", "deployment_name": "postgres", "recipe_mode": "service", "recipe_node_pool_size": 1, "recipe_node_shape": "VM.Standard.E4.Flex", "recipe_flex_shape_ocpu_count": 2, "recipe_flex_shape_memory_size_in_gbs": 16, "recipe_node_boot_volume_size_in_gbs": 200, "recipe_ephemeral_storage_size": 100, "recipe_image_uri": "docker.io/library/postgres:latest", "recipe_container_port": "5432", "recipe_host_port": "5432", "recipe_container_env": [{"key": "POSTGRES_USER", "value": "llamastack"}, {"key": "POSTGRES_PASSWORD", "value": "llamastack"}, {"key": "POSTGRES_DB", "value": "llamastack"}], "recipe_replica_count": 1}, "exports": ["internal_dns_name"]}, {"name": "chroma", "recipe": {"recipe_id": "chromadb", "deployment_name": "chroma", "recipe_mode": "service", "recipe_node_pool_size": 1, "recipe_node_shape": "VM.Standard.E4.Flex", "recipe_flex_shape_ocpu_count": 2, "recipe_flex_shape_memory_size_in_gbs": 16, "recipe_node_boot_volume_size_in_gbs": 200, "recipe_ephemeral_storage_size": 100, "recipe_image_uri": "docker.io/chromadb/chroma:latest", "recipe_container_port": "8000", "recipe_host_port": "8000", "recipe_container_env": [{"key": "IS_PERSISTENT", "value": "TRUE"}, {"key": "ANONYMIZED_TELEMETRY", "value": "FALSE"}], "recipe_replica_count": 1, "output_object_storage": [{"bucket_name": "chromadb", "mount_location": "/chroma/chroma", "volume_size_in_gbs": 500}]}, "exports": ["internal_dns_name"]}, {"name": "vllm", "recipe": {"recipe_id": "llm_inference_nvidia", "deployment_name": "vllm", "recipe_mode": "service", "recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:vllmv0.6.6.pos1", "recipe_node_shape": "VM.GPU.A10.2", "input_object_storage": [{"par": "https://objectstorage.us-ashburn-1.oraclecloud.com/p/IFknABDAjiiF5LATogUbRCcVQ9KL6aFUC1j-P5NSeUcaB2lntXLaR935rxa-E-u1/n/iduyx1qnmway/b/corrino_hf_oss_models/o/", "mount_location": "/models", "volume_size_in_gbs": 500, "include": ["NousResearch/Meta-Llama-3.1-8B-Instruct"]}], "recipe_container_env": [{"key": "tensor_parallel_size", "value": "2"}, {"key": "model_name", "value": "NousResearch/Meta-Llama-3.1-8B-Instruct"}, {"key": "Model_Path", "value": "/models/NousResearch/Meta-Llama-3.1-8B-Instruct"}], "recipe_replica_count": 1, "recipe_container_port": "8000", "recipe_nvidia_gpu_count": 2, "recipe_node_pool_size": 1, "recipe_node_boot_volume_size_in_gbs": 200, "recipe_container_command_args": ["--model", "$(Model_Path)", "--tensor-parallel-size", "$(tensor_parallel_size)"], "recipe_ephemeral_storage_size": 100, "recipe_shared_memory_volume_size_limit_in_mb": 200}, "exports": ["internal_dns_name"]}, {"name": "jaeger", "recipe": {"recipe_id": "jaeger", "deployment_name": "jaeger", "recipe_mode": "service", "recipe_node_pool_size": 1, "recipe_node_shape": "VM.Standard.E4.Flex", "recipe_flex_shape_ocpu_count": 2, "recipe_flex_shape_memory_size_in_gbs": 16, "recipe_node_boot_volume_size_in_gbs": 200, "recipe_ephemeral_storage_size": 100, "recipe_image_uri": "docker.io/jaegertracing/jaeger:latest", "recipe_container_port": "16686", "recipe_additional_ingress_ports": [{"name": "jaeger", "port": 4318, "path": "/jaeger"}], 
"recipe_replica_count": 1}, "exports": ["internal_dns_name"]}, {"name": "llamastack_app", "recipe": {"recipe_id": "llamastack_app", "deployment_name": "llamastack_app", "recipe_mode": "service", "recipe_node_pool_size": 1, "recipe_node_shape": "VM.Standard.E4.Flex", "recipe_flex_shape_ocpu_count": 2, "recipe_flex_shape_memory_size_in_gbs": 16, "recipe_node_boot_volume_size_in_gbs": 200, "recipe_ephemeral_storage_size": 100, "recipe_image_uri": "docker.io/llamastack/distribution-postgres-demo:latest", "recipe_container_port": "8321", "recipe_container_env": [{"key": "INFERENCE_MODEL", "value": "/models/NousResearch/Meta-Llama-3.1-8B-Instruct"}, {"key": "VLLM_URL", "value": "http://${vllm.internal_dns_name}/v1"}, {"key": "ENABLE_CHROMADB", "value": "1"}, {"key": "CHROMADB_URL", "value": "http://${chroma.internal_dns_name}:8000"}, {"key": "POSTGRES_HOST", "value": "${postgres.internal_dns_name}"}, {"key": "POSTGRES_PORT", "value": "5432"}, {"key": "POSTGRES_DB", "value": "llamastack"}, {"key": "POSTGRES_USER", "value": "llamastack"}, {"key": "POSTGRES_PASSWORD", "value": "llamastack"}, {"key": "TELEMETRY_SINKS", "value": "console,otel_trace"}, {"key": "OTEL_TRACE_ENDPOINT", "value": "http://${jaeger.internal_dns_name}/jaeger/v1/traces"}], "output_object_storage": [{"bucket_name": "llamastack", "mount_location": "/root/.llama", "volume_size_in_gbs": 100}], "recipe_replica_count": 1}, "depends_on": ["postgres", "chroma", "vllm", "jaeger"]}]}}]}, {"blueprint_type": "oci_blueprint", "blueprint_category": "workload_blueprint", "blueprint_title": "Shared Node Pools", "blueprint_short_description": "Create persistent node pools for efficient blueprint deployment without infrastructure recycling", "blueprint_long_description": "Shared node pools enable you to launch infrastructure independent of individual blueprints, allowing multiple blueprints to deploy and undeploy on the same underlying infrastructure without the overhead of spinning up new node pools for each deployment. This approach eliminates the time-consuming process of infrastructure provisioning and teardown, particularly beneficial for bare metal shapes that require longer recycle times.\n\nWhen you deploy a standard blueprint, OCI AI Blueprints creates a separate node pool for each blueprint and destroys it upon undeployment. Shared node pools solve this inefficiency by providing persistent infrastructure that can host multiple blueprints simultaneously or sequentially. This is especially valuable when you want to deploy multiple blueprints on the same hardware (e.g., two blueprints each using 2 GPUs on a 4-GPU shape) or need rapid deployment cycles.\n\nThe system supports both selector-based and non-selector deployment strategies. With selectors, you can use naming conventions to ensure specific blueprints land on designated shared node pools, providing precise control over resource allocation. Without selectors, blueprints will deploy to any available shared node pool matching the required shape.\n\nShared node pools are compatible with any blueprint and support all OCI compute shapes, with special considerations for bare metal configurations that require boot volume size specifications.\n\n**Note**: The list of shapes below are supported by Blueprints, but not yet supported by OKE, requiring blueprints to treat them as self-managed nodes. These require:\n\n1. Specifying the Availability Domain of the instance type\n\n2. 
Specifying the custom image OCID to use for the node\n\nAdditional required fields:\n\n```json\n\n\"recipe_availability_domain\": \"\",\n\n\"recipe_node_image_ocid\": \"\"\n\n```\n\nSee [this recipe](./shared_node_pool_B200_BM.json) as an example for these parameters.\n\n[This document section](../using_rdma_enabled_node_pools/README.md#import-a-custom-image) describes how to import a custom image and provides links to import custom images for various shapes.", "pre_filled_samples": [{"pre_filled_sample_name": "Shared Node Pool for BM.GPU.A10", "deployment_name": "BM.GPU.A10.4 shared pool", "recipe_mode": "shared_node_pool", "shared_node_pool_size": 2, "shared_node_pool_shape": "BM.GPU.A10.4", "shared_node_pool_boot_volume_size_in_gbs": 500}, {"pre_filled_sample_name": "Shared Node Pool for VM.GPU.A10", "deployment_name": "VM.GPU.A10.2 shared pool", "recipe_mode": "shared_node_pool", "shared_node_pool_size": 2, "shared_node_pool_shape": "VM.GPU.A10.2", "shared_node_pool_boot_volume_size_in_gbs": 500}, {"pre_filled_sample_name": "vLLM Inference on Shared Pool", "recipe_id": "llm_inference_nvidia", "recipe_mode": "service", "deployment_name": "vLLM Inference Deployment", "recipe_image_uri": "docker.io/vllm/vllm-openai:v0.9.1", "recipe_node_shape": "BM.GPU.A10.4", "input_object_storage": [{"par": "https://objectstorage.us-ashburn-1.oraclecloud.com/p/IFknABDAjiiF5LATogUbRCcVQ9KL6aFUC1j-P5NSeUcaB2lntXLaR935rxa-E-u1/n/iduyx1qnmway/b/corrino_hf_oss_models/o/", "mount_location": "/models", "volume_size_in_gbs": 500, "include": ["NousResearch/Meta-Llama-3.1-8B-Instruct"]}], "recipe_container_env": [{"key": "tensor_parallel_size", "value": "2"}, {"key": "model_name", "value": "NousResearch/Meta-Llama-3.1-8B-Instruct"}, {"key": "Model_Path", "value": "/models/NousResearch/Meta-Llama-3.1-8B-Instruct"}], "recipe_replica_count": 1, "recipe_container_port": "8000", "recipe_nvidia_gpu_count": 2, "recipe_use_shared_node_pool": true, "recipe_node_boot_volume_size_in_gbs": 200, "recipe_container_command_args": ["--model", "$(Model_Path)", "--tensor-parallel-size", "$(tensor_parallel_size)"], "recipe_ephemeral_storage_size": 100, "recipe_shared_memory_volume_size_limit_in_mb": 1000}]}, {"blueprint_type": "oci_blueprint", "blueprint_category": "workload_blueprint", "blueprint_title": "Startup, Liveness, and Readiness Probes", "blueprint_short_description": "Configure application health monitoring and startup validation for reliable service deployment", "blueprint_long_description": "Startup, Liveness, and Readiness Probes are essential Kubernetes tools that ensure your applications are truly ready to serve traffic and remain healthy throughout their lifecycle. These probes are particularly critical for LLM inference services that require time to load model weights before becoming ready to serve requests.\n\nThis blueprint demonstrates how to configure these probes with any OCI AI Blueprint deployment to improve service reliability and prevent traffic routing to unhealthy containers. 
The probes can be applied to any blueprint type - inference, training, or custom workloads - providing consistent health monitoring across your AI infrastructure.", "pre_filled_samples": [{"pre_filled_sample_name": "vLLM Autoscaling with Health Probes", "recipe_id": "llm_inference_nvidia", "recipe_mode": "service", "deployment_name": "autoscale_with_fss", "recipe_image_uri": "docker.io/vllm/vllm-openai:v0.9.1", "recipe_node_shape": "VM.GPU.A10.2", "recipe_container_env": [{"key": "tensor_parallel_size", "value": "1"}, {"key": "Model_Path", "value": "/models/models/meta-llama/Llama-3.2-1B-Instruct"}], "recipe_replica_count": 1, "recipe_container_port": "8000", "recipe_nvidia_gpu_count": 1, "recipe_container_command_args": ["--model", "$(Model_Path)", "--tensor-parallel-size", "$(tensor_parallel_size)", "--gpu-memory-utilization", "0.99", "--max-model-len", "1024"], "recipe_ephemeral_storage_size": 200, "recipe_node_boot_volume_size_in_gbs": 300, "recipe_node_pool_size": 1, "recipe_shared_memory_volume_size_limit_in_mb": 200, "recipe_startup_probe_params": {"failure_threshold": 30, "endpoint_path": "/health", "port": 8000, "scheme": "HTTP", "initial_delay_seconds": 10, "period_seconds": 2, "success_threshold": 1, "timeout_seconds": 1}, "recipe_liveness_probe_params": {"failure_threshold": 3, "endpoint_path": "/health", "port": 8000, "scheme": "HTTP", "initial_delay_seconds": 65, "period_seconds": 600, "success_threshold": 1, "timeout_seconds": 1}, "recipe_pod_autoscaling_params": {"min_replicas": 1, "max_replicas": 4}, "recipe_node_autoscaling_params": {"min_nodes": 1, "max_nodes": 2}, "input_file_system": [{"file_system_ocid": "ocid1.filesystem.oc1.iad.aaaaaaaaaaklirslnfqwillqojxwiotjmfsc2ylefuzqaaaa", "mount_target_ocid": "ocid1.mounttarget.oc1.iad.aaaaacvipp3o7rlwnfqwillqojxwiotjmfsc2ylefuzqaaaa", "mount_location": "/models", "volume_size_in_gbs": 50}]}]}, {"blueprint_type": "oci_blueprint", "blueprint_category": "workload_blueprint", "blueprint_title": "Teams", "blueprint_short_description": "Enforce resource quotas and fair sharing between teams using Kueue job queuing for efficient cluster utilization", "blueprint_long_description": "Teams in OCI AI Blueprints enables administrators to enforce resource quotas and fair sharing between different organizational units, ensuring efficient allocation of GPU and CPU resources across multiple teams within a shared cluster. The system leverages Kueue, a Kubernetes job queuing system, to manage AI/ML workloads with workload queuing, prioritization, and resource-aware scheduling.\n\nEach team functions as a logical grouping backed by a Kueue ClusterQueue and LocalQueue, with configurable nominal quotas (guaranteed resources), borrowing limits (extra resources when available), and lending limits (idle resources offered to other teams). This approach enables fair sharing, dynamic resource allocation, and improved utilization across workloads while maintaining strict resource boundaries.\n\nThe team system supports multi-tenant clusters where business units, research groups, or customers can be isolated while still sharing idle GPU/CPU capacity. Jobs are admitted based on available quotas and resource policies, with priority thresholds determining which teams can exceed their nominal quotas when extra resources are available.\n\nTeams are particularly valuable for capacity planning, expressing organizational-level GPU budgets in code, and tracking consumption across different groups. 
The system automatically handles resource borrowing and lending through a shared cohort, ensuring that resources never sit idle while respecting team boundaries and priorities.", "pre_filled_samples": [{"pre_filled_sample_name": "Create Team with Resource Quotas", "recipe_mode": "team", "deployment_name": "create_team", "team": {"team_name": "randomteam", "priority_threshold": 100, "quotas": [{"shape_name": "BM.GPU.H100.8", "cpu_nominal_quota": "10", "cpu_borrowing_limit": "4", "cpu_lending_limit": "4", "mem_nominal_quota": "10", "mem_borrowing_limit": "4", "mem_lending_limit": "4", "gpu_nominal_quota": "10", "gpu_borrowing_limit": "4", "gpu_lending_limit": "4"}, {"shape_name": "VM.GPU.A10.2", "cpu_nominal_quota": "10", "cpu_borrowing_limit": "4", "cpu_lending_limit": "4", "mem_nominal_quota": "10", "mem_borrowing_limit": "4", "mem_lending_limit": "4", "gpu_nominal_quota": "10", "gpu_borrowing_limit": "4", "gpu_lending_limit": "4"}]}}, {"pre_filled_sample_name": "Create Job with Team Assignment", "recipe_id": "healthcheck", "recipe_mode": "job", "deployment_name": "create_job_with_team", "recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:healthcheck_v0.3", "recipe_node_shape": "VM.GPU.A10.2", "recipe_use_shared_node_pool": true, "recipe_team_info": {"team_name": "randomteam"}, "output_object_storage": [{"bucket_name": "healthcheck2", "mount_location": "/healthcheck_results", "volume_size_in_gbs": 20}], "recipe_container_command_args": ["--dtype", "float16", "--output_dir", "/healthcheck_results", "--expected_gpus", "A10:2,A100:0,H100:0"], "recipe_replica_count": 1, "recipe_nvidia_gpu_count": 2, "recipe_node_pool_size": 1, "recipe_node_boot_volume_size_in_gbs": 200, "recipe_ephemeral_storage_size": 100, "recipe_shared_memory_volume_size_limit_in_mb": 1000, "recipe_container_cpu_count": 4, "recipe_container_memory_size": 20}]}] \ No newline at end of file diff --git a/docs/about.md b/docs/about.md index 2dd3985..aed9498 100644 --- a/docs/about.md +++ b/docs/about.md @@ -36,8 +36,8 @@ | ------------------------------ | --------------------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------------- | | **Customize Blueprints** | Tailor existing OCI AI Blueprints to suit your exact AI workload needs—everything from hyperparameters to node counts and hardware. | [Read More](custom_blueprints/README.md) | | **Updating OCI AI Blueprints** | Keep your OCI AI Blueprints environment current with the latest control plane and portal updates. | [Read More](../INSTALLING_ONTO_EXISTING_CLUSTER_README.md) | -| **Shared Node Pool** | Use longer-lived resources (e.g., bare metal nodes) across multiple blueprints or to persist resources after a blueprint is undeployed. | [Read More](sample_blueprints/platform_feature_blueprints/shared_node_pools/README.md) | -| **Auto-Scaling** | Automatically adjust resource usage based on infrastructure or application-level metrics to optimize performance and costs. | [Read More](sample_blueprints/platform_feature_blueprints/auto_scaling/README.md) | +| **Shared Node Pool** | Use longer-lived resources (e.g., bare metal nodes) across multiple blueprints or to persist resources after a blueprint is undeployed. | [Read More](sample_blueprints/platform_features/shared_node_pools/README.md) | +| **Auto-Scaling** | Automatically adjust resource usage based on infrastructure or application-level metrics to optimize performance and costs. 
| [Read More](sample_blueprints/model_serving/auto_scaling/README.md) | --- @@ -76,13 +76,13 @@ A: A: Deploy a vLLM blueprint, then use a tool like LLMPerf to run benchmarking against your inference endpoint. Contact us for more details. **Q: Where can I see the full list of blueprints?** -A: All available blueprints are listed [here](sample_blueprints/platform_feature_blueprints/exisiting_cluster_installation/README.md). If you need something custom, please let us know. +A: All available blueprints are listed [here](sample_blueprints/other/exisiting_cluster_installation/README.md). If you need something custom, please let us know. **Q: How do I check logs for troubleshooting?** A: Use `kubectl` to inspect pod logs in your OKE cluster. **Q: Does OCI AI Blueprints support auto-scaling?** -A: Yes, we leverage KEDA for application-driven auto-scaling. See [documentation](sample_blueprints/platform_feature_blueprints/auto_scaling/README.md). +A: Yes, we leverage KEDA for application-driven auto-scaling. See [documentation](sample_blueprints/model_serving/auto_scaling/README.md). **Q: Which GPUs are compatible?** A: Any NVIDIA GPUs available in your OCI region (A10, A100, H100, etc.). @@ -91,4 +91,4 @@ A: Any NVIDIA GPUs available in your OCI region (A10, A100, H100, etc.). A: Yes, though testing on clusters running other workloads is ongoing. We recommend a clean cluster for best stability. **Q: How do I run multiple blueprints on the same node?** -A: Enable shared node pools. [Read more here](sample_blueprints/platform_feature_blueprints/shared_node_pools/README.md). +A: Enable shared node pools. [Read more here](sample_blueprints/platform_features/shared_node_pools/README.md). diff --git a/docs/api_documentation.md b/docs/api_documentation.md index e78125d..08efdf3 100644 --- a/docs/api_documentation.md +++ b/docs/api_documentation.md @@ -36,11 +36,11 @@ | recipe_container_env | string | No | Values of the recipe container init arguments. See the Blueprint Arguments section below for details. Example: `[{"key": "tensor_parallel_size","value": "2"},{"key": "model_name","value": "NousResearch/Meta-Llama-3.1-8B-Instruct"},{"key": "Model_Path","value": "/models/NousResearch/Meta-Llama-3.1-8B-Instruct"}]` | | skip_capacity_validation | boolean | No | Determines whether validation checks on shape capacity are performed before initiating deployment. If your deployment is failing validation due to capacity errors but you believe this not to be true, you should set `skip_capacity_validation` to be `true` in the recipe JSON to bypass all checks for Shape capacity. | -For autoscaling parameters, visit [autoscaling](sample_blueprints/platform_feature_blueprints/auto_scaling/README.md). +For autoscaling parameters, visit [autoscaling](sample_blueprints/model_serving/auto_scaling/README.md). -For multinode inference parameters, visit [multinode inference](sample_blueprints/workload_blueprints/multi-node-inference/README.md) +For multinode inference parameters, visit [multinode inference](sample_blueprints/model_serving/multi-node-inference/README.md) -For MIG parameters, visit [MIG shared pool configurations](sample_blueprints/platform_feature_blueprints/mig_multi_instance_gpu/mig_inference_single_replica.json), [update MIG configuration](sample_blueprints/platform_feature_blueprints/mig_multi_instance_gpu/mig_inference_single_replica.json), and [MIG recipe configuration](sample_blueprints/platform_feature_blueprints/mig_multi_instance_gpu/mig_inference_single_replica.json). 
+For MIG parameters, visit [MIG shared pool configurations](sample_blueprints/model_serving/mig_multi_instance_gpu/mig_inference_single_replica.json), [update MIG configuration](sample_blueprints/model_serving/mig_multi_instance_gpu/mig_inference_single_replica.json), and [MIG recipe configuration](sample_blueprints/model_serving/mig_multi_instance_gpu/mig_inference_single_replica.json). ### Blueprint Container Arguments @@ -94,13 +94,13 @@ This recipe deploys the vLLM container image. Follow the vLLM docs to pass the c There are 3 blueprints that we are providing out of the box. Following are example recipe.json snippets that you can use to deploy the blueprints quickly for a test run. |Blueprint|Scenario|Sample JSON| |----|----|---- -|LLM Inference using NVIDIA shapes and vLLM|Deployment with default Llama-3.1-8B model using PAR|View sample JSON here [here](sample_blueprints/workload_blueprints/llm_inference_with_vllm/vllm-open-hf-model.json) -|MLCommons Llama-2 Quantized 70B LORA Fine-Tuning on A100|Default deployment with model and dataset ingested using PAR|View sample JSON here [here](sample_blueprints/workload_blueprints/lora-benchmarking/mlcommons_lora_finetune_nvidia_sample_recipe.json) -|LORA Fine-Tune Blueprint|Open Access Model Open Access Dataset Download from Huggingface (no token required)|View sample JSON [here](sample_blueprints/workload_blueprints/lora-fine-tuning/open_model_open_dataset_hf.backend.json) -|LORA Fine-Tune Blueprint|Closed Access Model Open Access Dataset Download from Huggingface (Valid Auth Token Is Required!!)|View sample JSON [here](sample_blueprints/workload_blueprints/lora-fine-tuning/closed_model_open_dataset_hf.backend.json) -|LORA Fine-Tune Blueprint|Bucket Model Open Access Dataset Download from Huggingface (no token required)|View sample JSON [here](sample_blueprints/workload_blueprints/lora-fine-tuning/bucket_par_open_dataset.backend.json) -|LORA Fine-Tune Blueprint|Get Model from Bucket in Another Region / Tenancy using Pre-Authenticated_Requests (PAR) Open Access Dataset Download from Huggingface (no token required)|View sample JSON [here](sample_blueprints/workload_blueprints/lora-fine-tuning/bucket_model_open_dataset.backend.json) -|LORA Fine-Tune Blueprint|Bucket Model Bucket Checkpoint Open Access Dataset Download from Huggingface (no token required)|View sample JSON [here](sample_blueprints/workload_blueprints/lora-fine-tuning/bucket_par_open_dataset.backend.json) +|LLM Inference using NVIDIA shapes and vLLM|Deployment with default Llama-3.1-8B model using PAR|View sample JSON [here](sample_blueprints/model_serving/llm_inference_with_vllm/vllm-open-hf-model.json) +|MLCommons Llama-2 Quantized 70B LORA Fine-Tuning on A100|Default deployment with model and dataset ingested using PAR|View sample JSON [here](sample_blueprints/gpu_benchmarking/lora-benchmarking/mlcommons_lora_finetune_nvidia_sample_recipe.json) +|LORA Fine-Tune Blueprint|Open Access Model Open Access Dataset Download from Huggingface (no token required)|View sample JSON [here](sample_blueprints/model_fine_tuning/lora-fine-tuning/open_model_open_dataset_hf.backend.json) +|LORA Fine-Tune Blueprint|Closed Access Model Open Access Dataset Download from Huggingface (Valid Auth Token Is Required!!)|View sample JSON [here](sample_blueprints/model_fine_tuning/lora-fine-tuning/closed_model_open_dataset_hf.backend.json) +|LORA Fine-Tune Blueprint|Bucket Model Open Access Dataset Download from Huggingface (no token required)|View sample JSON 
[here](sample_blueprints/model_fine_tuning/lora-fine-tuning/bucket_par_open_dataset.backend.json) +|LORA Fine-Tune Blueprint|Get Model from Bucket in Another Region / Tenancy using Pre-Authenticated Requests (PAR) Open Access Dataset Download from Huggingface (no token required)|View sample JSON [here](sample_blueprints/model_fine_tuning/lora-fine-tuning/bucket_model_open_dataset.backend.json) +|LORA Fine-Tune Blueprint|Bucket Model Bucket Checkpoint Open Access Dataset Download from Huggingface (no token required)|View sample JSON [here](sample_blueprints/model_fine_tuning/lora-fine-tuning/bucket_par_open_dataset.backend.json) ## Undeploy a Blueprint diff --git a/docs/common_workflows/deploying_blueprints_onto_specific_nodes/README.md b/docs/common_workflows/deploying_blueprints_onto_specific_nodes/README.md index a0047c6..6c4f4bd 100644 --- a/docs/common_workflows/deploying_blueprints_onto_specific_nodes/README.md +++ b/docs/common_workflows/deploying_blueprints_onto_specific_nodes/README.md @@ -21,7 +21,7 @@ If you have existing node pools in your original OKE cluster that you'd like Blu 2. Go to the stack and click "Application information". Click the API Url. 3. Login with the `Admin Username` and `Admin Password` in the Application information tab. 4. Click the link next to "deployment" which will take you to a page with "Deployment List", and a content box. -5. Paste in the sample blueprint json found [here](../../sample_blueprints/platform_feature_blueprints/exisiting_cluster_installation/add_node_to_control_plane.json). +5. Paste in the sample blueprint json found [here](../../sample_blueprints/other/exisiting_cluster_installation/add_node_to_control_plane.json). 6. Modify the "recipe_node_name" field to the private IP address you found in step 1 above. 7. Click "POST". This is a fast operation. 8. Wait about 20 seconds and refresh the page. It should look like: diff --git a/docs/common_workflows/working_with_large_models/README.md b/docs/common_workflows/working_with_large_models/README.md index f5da6a8..5d08ae5 100644 --- a/docs/common_workflows/working_with_large_models/README.md +++ b/docs/common_workflows/working_with_large_models/README.md @@ -40,7 +40,7 @@ Steps: 1. Create a bucket in object storage in the same region as the shared node pool (decrease copy times). In our example, we will call this something similar to the name of the model we plan to use: `llama3290Bvisioninstruct` -2. Once the bucket is finished creating, deploy [this blueprint](../../sample_blueprints/platform_feature_blueprints/model_storage/download_closed_hf_model_to_object_storage.json) to copy `meta-llama/Llama-3.2-90B-Vision-Instruct` to the bucket you created. +2. Once the bucket is finished creating, deploy [this blueprint](../../sample_blueprints/other/model_storage/download_closed_hf_model_to_object_storage.json) to copy `meta-llama/Llama-3.2-90B-Vision-Instruct` to the bucket you created. - **Note**: The blueprint assumes you created the bucket using the name `llama3290Bvisioninstruct`. If you changed the name, you will also need to modify it in the example blueprint. 
diff --git a/docs/sample_blueprints/README.md b/docs/sample_blueprints/README.md index 7f50b74..af4e4aa 100644 --- a/docs/sample_blueprints/README.md +++ b/docs/sample_blueprints/README.md @@ -12,18 +12,18 @@ You may use any blueprint JSON from these categories as the payload in the `/dep | Feature Category | Type | Documentation | Description | | ---------------------------------------------------------------- | -------------- | ---------------------------------------------------- | ------------------------------------------------------------------------------------------- | -| [Autoscaling](platform_feature_blueprints/auto_scaling/README.md) | Inference | [Guide](platform_feature_blueprints/auto_scaling/README.md) | Scale inference workloads based on traffic load with automatic pod and node scaling | -| [CPU Inference](workload_blueprints/cpu-inference/README.md) | Inference | [Guide](workload_blueprints/cpu-inference/README.md) | Deploy CPU-based inference with Ollama for cost-effective and GPU-free model serving | -| [Existing Cluster Installation](platform_feature_blueprints/exisiting_cluster_installation/README.md) | Infrastructure | [Guide](platform_feature_blueprints/exisiting_cluster_installation/README.md) | Deploy OCI AI Blueprints on your existing OKE cluster without creating new infrastructure | -| [GPU Health Check](workload_blueprints/gpu-health-check/README.md) | Diagnostics | [Guide](workload_blueprints/gpu-health-check/README.md) | Comprehensive GPU health validation and diagnostics for production readiness | -| [vLLM Inference](workload_blueprints/llm_inference_with_vllm/README.md) | Inference | [Guide](workload_blueprints/llm_inference_with_vllm/README.md) | Deploy large language models using vLLM for high-performance inference | -| [Llama Stack](workload_blueprints/llama-stack/README.md) | Application | [Guide](workload_blueprints/llama-stack/README.md) | Complete GenAI runtime with vLLM, ChromaDB, Postgres, and Jaeger for production deployments | -| [LoRA Benchmarking](workload_blueprints/lora-benchmarking/README.md) | Training | [Guide](workload_blueprints/lora-benchmarking/README.md) | Benchmark fine-tuning performance using MLCommons methodology | -| [LoRA Fine-Tuning](workload_blueprints/lora-fine-tuning/README.md) | Training | [Guide](workload_blueprints/lora-fine-tuning/README.md) | Efficiently fine-tune large language models using Low-Rank Adaptation | -| [Multi-Instance GPU](platform_feature_blueprints/mig_multi_instance_gpu/README.md) | Infrastructure | [Guide](platform_feature_blueprints/mig_multi_instance_gpu/README.md) | Partition H100 GPUs into multiple isolated instances for efficient resource sharing | -| [Model Storage](platform_feature_blueprints/model_storage/README.md) | Storage | [Guide](platform_feature_blueprints/model_storage/README.md) | Download and store models from HuggingFace to OCI Object Storage | -| [Multi-Node Inference](workload_blueprints/multi-node-inference/README.md) | Inference | [Guide](workload_blueprints/multi-node-inference/README.md) | Scale large language model inference across multiple GPU nodes | -| [Shared Node Pools](platform_feature_blueprints/shared_node_pools/README.md) | Infrastructure | [Guide](platform_feature_blueprints/shared_node_pools/README.md) | Create persistent node pools for efficient blueprint deployment | -| [Teams](platform_feature_blueprints/teams/README.md) | Management | [Guide](platform_feature_blueprints/teams/README.md) | Enforce resource quotas and fair sharing between teams using Kueue | -| 
[RDMA Node Pools](platform_feature_blueprints/using_rdma_enabled_node_pools/README.md) | Infrastructure | [Guide](platform_feature_blueprints/using_rdma_enabled_node_pools/README.md) | Enable high-performance inter-node communication using Remote Direct Memory Access | -| [Startup & Health Probes](platform_feature_blueprints/startup_liveness_readiness_probes/README.md) | Configuration | [Guide](platform_feature_blueprints/startup_liveness_readiness_probes/README.md) | Configure application health monitoring and startup validation | +| [Autoscaling](model_serving/auto_scaling/README.md) | Inference | [Guide](model_serving/auto_scaling/README.md) | Scale inference workloads based on traffic load with automatic pod and node scaling | +| [CPU Inference](model_serving/cpu-inference/README.md) | Inference | [Guide](model_serving/cpu-inference/README.md) | Deploy CPU-based inference with Ollama for cost-effective and GPU-free model serving | +| [Existing Cluster Installation](other/exisiting_cluster_installation/README.md) | Infrastructure | [Guide](other/exisiting_cluster_installation/README.md) | Deploy OCI AI Blueprints on your existing OKE cluster without creating new infrastructure | +| [GPU Health Check](gpu_health_check/gpu-health-check/README.md) | Diagnostics | [Guide](gpu_health_check/gpu-health-check/README.md) | Comprehensive GPU health validation and diagnostics for production readiness | +| [vLLM Inference](model_serving/llm_inference_with_vllm/README.md) | Inference | [Guide](model_serving/llm_inference_with_vllm/README.md) | Deploy large language models using vLLM for high-performance inference | +| [Llama Stack](other/llama-stack/README.md) | Application | [Guide](other/llama-stack/README.md) | Complete GenAI runtime with vLLM, ChromaDB, Postgres, and Jaeger for production deployments | +| [LoRA Benchmarking](gpu_benchmarking/lora-benchmarking/README.md) | Training | [Guide](gpu_benchmarking/lora-benchmarking/README.md) | Benchmark fine-tuning performance using MLCommons methodology | +| [LoRA Fine-Tuning](model_fine_tuning/lora-fine-tuning/README.md) | Training | [Guide](model_fine_tuning/lora-fine-tuning/README.md) | Efficiently fine-tune large language models using Low-Rank Adaptation | +| [Multi-Instance GPU](model_serving/mig_multi_instance_gpu/README.md) | Infrastructure | [Guide](model_serving/mig_multi_instance_gpu/README.md) | Partition H100 GPUs into multiple isolated instances for efficient resource sharing | +| [Model Storage](other/model_storage/README.md) | Storage | [Guide](other/model_storage/README.md) | Download and store models from HuggingFace to OCI Object Storage | +| [Multi-Node Inference](model_serving/multi-node-inference/README.md) | Inference | [Guide](model_serving/multi-node-inference/README.md) | Scale large language model inference across multiple GPU nodes | +| [Shared Node Pools](platform_features/shared_node_pools/README.md) | Infrastructure | [Guide](platform_features/shared_node_pools/README.md) | Create persistent node pools for efficient blueprint deployment | +| [Teams](platform_features/teams/README.md) | Management | [Guide](platform_features/teams/README.md) | Enforce resource quotas and fair sharing between teams using Kueue | +| [RDMA Node Pools](other/using_rdma_enabled_node_pools/README.md) | Infrastructure | [Guide](other/using_rdma_enabled_node_pools/README.md) | Enable high-performance inter-node communication using Remote Direct Memory Access | +| [Startup & Health 
Probes](platform_features/startup_liveness_readiness_probes/README.md) | Configuration | [Guide](platform_features/startup_liveness_readiness_probes/README.md) | Configure application health monitoring and startup validation | diff --git a/docs/sample_blueprints/workload_blueprints/lora-benchmarking/README.md b/docs/sample_blueprints/gpu_benchmarking/lora-benchmarking/README.md similarity index 100% rename from docs/sample_blueprints/workload_blueprints/lora-benchmarking/README.md rename to docs/sample_blueprints/gpu_benchmarking/lora-benchmarking/README.md diff --git a/docs/sample_blueprints/workload_blueprints/lora-benchmarking/mlcommons_lora_finetune_nvidia_sample_recipe.json b/docs/sample_blueprints/gpu_benchmarking/lora-benchmarking/mlcommons_lora_finetune_nvidia_sample_recipe.json similarity index 100% rename from docs/sample_blueprints/workload_blueprints/lora-benchmarking/mlcommons_lora_finetune_nvidia_sample_recipe.json rename to docs/sample_blueprints/gpu_benchmarking/lora-benchmarking/mlcommons_lora_finetune_nvidia_sample_recipe.json diff --git a/docs/sample_blueprints/workload_blueprints/gpu-health-check/README.md b/docs/sample_blueprints/gpu_health_check/gpu-health-check/README.md similarity index 99% rename from docs/sample_blueprints/workload_blueprints/gpu-health-check/README.md rename to docs/sample_blueprints/gpu_health_check/gpu-health-check/README.md index 43dc2d9..18881c5 100644 --- a/docs/sample_blueprints/workload_blueprints/gpu-health-check/README.md +++ b/docs/sample_blueprints/gpu_health_check/gpu-health-check/README.md @@ -1,4 +1,4 @@ -# Health Check +# GPU Health Check #### Comprehensive GPU health validation and diagnostics for production readiness diff --git a/docs/sample_blueprints/workload_blueprints/gpu-health-check/healthcheck_fp16_a10.json b/docs/sample_blueprints/gpu_health_check/gpu-health-check/healthcheck_fp16_a10.json similarity index 100% rename from docs/sample_blueprints/workload_blueprints/gpu-health-check/healthcheck_fp16_a10.json rename to docs/sample_blueprints/gpu_health_check/gpu-health-check/healthcheck_fp16_a10.json diff --git a/docs/sample_blueprints/workload_blueprints/gpu-health-check/healthcheck_fp16_h100.json b/docs/sample_blueprints/gpu_health_check/gpu-health-check/healthcheck_fp16_h100.json similarity index 100% rename from docs/sample_blueprints/workload_blueprints/gpu-health-check/healthcheck_fp16_h100.json rename to docs/sample_blueprints/gpu_health_check/gpu-health-check/healthcheck_fp16_h100.json diff --git a/docs/sample_blueprints/workload_blueprints/gpu-health-check/healthcheck_fp32_a10.json b/docs/sample_blueprints/gpu_health_check/gpu-health-check/healthcheck_fp32_a10.json similarity index 100% rename from docs/sample_blueprints/workload_blueprints/gpu-health-check/healthcheck_fp32_a10.json rename to docs/sample_blueprints/gpu_health_check/gpu-health-check/healthcheck_fp32_a10.json diff --git a/docs/sample_blueprints/workload_blueprints/lora-fine-tuning/README.md b/docs/sample_blueprints/model_fine_tuning/lora-fine-tuning/README.md similarity index 100% rename from docs/sample_blueprints/workload_blueprints/lora-fine-tuning/README.md rename to docs/sample_blueprints/model_fine_tuning/lora-fine-tuning/README.md diff --git a/docs/sample_blueprints/workload_blueprints/lora-fine-tuning/bucket_checkpoint_bucket_model_open_dataset.backend.json b/docs/sample_blueprints/model_fine_tuning/lora-fine-tuning/bucket_checkpoint_bucket_model_open_dataset.backend.json similarity index 100% rename from 
docs/sample_blueprints/workload_blueprints/lora-fine-tuning/bucket_checkpoint_bucket_model_open_dataset.backend.json rename to docs/sample_blueprints/model_fine_tuning/lora-fine-tuning/bucket_checkpoint_bucket_model_open_dataset.backend.json diff --git a/docs/sample_blueprints/workload_blueprints/lora-fine-tuning/bucket_model_open_dataset.backend.json b/docs/sample_blueprints/model_fine_tuning/lora-fine-tuning/bucket_model_open_dataset.backend.json similarity index 100% rename from docs/sample_blueprints/workload_blueprints/lora-fine-tuning/bucket_model_open_dataset.backend.json rename to docs/sample_blueprints/model_fine_tuning/lora-fine-tuning/bucket_model_open_dataset.backend.json diff --git a/docs/sample_blueprints/workload_blueprints/lora-fine-tuning/bucket_par_open_dataset.backend.json b/docs/sample_blueprints/model_fine_tuning/lora-fine-tuning/bucket_par_open_dataset.backend.json similarity index 100% rename from docs/sample_blueprints/workload_blueprints/lora-fine-tuning/bucket_par_open_dataset.backend.json rename to docs/sample_blueprints/model_fine_tuning/lora-fine-tuning/bucket_par_open_dataset.backend.json diff --git a/docs/sample_blueprints/workload_blueprints/lora-fine-tuning/closed_model_open_dataset_hf.backend.json b/docs/sample_blueprints/model_fine_tuning/lora-fine-tuning/closed_model_open_dataset_hf.backend.json similarity index 100% rename from docs/sample_blueprints/workload_blueprints/lora-fine-tuning/closed_model_open_dataset_hf.backend.json rename to docs/sample_blueprints/model_fine_tuning/lora-fine-tuning/closed_model_open_dataset_hf.backend.json diff --git a/docs/sample_blueprints/workload_blueprints/lora-fine-tuning/open_model_open_dataset_hf.backend.json b/docs/sample_blueprints/model_fine_tuning/lora-fine-tuning/open_model_open_dataset_hf.backend.json similarity index 100% rename from docs/sample_blueprints/workload_blueprints/lora-fine-tuning/open_model_open_dataset_hf.backend.json rename to docs/sample_blueprints/model_fine_tuning/lora-fine-tuning/open_model_open_dataset_hf.backend.json diff --git a/docs/sample_blueprints/platform_feature_blueprints/auto_scaling/README.md b/docs/sample_blueprints/model_serving/auto_scaling/README.md similarity index 99% rename from docs/sample_blueprints/platform_feature_blueprints/auto_scaling/README.md rename to docs/sample_blueprints/model_serving/auto_scaling/README.md index 23a089e..bb5c468 100644 --- a/docs/sample_blueprints/platform_feature_blueprints/auto_scaling/README.md +++ b/docs/sample_blueprints/model_serving/auto_scaling/README.md @@ -192,7 +192,7 @@ Pod auto-scaling allows a blueprint to scale within a single node, up to the num #### Additional Considerations: -Pod autoscaling can be paired with startup and liveness probes to verify that a blueprint is both ready to receive requests and continuing to function properly. For more information, visit [our startup and liveness probe doc](../startup_liveness_readiness_probes/README.md). +Pod autoscaling can be paired with startup and liveness probes to verify that a blueprint is both ready to receive requests and continuing to function properly. For more information, visit [our startup and liveness probe doc](../../platform_features/startup_liveness_readiness_probes/README.md). 
## Node + Pod Auto-Scaling (Scaling Beyond a Single Node) diff --git a/docs/sample_blueprints/platform_feature_blueprints/auto_scaling/autoscaling_blueprint.json b/docs/sample_blueprints/model_serving/auto_scaling/autoscaling_blueprint.json similarity index 100% rename from docs/sample_blueprints/platform_feature_blueprints/auto_scaling/autoscaling_blueprint.json rename to docs/sample_blueprints/model_serving/auto_scaling/autoscaling_blueprint.json diff --git a/docs/sample_blueprints/workload_blueprints/cpu-inference/README.md b/docs/sample_blueprints/model_serving/cpu-inference/README.md similarity index 100% rename from docs/sample_blueprints/workload_blueprints/cpu-inference/README.md rename to docs/sample_blueprints/model_serving/cpu-inference/README.md diff --git a/docs/sample_blueprints/workload_blueprints/cpu-inference/cpu-inference-gemma.json b/docs/sample_blueprints/model_serving/cpu-inference/cpu-inference-gemma.json similarity index 100% rename from docs/sample_blueprints/workload_blueprints/cpu-inference/cpu-inference-gemma.json rename to docs/sample_blueprints/model_serving/cpu-inference/cpu-inference-gemma.json diff --git a/docs/sample_blueprints/workload_blueprints/cpu-inference/cpu-inference-mistral-bm.json b/docs/sample_blueprints/model_serving/cpu-inference/cpu-inference-mistral-bm.json similarity index 100% rename from docs/sample_blueprints/workload_blueprints/cpu-inference/cpu-inference-mistral-bm.json rename to docs/sample_blueprints/model_serving/cpu-inference/cpu-inference-mistral-bm.json diff --git a/docs/sample_blueprints/workload_blueprints/cpu-inference/cpu-inference-mistral-vm.json b/docs/sample_blueprints/model_serving/cpu-inference/cpu-inference-mistral-vm.json similarity index 100% rename from docs/sample_blueprints/workload_blueprints/cpu-inference/cpu-inference-mistral-vm.json rename to docs/sample_blueprints/model_serving/cpu-inference/cpu-inference-mistral-vm.json diff --git a/docs/sample_blueprints/workload_blueprints/llm_inference_with_vllm/README.md b/docs/sample_blueprints/model_serving/llm_inference_with_vllm/README.md similarity index 100% rename from docs/sample_blueprints/workload_blueprints/llm_inference_with_vllm/README.md rename to docs/sample_blueprints/model_serving/llm_inference_with_vllm/README.md diff --git a/docs/sample_blueprints/workload_blueprints/llm_inference_with_vllm/vllm-closed-hf-model.json b/docs/sample_blueprints/model_serving/llm_inference_with_vllm/vllm-closed-hf-model.json similarity index 100% rename from docs/sample_blueprints/workload_blueprints/llm_inference_with_vllm/vllm-closed-hf-model.json rename to docs/sample_blueprints/model_serving/llm_inference_with_vllm/vllm-closed-hf-model.json diff --git a/docs/sample_blueprints/workload_blueprints/llm_inference_with_vllm/vllm-model-from-obj-storage.json b/docs/sample_blueprints/model_serving/llm_inference_with_vllm/vllm-model-from-obj-storage.json similarity index 100% rename from docs/sample_blueprints/workload_blueprints/llm_inference_with_vllm/vllm-model-from-obj-storage.json rename to docs/sample_blueprints/model_serving/llm_inference_with_vllm/vllm-model-from-obj-storage.json diff --git a/docs/sample_blueprints/workload_blueprints/llm_inference_with_vllm/vllm-open-hf-model-api-key-functionality.json b/docs/sample_blueprints/model_serving/llm_inference_with_vllm/vllm-open-hf-model-api-key-functionality.json similarity index 100% rename from docs/sample_blueprints/workload_blueprints/llm_inference_with_vllm/vllm-open-hf-model-api-key-functionality.json rename to 
docs/sample_blueprints/model_serving/llm_inference_with_vllm/vllm-open-hf-model-api-key-functionality.json diff --git a/docs/sample_blueprints/workload_blueprints/llm_inference_with_vllm/vllm-open-hf-model.json b/docs/sample_blueprints/model_serving/llm_inference_with_vllm/vllm-open-hf-model.json similarity index 100% rename from docs/sample_blueprints/workload_blueprints/llm_inference_with_vllm/vllm-open-hf-model.json rename to docs/sample_blueprints/model_serving/llm_inference_with_vllm/vllm-open-hf-model.json diff --git a/docs/sample_blueprints/platform_feature_blueprints/mig_multi_instance_gpu/README.md b/docs/sample_blueprints/model_serving/mig_multi_instance_gpu/README.md similarity index 99% rename from docs/sample_blueprints/platform_feature_blueprints/mig_multi_instance_gpu/README.md rename to docs/sample_blueprints/model_serving/mig_multi_instance_gpu/README.md index 3f417c4..11218d2 100644 --- a/docs/sample_blueprints/platform_feature_blueprints/mig_multi_instance_gpu/README.md +++ b/docs/sample_blueprints/model_serving/mig_multi_instance_gpu/README.md @@ -128,7 +128,7 @@ There are two ways to apply a mig configuration to a node pool. #### shared_node_pool: -Apart from the existing requirements for a shared node pool found [here](../shared_node_pools/README.md), the following are additional requirements / options for MIG: +Apart from the existing requirements for a shared node pool found [here](../../platform_features/shared_node_pools/README.md), the following are additional requirements / options for MIG: - `"shared_node_pool_mig_config"` - the mig congfiguration to apply to each node in the node pool. Possible values are in the [Mig Configurations](#mig-configurations). This will apply the node to each node in the pool, but if you want to update a specific node that can be done via the `update` mode described in the next section. - `"recipe_max_pods_per_node"`: [OPTIONAL: DEFAULT = 90] - by default, since MIG can slice up to 56 times for a full BM.GPU.H100.8, the default 31 pods by OKE is insufficient. As part of shared_node_pool deployment for MIG, this value is increased to 90 to fit all slice configurations + some buffer room. The maximum value is proportedly 110. It is not recommended to change this value, as it can not be modified after deployment of a pool. In order to change it, a node must be removed from the pool and re-added with the new value. 
diff --git a/docs/sample_blueprints/platform_feature_blueprints/mig_multi_instance_gpu/mig_enabled_shared_node_pool.json b/docs/sample_blueprints/model_serving/mig_multi_instance_gpu/mig_enabled_shared_node_pool.json similarity index 100% rename from docs/sample_blueprints/platform_feature_blueprints/mig_multi_instance_gpu/mig_enabled_shared_node_pool.json rename to docs/sample_blueprints/model_serving/mig_multi_instance_gpu/mig_enabled_shared_node_pool.json diff --git a/docs/sample_blueprints/platform_feature_blueprints/mig_multi_instance_gpu/mig_inference_multiple_replicas.json b/docs/sample_blueprints/model_serving/mig_multi_instance_gpu/mig_inference_multiple_replicas.json similarity index 100% rename from docs/sample_blueprints/platform_feature_blueprints/mig_multi_instance_gpu/mig_inference_multiple_replicas.json rename to docs/sample_blueprints/model_serving/mig_multi_instance_gpu/mig_inference_multiple_replicas.json diff --git a/docs/sample_blueprints/platform_feature_blueprints/mig_multi_instance_gpu/mig_inference_single_replica.json b/docs/sample_blueprints/model_serving/mig_multi_instance_gpu/mig_inference_single_replica.json similarity index 100% rename from docs/sample_blueprints/platform_feature_blueprints/mig_multi_instance_gpu/mig_inference_single_replica.json rename to docs/sample_blueprints/model_serving/mig_multi_instance_gpu/mig_inference_single_replica.json diff --git a/docs/sample_blueprints/platform_feature_blueprints/mig_multi_instance_gpu/mig_inference_single_replica_10gb.json b/docs/sample_blueprints/model_serving/mig_multi_instance_gpu/mig_inference_single_replica_10gb.json similarity index 100% rename from docs/sample_blueprints/platform_feature_blueprints/mig_multi_instance_gpu/mig_inference_single_replica_10gb.json rename to docs/sample_blueprints/model_serving/mig_multi_instance_gpu/mig_inference_single_replica_10gb.json diff --git a/docs/sample_blueprints/platform_feature_blueprints/mig_multi_instance_gpu/mig_slices.png b/docs/sample_blueprints/model_serving/mig_multi_instance_gpu/mig_slices.png similarity index 100% rename from docs/sample_blueprints/platform_feature_blueprints/mig_multi_instance_gpu/mig_slices.png rename to docs/sample_blueprints/model_serving/mig_multi_instance_gpu/mig_slices.png diff --git a/docs/sample_blueprints/platform_feature_blueprints/mig_multi_instance_gpu/mig_update_node_with_node_name.json b/docs/sample_blueprints/model_serving/mig_multi_instance_gpu/mig_update_node_with_node_name.json similarity index 100% rename from docs/sample_blueprints/platform_feature_blueprints/mig_multi_instance_gpu/mig_update_node_with_node_name.json rename to docs/sample_blueprints/model_serving/mig_multi_instance_gpu/mig_update_node_with_node_name.json diff --git a/docs/sample_blueprints/platform_feature_blueprints/mig_multi_instance_gpu/mig_update_shared_pool_with_node_pool_name.json b/docs/sample_blueprints/model_serving/mig_multi_instance_gpu/mig_update_shared_pool_with_node_pool_name.json similarity index 100% rename from docs/sample_blueprints/platform_feature_blueprints/mig_multi_instance_gpu/mig_update_shared_pool_with_node_pool_name.json rename to docs/sample_blueprints/model_serving/mig_multi_instance_gpu/mig_update_shared_pool_with_node_pool_name.json diff --git a/docs/sample_blueprints/workload_blueprints/multi-node-inference/README.md b/docs/sample_blueprints/model_serving/multi-node-inference/README.md similarity index 95% rename from docs/sample_blueprints/workload_blueprints/multi-node-inference/README.md rename to 
docs/sample_blueprints/model_serving/multi-node-inference/README.md index 4dad738..dcd36cb 100644 --- a/docs/sample_blueprints/workload_blueprints/multi-node-inference/README.md +++ b/docs/sample_blueprints/model_serving/multi-node-inference/README.md @@ -53,13 +53,13 @@ Use multi-node inference whenever you are trying to use a very large model that ## RDMA + Multinode Inference -Want to use RDMA with multinode inference? [See here for details](../../platform_feature_blueprints/using_rdma_enabled_node_pools/README.md) +Want to use RDMA with multinode inference? [See here for details](../../other/using_rdma_enabled_node_pools/README.md) ## How to use it? We are using [vLLM](https://docs.vllm.ai/en/latest/serving/distributed_serving.html) and [Ray](https://github.com/ray-project/ray) using the [LeaderWorkerSet (LWS)](https://github.com/kubernetes-sigs/lws) to manage state between multiple nodes. -In order to use multi-node inference in an OCI Blueprint, first deploy a shared node pool with blueprints using [this recipe](../../platform_feature_blueprints/shared_node_pools/shared_node_pool_A10_VM.json). +In order to use multi-node inference in an OCI Blueprint, first deploy a shared node pool with blueprints using [this recipe](../../platform_features/shared_node_pools/shared_node_pool_A10_VM.json). Then, use the following blueprint to deploy serving software: [LINK](multinode_inference_VM_A10.json) @@ -93,9 +93,9 @@ The following parameters are required: - `multinode_num_nodes_to_use_from_shared_pool` -> the total number of nodes (as an integer) you want to use to serve this model. This number must be less than the size of the shared node pool, and will only use schedulable nodes in the pool. -- [OPTIONAL] `"multinode_rdma_enabled_in_shared_pool": true` -> If you have provisioned RDMA enabled shared node pools in your cluster - enable RDMA communication between nodes. This will fail validation if RDMA is not supported for shape type, or node is missing appropriate labels described in [linked doc](../../platform_feature_blueprints/using_rdma_enabled_node_pools/README.md). +- [OPTIONAL] `"multinode_rdma_enabled_in_shared_pool": true` -> If you have provisioned RDMA enabled shared node pools in your cluster - enable RDMA communication between nodes. This will fail validation if RDMA is not supported for shape type, or node is missing appropriate labels described in [linked doc](../../other/using_rdma_enabled_node_pools/README.md). -- [OPTIONAL] `recipe_readiness_probe_params` -> Readiness probe to ensure that service is ready to serve requests. Parameter details found [here](../../platform_feature_blueprints/startup_liveness_readiness_probes/README.md). +- [OPTIONAL] `recipe_readiness_probe_params` -> Readiness probe to ensure that service is ready to serve requests. Parameter details found [here](../../platform_features/startup_liveness_readiness_probes/README.md). ## Requirements @@ -113,7 +113,7 @@ Follow these 6 simple steps to deploy your multi-node inference using OCI AI Blu 1. **Deploy your shared node pool** - Deploy a shared node pool containing at least 2 nodes for inference. Note: Existing shared node pools can be used! - - as a template, follow [this BM.A10](../../platform_feature_blueprints/shared_node_pools/shared_node_pool_A10_BM.json) or [this VM.A10](../../platform_feature_blueprints/shared_node_pools/shared_node_pool_A10_VM.json). 
+ - as a template, follow [this BM.A10](../../platform_features/shared_node_pools/shared_node_pool_A10_BM.json) or [this VM.A10](../../platform_features/shared_node_pools/shared_node_pool_A10_VM.json). 2. **Create Your Deployment Blueprint** - Create a JSON configuration (blueprint) that defines your RayCluster. Key parameters include: - `"recipe_mode": "service"` diff --git a/docs/sample_blueprints/workload_blueprints/multi-node-inference/multinode_inference_BM_A10.json b/docs/sample_blueprints/model_serving/multi-node-inference/multinode_inference_BM_A10.json similarity index 100% rename from docs/sample_blueprints/workload_blueprints/multi-node-inference/multinode_inference_BM_A10.json rename to docs/sample_blueprints/model_serving/multi-node-inference/multinode_inference_BM_A10.json diff --git a/docs/sample_blueprints/workload_blueprints/multi-node-inference/multinode_inference_VM_A10.json b/docs/sample_blueprints/model_serving/multi-node-inference/multinode_inference_VM_A10.json similarity index 100% rename from docs/sample_blueprints/workload_blueprints/multi-node-inference/multinode_inference_VM_A10.json rename to docs/sample_blueprints/model_serving/multi-node-inference/multinode_inference_VM_A10.json diff --git a/docs/sample_blueprints/platform_feature_blueprints/exisiting_cluster_installation/README.md b/docs/sample_blueprints/other/exisiting_cluster_installation/README.md similarity index 100% rename from docs/sample_blueprints/platform_feature_blueprints/exisiting_cluster_installation/README.md rename to docs/sample_blueprints/other/exisiting_cluster_installation/README.md diff --git a/docs/sample_blueprints/platform_feature_blueprints/exisiting_cluster_installation/add_node_to_control_plane.json b/docs/sample_blueprints/other/exisiting_cluster_installation/add_node_to_control_plane.json similarity index 100% rename from docs/sample_blueprints/platform_feature_blueprints/exisiting_cluster_installation/add_node_to_control_plane.json rename to docs/sample_blueprints/other/exisiting_cluster_installation/add_node_to_control_plane.json diff --git a/docs/sample_blueprints/workload_blueprints/llama-stack/README.md b/docs/sample_blueprints/other/llama-stack/README.md similarity index 98% rename from docs/sample_blueprints/workload_blueprints/llama-stack/README.md rename to docs/sample_blueprints/other/llama-stack/README.md index d2d6460..ab6336b 100644 --- a/docs/sample_blueprints/workload_blueprints/llama-stack/README.md +++ b/docs/sample_blueprints/other/llama-stack/README.md @@ -74,7 +74,7 @@ Llama Stack has many different use cases and are thoroughly detailed here, in th 1. How can I configure the vLLM pre-filled sample (e.g. I want to deploy a different model with vLLM; a custom model)? -- Any vLLM inference server and model that is compatible with vLLM will work with the Llama Stack implementation. Follow our [llm_inference_with_vllm blueprint](../llm_inference_with_vllm/README.md) for more details on setting up vLLM. +- Any vLLM inference server and model that is compatible with vLLM will work with the Llama Stack implementation. Follow our [llm_inference_with_vllm blueprint](../../model_serving/llm_inference_with_vllm/README.md) for more details on setting up vLLM. 2. Can I use a different inference engine than vLLM? 
diff --git a/docs/sample_blueprints/platform_feature_blueprints/deployment_groups/llama_stack_basic.json b/docs/sample_blueprints/other/llama-stack/llama_stack_basic.json similarity index 100% rename from docs/sample_blueprints/platform_feature_blueprints/deployment_groups/llama_stack_basic.json rename to docs/sample_blueprints/other/llama-stack/llama_stack_basic.json diff --git a/docs/sample_blueprints/platform_feature_blueprints/model_storage/README.md b/docs/sample_blueprints/other/model_storage/README.md similarity index 100% rename from docs/sample_blueprints/platform_feature_blueprints/model_storage/README.md rename to docs/sample_blueprints/other/model_storage/README.md diff --git a/docs/sample_blueprints/platform_feature_blueprints/model_storage/download_closed_hf_model_to_object_storage.json b/docs/sample_blueprints/other/model_storage/download_closed_hf_model_to_object_storage.json similarity index 100% rename from docs/sample_blueprints/platform_feature_blueprints/model_storage/download_closed_hf_model_to_object_storage.json rename to docs/sample_blueprints/other/model_storage/download_closed_hf_model_to_object_storage.json diff --git a/docs/sample_blueprints/platform_feature_blueprints/model_storage/download_open_hf_model_to_object_storage.json b/docs/sample_blueprints/other/model_storage/download_open_hf_model_to_object_storage.json similarity index 100% rename from docs/sample_blueprints/platform_feature_blueprints/model_storage/download_open_hf_model_to_object_storage.json rename to docs/sample_blueprints/other/model_storage/download_open_hf_model_to_object_storage.json diff --git a/docs/sample_blueprints/platform_feature_blueprints/using_rdma_enabled_node_pools/README.md b/docs/sample_blueprints/other/using_rdma_enabled_node_pools/README.md similarity index 99% rename from docs/sample_blueprints/platform_feature_blueprints/using_rdma_enabled_node_pools/README.md rename to docs/sample_blueprints/other/using_rdma_enabled_node_pools/README.md index 6dd6da5..5cd3a78 100644 --- a/docs/sample_blueprints/platform_feature_blueprints/using_rdma_enabled_node_pools/README.md +++ b/docs/sample_blueprints/other/using_rdma_enabled_node_pools/README.md @@ -88,7 +88,7 @@ One of the images in the table below must be imported into your tenancy in the c Once the image has been imported, it is now possible to deploy a shared node pool with RDMA connectivity with AI blueprints. -In addition to the parameters described in [the shared node pool doc](../shared_node_pools/README.md#without-selector), the following additional parameters are required: +In addition to the parameters described in [the shared node pool doc](../../platform_features/shared_node_pools/README.md#without-selector), the following additional parameters are required: - `"recipe_availability_domain": ""` -> full availability domain name where you have capacity for nodes. Examples: `"TrcQ:AP-MELBOURNE-1-AD-1"`, `"TrcQ:EU-FRANKFURT-1-AD-3"`. 
These can generally be found in the console via Hamburger (top left) -> Governance & Administration -> Tenancy Management -> Limits, Quotas and Usage diff --git a/docs/sample_blueprints/platform_feature_blueprints/using_rdma_enabled_node_pools/rdma_distributed_inference.json b/docs/sample_blueprints/other/using_rdma_enabled_node_pools/rdma_distributed_inference.json similarity index 100% rename from docs/sample_blueprints/platform_feature_blueprints/using_rdma_enabled_node_pools/rdma_distributed_inference.json rename to docs/sample_blueprints/other/using_rdma_enabled_node_pools/rdma_distributed_inference.json diff --git a/docs/sample_blueprints/platform_feature_blueprints/using_rdma_enabled_node_pools/rdma_shared_node_pool.json b/docs/sample_blueprints/other/using_rdma_enabled_node_pools/rdma_shared_node_pool.json similarity index 100% rename from docs/sample_blueprints/platform_feature_blueprints/using_rdma_enabled_node_pools/rdma_shared_node_pool.json rename to docs/sample_blueprints/other/using_rdma_enabled_node_pools/rdma_shared_node_pool.json diff --git a/docs/sample_blueprints/platform_feature_blueprints/using_rdma_enabled_node_pools/rdma_update_nodes.json b/docs/sample_blueprints/other/using_rdma_enabled_node_pools/rdma_update_nodes.json similarity index 100% rename from docs/sample_blueprints/platform_feature_blueprints/using_rdma_enabled_node_pools/rdma_update_nodes.json rename to docs/sample_blueprints/other/using_rdma_enabled_node_pools/rdma_update_nodes.json diff --git a/docs/whisper_transcription/README.md b/docs/sample_blueprints/other/whisper_transcription/README.md similarity index 100% rename from docs/whisper_transcription/README.md rename to docs/sample_blueprints/other/whisper_transcription/README.md diff --git a/docs/whisper_transcription/docs/Whisper_Architecture.pdf b/docs/sample_blueprints/other/whisper_transcription/docs/Whisper_Architecture.pdf similarity index 100% rename from docs/whisper_transcription/docs/Whisper_Architecture.pdf rename to docs/sample_blueprints/other/whisper_transcription/docs/Whisper_Architecture.pdf diff --git a/docs/whisper_transcription/examples/test1/test.wav b/docs/sample_blueprints/other/whisper_transcription/examples/test1/test.wav similarity index 100% rename from docs/whisper_transcription/examples/test1/test.wav rename to docs/sample_blueprints/other/whisper_transcription/examples/test1/test.wav diff --git a/docs/whisper_transcription/examples/test1/test_all_transcripts_20250601_201349.txt b/docs/sample_blueprints/other/whisper_transcription/examples/test1/test_all_transcripts_20250601_201349.txt similarity index 100% rename from docs/whisper_transcription/examples/test1/test_all_transcripts_20250601_201349.txt rename to docs/sample_blueprints/other/whisper_transcription/examples/test1/test_all_transcripts_20250601_201349.txt diff --git a/docs/whisper_transcription/examples/test1/transcription_log_20250601_201340.log b/docs/sample_blueprints/other/whisper_transcription/examples/test1/transcription_log_20250601_201340.log similarity index 100% rename from docs/whisper_transcription/examples/test1/transcription_log_20250601_201340.log rename to docs/sample_blueprints/other/whisper_transcription/examples/test1/transcription_log_20250601_201340.log diff --git a/docs/whisper_transcription/examples/test2/transcription_log_20250601_203611.log b/docs/sample_blueprints/other/whisper_transcription/examples/test2/transcription_log_20250601_203611.log similarity index 100% rename from 
docs/whisper_transcription/examples/test2/transcription_log_20250601_203611.log rename to docs/sample_blueprints/other/whisper_transcription/examples/test2/transcription_log_20250601_203611.log diff --git a/docs/whisper_transcription/examples/test2/video1591686795.mp4 b/docs/sample_blueprints/other/whisper_transcription/examples/test2/video1591686795.mp4 similarity index 100% rename from docs/whisper_transcription/examples/test2/video1591686795.mp4 rename to docs/sample_blueprints/other/whisper_transcription/examples/test2/video1591686795.mp4 diff --git a/docs/whisper_transcription/examples/test2/video1591686795_all_transcripts_20250601_203730.json b/docs/sample_blueprints/other/whisper_transcription/examples/test2/video1591686795_all_transcripts_20250601_203730.json similarity index 100% rename from docs/whisper_transcription/examples/test2/video1591686795_all_transcripts_20250601_203730.json rename to docs/sample_blueprints/other/whisper_transcription/examples/test2/video1591686795_all_transcripts_20250601_203730.json diff --git a/docs/whisper_transcription/examples/test2/video1591686795_all_transcripts_20250601_203730.txt b/docs/sample_blueprints/other/whisper_transcription/examples/test2/video1591686795_all_transcripts_20250601_203730.txt similarity index 100% rename from docs/whisper_transcription/examples/test2/video1591686795_all_transcripts_20250601_203730.txt rename to docs/sample_blueprints/other/whisper_transcription/examples/test2/video1591686795_all_transcripts_20250601_203730.txt diff --git a/docs/whisper_transcription/examples/test3/audio1788670787.m4a b/docs/sample_blueprints/other/whisper_transcription/examples/test3/audio1788670787.m4a similarity index 100% rename from docs/whisper_transcription/examples/test3/audio1788670787.m4a rename to docs/sample_blueprints/other/whisper_transcription/examples/test3/audio1788670787.m4a diff --git a/docs/whisper_transcription/examples/test3/audio1788670787_all_transcripts_20250601_191710.json b/docs/sample_blueprints/other/whisper_transcription/examples/test3/audio1788670787_all_transcripts_20250601_191710.json similarity index 100% rename from docs/whisper_transcription/examples/test3/audio1788670787_all_transcripts_20250601_191710.json rename to docs/sample_blueprints/other/whisper_transcription/examples/test3/audio1788670787_all_transcripts_20250601_191710.json diff --git a/docs/whisper_transcription/examples/test3/audio1788670787_all_transcripts_20250601_191710.txt b/docs/sample_blueprints/other/whisper_transcription/examples/test3/audio1788670787_all_transcripts_20250601_191710.txt similarity index 100% rename from docs/whisper_transcription/examples/test3/audio1788670787_all_transcripts_20250601_191710.txt rename to docs/sample_blueprints/other/whisper_transcription/examples/test3/audio1788670787_all_transcripts_20250601_191710.txt diff --git a/docs/whisper_transcription/examples/test3/transcription_log_20250601_191325.log b/docs/sample_blueprints/other/whisper_transcription/examples/test3/transcription_log_20250601_191325.log similarity index 100% rename from docs/whisper_transcription/examples/test3/transcription_log_20250601_191325.log rename to docs/sample_blueprints/other/whisper_transcription/examples/test3/transcription_log_20250601_191325.log diff --git a/docs/whisper_transcription/whisper-transcription-A10.json b/docs/sample_blueprints/other/whisper_transcription/whisper-transcription-A10.json similarity index 100% rename from docs/whisper_transcription/whisper-transcription-A10.json rename to 
docs/sample_blueprints/other/whisper_transcription/whisper-transcription-A10.json diff --git a/docs/whisper_transcription/whisper-transcription-A100.json b/docs/sample_blueprints/other/whisper_transcription/whisper-transcription-A100.json similarity index 100% rename from docs/whisper_transcription/whisper-transcription-A100.json rename to docs/sample_blueprints/other/whisper_transcription/whisper-transcription-A100.json diff --git a/docs/whisper_transcription/whisper-transcription-H100.json b/docs/sample_blueprints/other/whisper_transcription/whisper-transcription-H100.json similarity index 100% rename from docs/whisper_transcription/whisper-transcription-H100.json rename to docs/sample_blueprints/other/whisper_transcription/whisper-transcription-H100.json diff --git a/docs/sample_blueprints/platform_feature_blueprints/deployment_groups/README.md b/docs/sample_blueprints/platform_features/deployment_groups/README.md similarity index 100% rename from docs/sample_blueprints/platform_feature_blueprints/deployment_groups/README.md rename to docs/sample_blueprints/platform_features/deployment_groups/README.md diff --git a/docs/sample_blueprints/workload_blueprints/llama-stack/llama_stack_basic.json b/docs/sample_blueprints/platform_features/deployment_groups/llama_stack_basic.json similarity index 100% rename from docs/sample_blueprints/workload_blueprints/llama-stack/llama_stack_basic.json rename to docs/sample_blueprints/platform_features/deployment_groups/llama_stack_basic.json diff --git a/docs/sample_blueprints/platform_feature_blueprints/shared_node_pools/README.md b/docs/sample_blueprints/platform_features/shared_node_pools/README.md similarity index 97% rename from docs/sample_blueprints/platform_feature_blueprints/shared_node_pools/README.md rename to docs/sample_blueprints/platform_features/shared_node_pools/README.md index 4630cda..1a59758 100644 --- a/docs/sample_blueprints/platform_feature_blueprints/shared_node_pools/README.md +++ b/docs/sample_blueprints/platform_features/shared_node_pools/README.md @@ -23,7 +23,7 @@ Additional required fields: See [this recipe](./shared_node_pool_B200_BM.json) as an example for these parameters. -[This document section](../using_rdma_enabled_node_pools/README.md#import-a-custom-image) describes now to import a custom image and provides links to import custom images for various shapes. +[This document section](../../other/using_rdma_enabled_node_pools/README.md) describes how to import a custom image and provides links to import custom images for various shapes.
## Pre-Filled Samples diff --git a/docs/sample_blueprints/platform_feature_blueprints/shared_node_pools/shared_node_pool_A10_BM.json b/docs/sample_blueprints/platform_features/shared_node_pools/shared_node_pool_A10_BM.json similarity index 100% rename from docs/sample_blueprints/platform_feature_blueprints/shared_node_pools/shared_node_pool_A10_BM.json rename to docs/sample_blueprints/platform_features/shared_node_pools/shared_node_pool_A10_BM.json diff --git a/docs/sample_blueprints/platform_feature_blueprints/shared_node_pools/shared_node_pool_A10_VM.json b/docs/sample_blueprints/platform_features/shared_node_pools/shared_node_pool_A10_VM.json similarity index 100% rename from docs/sample_blueprints/platform_feature_blueprints/shared_node_pools/shared_node_pool_A10_VM.json rename to docs/sample_blueprints/platform_features/shared_node_pools/shared_node_pool_A10_VM.json diff --git a/docs/sample_blueprints/platform_feature_blueprints/shared_node_pools/shared_node_pool_B200_BM.json b/docs/sample_blueprints/platform_features/shared_node_pools/shared_node_pool_B200_BM.json similarity index 100% rename from docs/sample_blueprints/platform_feature_blueprints/shared_node_pools/shared_node_pool_B200_BM.json rename to docs/sample_blueprints/platform_features/shared_node_pools/shared_node_pool_B200_BM.json diff --git a/docs/sample_blueprints/platform_feature_blueprints/shared_node_pools/vllm_inference_sample_shared_pool_blueprint.json b/docs/sample_blueprints/platform_features/shared_node_pools/vllm_inference_sample_shared_pool_blueprint.json similarity index 100% rename from docs/sample_blueprints/platform_feature_blueprints/shared_node_pools/vllm_inference_sample_shared_pool_blueprint.json rename to docs/sample_blueprints/platform_features/shared_node_pools/vllm_inference_sample_shared_pool_blueprint.json diff --git a/docs/sample_blueprints/platform_feature_blueprints/startup_liveness_readiness_probes/README.md b/docs/sample_blueprints/platform_features/startup_liveness_readiness_probes/README.md similarity index 100% rename from docs/sample_blueprints/platform_feature_blueprints/startup_liveness_readiness_probes/README.md rename to docs/sample_blueprints/platform_features/startup_liveness_readiness_probes/README.md diff --git a/docs/sample_blueprints/platform_feature_blueprints/startup_liveness_readiness_probes/autoscale_with_fss.json b/docs/sample_blueprints/platform_features/startup_liveness_readiness_probes/autoscale_with_fss.json similarity index 100% rename from docs/sample_blueprints/platform_feature_blueprints/startup_liveness_readiness_probes/autoscale_with_fss.json rename to docs/sample_blueprints/platform_features/startup_liveness_readiness_probes/autoscale_with_fss.json diff --git a/docs/sample_blueprints/platform_feature_blueprints/teams/README.md b/docs/sample_blueprints/platform_features/teams/README.md similarity index 100% rename from docs/sample_blueprints/platform_feature_blueprints/teams/README.md rename to docs/sample_blueprints/platform_features/teams/README.md diff --git a/docs/sample_blueprints/platform_feature_blueprints/teams/create_job_with_team.json b/docs/sample_blueprints/platform_features/teams/create_job_with_team.json similarity index 100% rename from docs/sample_blueprints/platform_feature_blueprints/teams/create_job_with_team.json rename to docs/sample_blueprints/platform_features/teams/create_job_with_team.json diff --git a/docs/sample_blueprints/platform_feature_blueprints/teams/create_team.json b/docs/sample_blueprints/platform_features/teams/create_team.json 
similarity index 100% rename from docs/sample_blueprints/platform_feature_blueprints/teams/create_team.json rename to docs/sample_blueprints/platform_features/teams/create_team.json From acc850f915e608617f0b1b5ebf31aa7791e2c36a Mon Sep 17 00:00:00 2001 From: grantneumanoracle Date: Fri, 18 Jul 2025 16:38:47 -0700 Subject: [PATCH 02/13] fix whisper transcription readme and remove unneeded file --- consolidated_bluperints_2.json | 1 - .../other/whisper_transcription/README.md | 88 +++++++++++-------- 2 files changed, 50 insertions(+), 39 deletions(-) delete mode 100644 consolidated_bluperints_2.json diff --git a/consolidated_bluperints_2.json b/consolidated_bluperints_2.json deleted file mode 100644 index 251b764..0000000 --- a/consolidated_bluperints_2.json +++ /dev/null @@ -1 +0,0 @@ -[{"blueprint_type": "oci_blueprint", "blueprint_category": "workload_blueprint", "blueprint_title": "Fine-Tuning Benchmarking", "blueprint_short_description": "Fine-tune quantized Llama-2-70B model using MLCommons methodology for infrastructure benchmarking", "blueprint_long_description": "The fine-tuning benchmarking blueprint streamlines infrastructure benchmarking for fine-tuning using the MLCommons methodology. It fine-tunes a quantized Llama-2-70B model and a standard dataset.\n\nOnce complete, benchmarking results, such as training time and resource utilization, are available in MLFlow and Grafana for easy tracking. This blueprint enables data-driven infrastructure decisions for your fine-tuning jobs.", "pre_filled_samples": [{"pre_filled_sample_name": "LoRA fine-tuning of quantitized Llama-2-70B model on A100 node using MLCommons methodology", "recipe_id": "mlcommons_lora_finetune_nvidia", "deployment_name": "MLCommons Finetune LORA/PEFT", "recipe_mode": "job", "recipe_node_shape": "BM.GPU.A100.8", "recipe_use_shared_node_pool": false, "recipe_nvidia_gpu_count": 8, "recipe_ephemeral_storage_size": 50, "recipe_replica_count": 1, "recipe_node_pool_size": 1, "recipe_node_boot_volume_size_in_gbs": 200, "recipe_shared_memory_volume_size_limit_in_mb": 100, "input_object_storage": [{"bucket_name": "corrino_mlcommons_llama2_70b_qkv", "mount_location": "/models", "volume_size_in_gbs": 500}, {"bucket_name": "corrino_ml_commons_scrolls_dataset", "mount_location": "/dataset", "volume_size_in_gbs": 100}], "output_object_storage": [{"bucket_name": "corrino_ml_commons_output", "mount_location": "/mlcommons_output", "volume_size_in_gbs": 200}], "recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:corrino-recipe-mlcommons", "recipe_container_env": [{"key": "model_name", "value": "regisss/llama2-70b-fused-qkv-mlperf"}, {"key": "Model_Path", "value": "/models"}, {"key": "Dataset_Path", "value": "/dataset"}, {"key": "Lora_R", "value": "16"}, {"key": "Lora_Alpha", "value": "32"}, {"key": "Lora_Dropout", "value": "0.1"}, {"key": "Max_Seq_Len", "value": "8192"}, {"key": "bf16", "value": "true"}, {"key": "Logging_Steps", "value": "24"}, {"key": "Eval_Steps", "value": "48"}, {"key": "Per_Device_Train_Batch_Size", "value": "1"}, {"key": "Gradient_Accumulation_Steps", "value": "1"}, {"key": "Lr_Scheduler_Type", "value": "cosine"}, {"key": "Learning_Rate", "value": "0.0004"}, {"key": "Weight_Decay", "value": "0.0001"}, {"key": "Warmup_Ratio", "value": "0"}, {"key": "Max_Grad_Norm", "value": "0.3"}, {"key": "Use_Gradient_Checkpointing", "value": "true"}, {"key": "Target_Eval_Loss", "value": "0.925"}, {"key": "Use_Peft_Lora", "value": "true"}, {"key": "Max_Steps", "value": "1024"}, {"key": "Use_Flash_Attn", 
"value": "true"}, {"key": "Seed", "value": "1234"}, {"key": "Lora_Target_Modules", "value": "qkv_proj,o_proj"}, {"key": "Mlflow_Exp_Name", "value": "oci_ai_blueprints_nvidia_recipe"}, {"key": "Output_Dir", "value": "/mlcommons_output"}]}]}, {"blueprint_type": "oci_blueprint", "blueprint_category": "workload_blueprint", "blueprint_title": "GPU Health Check", "blueprint_short_description": "Comprehensive GPU health validation and diagnostics for production readiness", "blueprint_long_description": "This repository offers a robust, pre-check recipe for thorough GPU health validation prior to deploying production or research workloads. Designed to operate seamlessly across both single-node and multi-node environments, this diagnostic toolset enables you to verify that your GPU infrastructure is primed for high-demand experiments. By systematically assessing key performance metrics—such as thermal behavior, power stability, and overall hardware reliability—you can proactively detect and address issues like thermal throttling, power irregularities, and GPU instability. This early-warning system minimizes the risk of unexpected downtime and performance degradation, ensuring that your system consistently operates at peak efficiency and reliability during critical computational tasks.", "pre_filled_samples": [{"pre_filled_sample_name": "2 A10 GPUs with dtype 16", "recipe_id": "healthcheck", "recipe_mode": "job", "deployment_name": "healthcheck_fp16_a10", "recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:healthcheck_v0.3", "recipe_node_shape": "VM.GPU.A10.2", "output_object_storage": [{"bucket_name": "healthcheck2", "mount_location": "/healthcheck_results", "volume_size_in_gbs": 20}], "recipe_container_command_args": ["--dtype", "float16", "--output_dir", "/healthcheck_results", "--expected_gpus", "A10:2,A100:0,H100:0"], "recipe_replica_count": 1, "recipe_nvidia_gpu_count": 2, "recipe_node_pool_size": 1, "recipe_node_boot_volume_size_in_gbs": 200, "recipe_ephemeral_storage_size": 100, "recipe_shared_memory_volume_size_limit_in_mb": 1000}, {"pre_filled_sample_name": "2 A10 GPUs with dtype 32", "recipe_id": "healthcheck", "recipe_mode": "job", "deployment_name": "healthcheck_fp32_a10", "recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:healthcheck_v0.3", "recipe_node_shape": "VM.GPU.A10.2", "output_object_storage": [{"bucket_name": "healthcheck2", "mount_location": "/healthcheck_results", "volume_size_in_gbs": 20}], "recipe_container_command_args": ["--dtype", "float32", "--output_dir", "/healthcheck_results", "--expected_gpus", "A10:2,A100:0,H100:0"], "recipe_replica_count": 1, "recipe_nvidia_gpu_count": 2, "recipe_node_pool_size": 1, "recipe_node_boot_volume_size_in_gbs": 200, "recipe_ephemeral_storage_size": 100, "recipe_shared_memory_volume_size_limit_in_mb": 1000}, {"pre_filled_sample_name": "8 H100 GPUs with dtype 16", "recipe_id": "healthcheck", "recipe_mode": "job", "deployment_name": "healthcheck_fp16_h100", "recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:healthcheck_v0.3", "recipe_node_shape": "BM.GPU.H100.8", "output_object_storage": [{"bucket_name": "healthcheck2", "mount_location": "/healthcheck_results", "volume_size_in_gbs": 20}], "recipe_container_command_args": ["--dtype", "float16", "--output_dir", "/healthcheck_results", "--expected_gpus", "A10:0,A100:0,H100:8"], "recipe_replica_count": 1, "recipe_nvidia_gpu_count": 8, "recipe_node_pool_size": 1, "recipe_node_boot_volume_size_in_gbs": 200, 
"recipe_ephemeral_storage_size": 100, "recipe_shared_memory_volume_size_limit_in_mb": 1000}]}, {"blueprint_type": "oci_blueprint", "blueprint_category": "workload_blueprint", "blueprint_title": "LoRA Fine-Tuning", "blueprint_short_description": "Efficiently fine-tune large language models using Low-Rank Adaptation", "blueprint_long_description": "This blueprint enables efficient model tuning using Low-Rank Adaptation (LoRA), a highly efficient method of LLM tuning. You can fine-tune a custom LLM or most open-source LLMs from Hugging Face. You can also use a custom dataset or any publicly available dataset from Hugging Face. Once the job is complete, results such as training metrics and logged in MLFlow for analysis. The fine-tuned model is then stored in an object storage bucket, ready for deployment.", "pre_filled_samples": [{"pre_filled_sample_name": "Fine-Tune NousResearch/Meta-Llama-3.1-8B from Object Storage with Dataset from Hugging Face and Checkpoints saved in Object Storage (A10 VM)", "recipe_id": "lora_finetune_nvidia", "deployment_name": "dk_with_checkpoint", "recipe_mode": "job", "recipe_node_shape": "VM.GPU.A10.2", "recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:finetune_lora_dev", "recipe_nvidia_gpu_count": 2, "recipe_ephemeral_storage_size": 300, "recipe_replica_count": 1, "recipe_node_pool_size": 1, "recipe_node_boot_volume_size_in_gbs": 500, "recipe_shared_memory_volume_size_limit_in_mb": 100, "recipe_container_env": [{"key": "Mlflow_Endpoint", "value": "http://mlflow.cluster-tools.svc.cluster.local:5000"}, {"key": "Mlflow_Exp_Name", "value": "oci_ai_blueprints_nvidia_recipe"}, {"key": "Mlflow_Run_Name", "value": "Meta-Llama-3.1-8B-local-quotes-checkpoint"}, {"key": "Hf_Token", "value": "None"}, {"key": "Download_Dataset_From_Hf", "value": "true"}, {"key": "Dataset_Name", "value": "Abirate/english_quotes"}, {"key": "Dataset_Sub_Name", "value": "None"}, {"key": "Dataset_Column_To_Use", "value": "quote"}, {"key": "Dataset_Path", "value": "/workspace/datasets"}, {"key": "Download_Model_From_Hf", "value": "false"}, {"key": "Model_Name", "value": "NousResearch/Meta-Llama-3.1-8B"}, {"key": "Model_Path", "value": "/models/NousResearch/Meta-Llama-3.1-8B"}, {"key": "Max_Model_Length", "value": "4096"}, {"key": "Resume_From_Checkpoint", "value": "true"}, {"key": "Checkpoint_Path", "value": "/checkpoint/Bucket-Llama-3.1-8B-english_quotes/checkpoint-1400"}, {"key": "Lora_R", "value": "8"}, {"key": "Lora_Alpha", "value": "16"}, {"key": "Lora_Dropout", "value": "0.1"}, {"key": "Lora_Target_Modules", "value": "q_proj,up_proj,o_proj,k_proj,down_proj,gate_proj,v_proj"}, {"key": "Bias", "value": "none"}, {"key": "Task_Type", "value": "CAUSAL_LM"}, {"key": "Per_Device_Train_Batch_Size", "value": "1"}, {"key": "Gradient_Accumulation_Steps", "value": "1"}, {"key": "Warmup_Steps", "value": "2"}, {"key": "Save_Steps", "value": "100"}, {"key": "Learning_Rate", "value": "0.0002"}, {"key": "Fp16", "value": "true"}, {"key": "Logging_Steps", "value": "1"}, {"key": "Output_Dir", "value": "/tunedmodels/Bucket-Llama-3.1-8B-english_quotes"}, {"key": "Optim", "value": "paged_adamw_8bit"}, {"key": "Number_of_Training_Epochs", "value": "2"}, {"key": "Require_Persistent_Output_Dir", "value": "true"}], "input_object_storage": [{"bucket_name": "corrino_hf_oss_models", "mount_location": "/models", "volume_size_in_gbs": 500}, {"bucket_name": "corrino_tuned_hf_oss_models", "mount_location": "/checkpoint", "volume_size_in_gbs": 500}], "output_object_storage": [{"bucket_name": 
"corrino_tuned_hf_oss_models", "mount_location": "/tunedmodels", "volume_size_in_gbs": 500}]}, {"pre_filled_sample_name": "Fine-Tune NousResearch/Meta-Llama-3.1-8B from Object Storage with Dataset from Hugging Face on A10 VM", "recipe_id": "lora_finetune_nvidia", "deployment_name": "dk_bucket_model_open_dataset", "recipe_mode": "job", "recipe_node_shape": "VM.GPU.A10.2", "recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:finetune_lora_dev", "recipe_nvidia_gpu_count": 2, "recipe_ephemeral_storage_size": 300, "recipe_replica_count": 1, "recipe_node_pool_size": 1, "recipe_node_boot_volume_size_in_gbs": 500, "recipe_shared_memory_volume_size_limit_in_mb": 100, "recipe_container_env": [{"key": "Mlflow_Endpoint", "value": "http://mlflow.cluster-tools.svc.cluster.local:5000"}, {"key": "Mlflow_Exp_Name", "value": "oci_ai_blueprints_nvidia_recipe"}, {"key": "Mlflow_Run_Name", "value": "Meta-Llama-3.1-8B-local-quotes"}, {"key": "Hf_Token", "value": "None"}, {"key": "Download_Dataset_From_Hf", "value": "true"}, {"key": "Dataset_Name", "value": "Abirate/english_quotes"}, {"key": "Dataset_Sub_Name", "value": "None"}, {"key": "Dataset_Column_To_Use", "value": "None"}, {"key": "Dataset_Path", "value": "/workspace/datasets"}, {"key": "Download_Model_From_Hf", "value": "false"}, {"key": "Model_Name", "value": "NousResearch/Meta-Llama-3.1-8B"}, {"key": "Model_Path", "value": "/models/NousResearch/Meta-Llama-3.1-8B"}, {"key": "Max_Model_Length", "value": "8192"}, {"key": "Resume_From_Checkpoint", "value": "false"}, {"key": "Checkpoint_Path", "value": "/checkpoint"}, {"key": "Lora_R", "value": "8"}, {"key": "Lora_Alpha", "value": "32"}, {"key": "Lora_Dropout", "value": "0.1"}, {"key": "Lora_Target_Modules", "value": "q_proj,up_proj,o_proj,k_proj,down_proj,gate_proj,v_proj"}, {"key": "Bias", "value": "none"}, {"key": "Task_Type", "value": "CAUSAL_LM"}, {"key": "Per_Device_Train_Batch_Size", "value": "1"}, {"key": "Gradient_Accumulation_Steps", "value": "1"}, {"key": "Warmup_Steps", "value": "2"}, {"key": "Save_Steps", "value": "100"}, {"key": "Learning_Rate", "value": "0.0002"}, {"key": "Fp16", "value": "true"}, {"key": "Logging_Steps", "value": "1"}, {"key": "Output_Dir", "value": "/tunedmodels/Bucket-Llama-3.1-8B-english_quotes"}, {"key": "Optim", "value": "paged_adamw_8bit"}, {"key": "Number_of_Training_Epochs", "value": "2"}, {"key": "Require_Persistent_Output_Dir", "value": "true"}], "input_object_storage": [{"bucket_name": "corrino_hf_oss_models", "mount_location": "/models", "volume_size_in_gbs": 500}], "output_object_storage": [{"bucket_name": "corrino_tuned_hf_oss_models", "mount_location": "/tunedmodels", "volume_size_in_gbs": 500}]}, {"pre_filled_sample_name": "Fine-Tune NousResearch/Meta-Llama-3.1-8B from Object Storage (PAR link) with Dataset from Hugging Face on A10 VM", "recipe_id": "lora_finetune_nvidia", "deployment_name": "dk_bucket_model_open_dataset", "recipe_mode": "job", "recipe_node_shape": "VM.GPU.A10.2", "recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:finetune_lora_dev", "recipe_nvidia_gpu_count": 2, "recipe_ephemeral_storage_size": 300, "recipe_replica_count": 1, "recipe_node_pool_size": 1, "recipe_node_boot_volume_size_in_gbs": 500, "recipe_shared_memory_volume_size_limit_in_mb": 100, "recipe_container_env": [{"key": "Mlflow_Endpoint", "value": "http://mlflow.cluster-tools.svc.cluster.local:5000"}, {"key": "Mlflow_Exp_Name", "value": "oci_ai_blueprints_nvidia_recipe"}, {"key": "Mlflow_Run_Name", "value": "Meta-Llama-3.1-8B-local-quotes"}, 
{"key": "Hf_Token", "value": "None"}, {"key": "Download_Dataset_From_Hf", "value": "true"}, {"key": "Dataset_Name", "value": "Abirate/english_quotes"}, {"key": "Dataset_Sub_Name", "value": "None"}, {"key": "Dataset_Column_To_Use", "value": "None"}, {"key": "Dataset_Path", "value": "/workspace/datasets"}, {"key": "Download_Model_From_Hf", "value": "false"}, {"key": "Model_Name", "value": "NousResearch/Meta-Llama-3.1-8B"}, {"key": "Model_Path", "value": "/models/NousResearch/Meta-Llama-3.1-8B"}, {"key": "Max_Model_Length", "value": "8192"}, {"key": "Resume_From_Checkpoint", "value": "false"}, {"key": "Checkpoint_Path", "value": "/checkpoint"}, {"key": "Lora_R", "value": "8"}, {"key": "Lora_Alpha", "value": "32"}, {"key": "Lora_Dropout", "value": "0.1"}, {"key": "Lora_Target_Modules", "value": "q_proj,up_proj,o_proj,k_proj,down_proj,gate_proj,v_proj"}, {"key": "Bias", "value": "none"}, {"key": "Task_Type", "value": "CAUSAL_LM"}, {"key": "Per_Device_Train_Batch_Size", "value": "1"}, {"key": "Gradient_Accumulation_Steps", "value": "1"}, {"key": "Warmup_Steps", "value": "2"}, {"key": "Save_Steps", "value": "100"}, {"key": "Learning_Rate", "value": "0.0002"}, {"key": "Fp16", "value": "true"}, {"key": "Logging_Steps", "value": "1"}, {"key": "Output_Dir", "value": "/tunedmodels/Bucket-Llama-3.1-8B-english_quotes"}, {"key": "Optim", "value": "paged_adamw_8bit"}, {"key": "Number_of_Training_Epochs", "value": "2"}, {"key": "Require_Persistent_Output_Dir", "value": "true"}], "input_object_storage": [{"par": "https://objectstorage.us-phoenix-1.oraclecloud.com/p/iv-8F3oSRJ8nsbVaq9ev9kjfkZ3zXItSOCSDWKfRa7zT3aPmNf4MijL_4nw_hvvY/n/iduyx1qnmway/b/corrino_hf_oss_models/o/", "mount_location": "/models", "volume_size_in_gbs": 500, "include": ["NousResearch/Meta-Llama-3.1-8B"]}], "output_object_storage": [{"bucket_name": "corrino_tuned_hf_oss_models", "mount_location": "/tunedmodels", "volume_size_in_gbs": 500}]}, {"pre_filled_sample_name": "Fine-Tune meta-llama/Llama-3.2-1B-Instruct (Closed Model) from Hugging Face with Dataset from Hugging Face on A10 VM", "recipe_id": "lora_finetune_nvidia", "deployment_name": "dk_closed_model_open_dataset", "recipe_mode": "job", "recipe_node_shape": "VM.GPU.A10.2", "recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:finetune_lora_dev", "recipe_nvidia_gpu_count": 2, "recipe_ephemeral_storage_size": 300, "recipe_replica_count": 1, "recipe_node_pool_size": 1, "recipe_node_boot_volume_size_in_gbs": 500, "recipe_shared_memory_volume_size_limit_in_mb": 100, "recipe_container_env": [{"key": "Mlflow_Endpoint", "value": "http://mlflow.cluster-tools.svc.cluster.local:5000"}, {"key": "Mlflow_Exp_Name", "value": "oci_ai_blueprints_nvidia_recipe"}, {"key": "Mlflow_Run_Name", "value": "llama-3.2-1B-Instruct-scrolls-gov_report"}, {"key": "Hf_Token", "value": ""}, {"key": "Download_Dataset_From_Hf", "value": "true"}, {"key": "Dataset_Name", "value": "tau/scrolls"}, {"key": "Dataset_Sub_Name", "value": "gov_report"}, {"key": "Dataset_Column_To_Use", "value": "None"}, {"key": "Dataset_Path", "value": "/workspace/datasets"}, {"key": "Download_Model_From_Hf", "value": "true"}, {"key": "Model_Name", "value": "meta-llama/Llama-3.2-1B-Instruct"}, {"key": "Model_Path", "value": "/workspace/models"}, {"key": "Max_Model_Length", "value": "8192"}, {"key": "Resume_From_Checkpoint", "value": "false"}, {"key": "Checkpoint_Path", "value": "/checkpoint"}, {"key": "Lora_R", "value": "8"}, {"key": "Lora_Alpha", "value": "32"}, {"key": "Lora_Dropout", "value": "0.1"}, {"key": 
"Lora_Target_Modules", "value": "q_proj,up_proj,o_proj,k_proj,down_proj,gate_proj,v_proj"}, {"key": "Bias", "value": "none"}, {"key": "Task_Type", "value": "CAUSAL_LM"}, {"key": "Per_Device_Train_Batch_Size", "value": "1"}, {"key": "Gradient_Accumulation_Steps", "value": "1"}, {"key": "Warmup_Steps", "value": "2"}, {"key": "Save_Steps", "value": "100"}, {"key": "Learning_Rate", "value": "0.0002"}, {"key": "Fp16", "value": "true"}, {"key": "Logging_Steps", "value": "1"}, {"key": "Output_Dir", "value": "/tunedmodels/Llama-3.1-8B-english_quotes"}, {"key": "Optim", "value": "paged_adamw_8bit"}, {"key": "Number_of_Training_Epochs", "value": "2"}, {"key": "Require_Persistent_Output_Dir", "value": "true"}], "output_object_storage": [{"bucket_name": "corrino_tuned_hf_oss_models", "mount_location": "/tunedmodels", "volume_size_in_gbs": 500}]}, {"pre_filled_sample_name": "Fine-Tune NousResearch/Meta-Llama-3.1-8B (Open Model) from Hugging Face with Dataset from Hugging Face on A10 VM", "recipe_id": "lora_finetune_nvidia", "deployment_name": "dk_open_model_open_dataset", "recipe_mode": "job", "recipe_node_shape": "VM.GPU.A10.2", "recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:finetune_lora_dev", "recipe_nvidia_gpu_count": 2, "recipe_ephemeral_storage_size": 300, "recipe_replica_count": 1, "recipe_node_pool_size": 1, "recipe_node_boot_volume_size_in_gbs": 500, "recipe_shared_memory_volume_size_limit_in_mb": 100, "recipe_container_env": [{"key": "Mlflow_Endpoint", "value": "http://mlflow.cluster-tools.svc.cluster.local:5000"}, {"key": "Mlflow_Exp_Name", "value": "oci_ai_blueprints_nvidia_recipe"}, {"key": "Mlflow_Run_Name", "value": "oci_ai_blueprints_run"}, {"key": "Hf_Token", "value": "None"}, {"key": "Download_Dataset_From_Hf", "value": "true"}, {"key": "Dataset_Name", "value": "Abirate/english_quotes"}, {"key": "Dataset_Sub_Name", "value": "None"}, {"key": "Dataset_Column_To_Use", "value": "None"}, {"key": "Dataset_Path", "value": "/workspace/datasets"}, {"key": "Download_Model_From_Hf", "value": "true"}, {"key": "Model_Name", "value": "NousResearch/Meta-Llama-3.1-8B"}, {"key": "Model_Path", "value": "/workspace/models"}, {"key": "Max_Model_Length", "value": "8192"}, {"key": "Resume_From_Checkpoint", "value": "false"}, {"key": "Checkpoint_Path", "value": "/checkpoint"}, {"key": "Lora_R", "value": "8"}, {"key": "Lora_Alpha", "value": "32"}, {"key": "Lora_Dropout", "value": "0.1"}, {"key": "Lora_Target_Modules", "value": "q_proj,up_proj,o_proj,k_proj,down_proj,gate_proj,v_proj"}, {"key": "Bias", "value": "none"}, {"key": "Task_Type", "value": "CAUSAL_LM"}, {"key": "Per_Device_Train_Batch_Size", "value": "1"}, {"key": "Gradient_Accumulation_Steps", "value": "1"}, {"key": "Warmup_Steps", "value": "2"}, {"key": "Save_Steps", "value": "100"}, {"key": "Learning_Rate", "value": "0.0002"}, {"key": "Fp16", "value": "true"}, {"key": "Logging_Steps", "value": "1"}, {"key": "Output_Dir", "value": "/tunedmodels/Llama-3.1-8B-english_quotes"}, {"key": "Optim", "value": "paged_adamw_8bit"}, {"key": "Number_of_Training_Epochs", "value": "2"}, {"key": "Require_Persistent_Output_Dir", "value": "true"}], "output_object_storage": [{"bucket_name": "corrino_tuned_hf_oss_models", "mount_location": "/tunedmodels", "volume_size_in_gbs": 500}]}]}, {"blueprint_type": "oci_blueprint", "blueprint_category": "workload_blueprint", "blueprint_title": "Autoscaling", "blueprint_short_description": "Scale inference workloads based on traffic load", "blueprint_long_description": "OCI AI Blueprints supports 
automatic scaling (autoscaling) of inference workloads to handle varying traffic loads efficiently. This means that when demand increases, OCI AI Blueprints can spin up more pods (containers running your inference jobs) and, if needed, provision additional GPU nodes. When demand decreases, it scales back down to save resources and cost.", "pre_filled_samples": [{"pre_filled_sample_name": "vLLM Inference with Automatic Scaling on VM.GPU.A10.2", "recipe_id": "llm_inference_nvidia", "recipe_mode": "service", "deployment_name": "autoscale_vllm_example", "recipe_image_uri": "docker.io/vllm/vllm-openai:v0.9.1", "recipe_node_shape": "VM.GPU.A10.2", "input_object_storage": [{"par": "https://objectstorage.us-ashburn-1.oraclecloud.com/p/qFv5XzocpOoEXjlxL7Q3ZrrCFkx9GkA1fpg97zmnaNEX9WB_WMXLz2rykGuU1hqQ/n/iduyx1qnmway/b/metallama321binstruct/o/", "mount_location": "/models", "volume_size_in_gbs": 100}], "recipe_container_env": [{"key": "tensor_parallel_size", "value": "1"}, {"key": "model_name", "value": ""}, {"key": "Model_Path", "value": "/models"}], "recipe_prometheus_enabled": true, "recipe_replica_count": 1, "recipe_container_port": "8000", "recipe_nvidia_gpu_count": 1, "recipe_container_command_args": ["--model", "$(Model_Path)", "--tensor-parallel-size", "$(tensor_parallel_size)", "--gpu-memory-utilization", "0.95", "--max-model-len", "1024"], "recipe_ephemeral_storage_size": 200, "recipe_node_boot_volume_size_in_gbs": 300, "recipe_node_pool_size": 1, "recipe_shared_memory_volume_size_limit_in_mb": 200, "recipe_startup_probe_params": {"failure_threshold": 30, "endpoint_path": "/health", "port": 8000, "scheme": "HTTP", "initial_delay_seconds": 60, "period_seconds": 2, "success_threshold": 1, "timeout_seconds": 10}, "recipe_liveness_probe_params": {"failure_threshold": 3, "endpoint_path": "/health", "port": 8000, "scheme": "HTTP", "initial_delay_seconds": 65, "period_seconds": 600, "success_threshold": 1, "timeout_seconds": 10}, "recipe_node_autoscaling_params": {"min_nodes": 1, "max_nodes": 2}, "recipe_pod_autoscaling_params": {"min_replicas": 1, "max_replicas": 4}}]}, {"blueprint_type": "oci_blueprint", "blueprint_category": "workload_blueprint", "blueprint_title": "CPU Inference", "blueprint_short_description": "Deploy CPU-based inference with Ollama for cost-effective and GPU-free model serving", "blueprint_long_description": "This blueprint provides a comprehensive framework for testing inference on CPUs using the Ollama platform with a variety of supported models such as Mistral, Gemma, and others available through Ollama. Unlike GPU-dependent solutions, this blueprint is designed for environments where CPU inference is preferred or required. It offers clear guidelines and configuration settings to deploy a robust CPU inference service, enabling thorough performance evaluations and reliability testing. Ollama's lightweight and efficient architecture makes it an ideal solution for developers looking to benchmark and optimize CPU-based inference workloads.\n\nThis blueprint explains how to use CPU inference for running large language models using Ollama. 
It includes two main deployment strategies:\n\n- Serving pre-saved models directly from Object Storage\n\n- Pulling models from Ollama and saving them to Object Storage", "pre_filled_samples": [{"pre_filled_sample_name": "CPU inference with Mistral and BM.Standard.E4", "recipe_id": "cpu_inference", "recipe_mode": "service", "deployment_name": "cpu Inference mistral BME4", "recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:cpu_inference_service_v0.2", "recipe_node_shape": "BM.Standard.E4.128", "input_object_storage": [{"par": "https://objectstorage.us-ashburn-1.oraclecloud.com/p/0LYMMBRGg_OEm_hzp9BG8BlQx7Ccpy3gY-gRzjQQFZRU6peG0pXyHTRHUGZLp82E/n/iduyx1qnmway/b/ollama-models/o/", "mount_location": "/models", "volume_size_in_gbs": 20}], "recipe_container_env": [{"key": "MODEL_NAME", "value": "mistral"}, {"key": "PROMPT", "value": "What is the capital of France?"}], "recipe_replica_count": 1, "recipe_container_port": "11434", "recipe_node_pool_size": 1, "recipe_node_boot_volume_size_in_gbs": 200, "recipe_container_command_args": ["--input_directory", "/models", "--model_name", "mistral"], "recipe_ephemeral_storage_size": 100}, {"pre_filled_sample_name": "CPU inference with Gemma and BM.Standard.E5.192", "recipe_id": "cpu_inference", "recipe_mode": "service", "deployment_name": "cpu Inference gemma BME5", "recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:cpu_inference_service_v0.2", "recipe_node_shape": "BM.Standard.E5.192", "input_object_storage": [{"par": "https://objectstorage.us-ashburn-1.oraclecloud.com/p/0LYMMBRGg_OEm_hzp9BG8BlQx7Ccpy3gY-gRzjQQFZRU6peG0pXyHTRHUGZLp82E/n/iduyx1qnmway/b/ollama-models/o/", "mount_location": "/models", "volume_size_in_gbs": 20}], "recipe_container_env": [{"key": "MODEL_NAME", "value": "gemma"}, {"key": "PROMPT", "value": "What is the capital of Germany?"}], "recipe_replica_count": 1, "recipe_container_port": "11434", "recipe_node_pool_size": 1, "recipe_node_boot_volume_size_in_gbs": 200, "recipe_container_command_args": ["--input_directory", "/models", "--model_name", "gemma"], "recipe_ephemeral_storage_size": 100}, {"pre_filled_sample_name": "CPU inference with mistral and VM.Standard.E4.Flex", "recipe_id": "cpu_inference", "recipe_mode": "service", "deployment_name": "cpu Inference mistral E4Flex", "recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:cpu_inference_service_v0.2", "recipe_node_shape": "VM.Standard.E4.Flex", "recipe_flex_shape_ocpu_count": 4, "recipe_flex_shape_memory_size_in_gbs": 64, "input_object_storage": [{"par": "https://objectstorage.us-ashburn-1.oraclecloud.com/p/0LYMMBRGg_OEm_hzp9BG8BlQx7Ccpy3gY-gRzjQQFZRU6peG0pXyHTRHUGZLp82E/n/iduyx1qnmway/b/ollama-models/o/", "mount_location": "/models", "volume_size_in_gbs": 20}], "recipe_container_env": [{"key": "MODEL_NAME", "value": "mistral"}, {"key": "PROMPT", "value": "What is the capital of Spain?"}], "recipe_replica_count": 1, "recipe_container_port": "11434", "recipe_node_pool_size": 1, "recipe_node_boot_volume_size_in_gbs": 200, "recipe_container_command_args": ["--input_directory", "/models", "--model_name", "mistral"], "recipe_ephemeral_storage_size": 100}]}, {"blueprint_type": "oci_blueprint", "blueprint_category": "workload_blueprint", "blueprint_title": "LLM Inference with vLLM", "blueprint_short_description": "Deploy open-source LLMs to GPUs for inference with vLLM.", "blueprint_long_description": "This blueprint simplifies the deployment of LLMs using an open-source inference engine called vLLM. 
You can deploy a custom model or select from a variety of open-source models on Hugging Face.\n\nThe blueprint deploys the model from an object storage bucket to a GPU node in an OKE cluster in your tenancy. Once deployed, you receive a ready-to-use API endpoint to start generating responses from the model. For mission-critical workloads, you can also configure auto-scaling driven by application metrics like inference latency. To summarize, this blueprint streamlines inference deployment, making it easy to scale and integrate into your applications without deep, technical expertise.", "pre_filled_samples": [{"pre_filled_sample_name": "Meta-Llama-3.1-8B-Instruct from OCI Object Storage on VM.GPU.A10.2 with vLLM", "recipe_id": "llm_inference_nvidia", "recipe_mode": "service", "deployment_name": "vllm-model-from-obj-storage", "recipe_image_uri": "docker.io/vllm/vllm-openai:v0.9.1", "recipe_node_shape": "VM.GPU.A10.2", "recipe_prometheus_enabled": true, "input_object_storage": [{"par": "https://objectstorage.us-ashburn-1.oraclecloud.com/p/IFknABDAjiiF5LATogUbRCcVQ9KL6aFUC1j-P5NSeUcaB2lntXLaR935rxa-E-u1/n/iduyx1qnmway/b/corrino_hf_oss_models/o/", "mount_location": "/models", "volume_size_in_gbs": 500, "include": ["NousResearch/Meta-Llama-3.1-8B-Instruct"]}], "recipe_container_env": [{"key": "tensor_parallel_size", "value": "2"}, {"key": "model_name", "value": "NousResearch/Meta-Llama-3.1-8B-Instruct"}, {"key": "Model_Path", "value": "/models/NousResearch/Meta-Llama-3.1-8B-Instruct"}], "recipe_replica_count": 1, "recipe_container_port": "8000", "recipe_nvidia_gpu_count": 2, "recipe_node_pool_size": 1, "recipe_node_boot_volume_size_in_gbs": 200, "recipe_container_command_args": ["--model", "$(Model_Path)", "--tensor-parallel-size", "$(tensor_parallel_size)"], "recipe_ephemeral_storage_size": 100, "recipe_shared_memory_volume_size_limit_in_mb": 200}, {"pre_filled_sample_name": "meta-llama/Llama-3.2-11B-Vision (Closed Model) from Hugging Face on VM.GPU.A10.2 with vLLM", "recipe_id": "llm_inference_nvidia", "recipe_mode": "service", "deployment_name": "vllm-closed-hf-model", "recipe_image_uri": "docker.io/vllm/vllm-openai:v0.9.1", "recipe_node_shape": "VM.GPU.A10.2", "recipe_container_env": [{"key": "HF_TOKEN", "value": ""}], "recipe_replica_count": 1, "recipe_container_port": "8000", "recipe_prometheus_enabled": true, "recipe_nvidia_gpu_count": 2, "recipe_node_pool_size": 1, "recipe_node_boot_volume_size_in_gbs": 200, "recipe_container_command_args": ["--model", "meta-llama/Llama-3.2-11B-Vision", "--tensor-parallel-size", "2"], "recipe_ephemeral_storage_size": 100, "recipe_shared_memory_volume_size_limit_in_mb": 200}, {"pre_filled_sample_name": "NousResearch/Meta-Llama-3-8B-Instruct (Open Model) from Hugging Face on VM.GPU.A10.2 with vLLM", "recipe_id": "llm_inference_nvidia", "recipe_mode": "service", "deployment_name": "vllm-open-hf-model", "recipe_image_uri": "docker.io/vllm/vllm-openai:v0.9.1", "recipe_node_shape": "VM.GPU.A10.2", "recipe_prometheus_enabled": true, "recipe_container_env": [{"key": "tensor_parallel_size", "value": "2"}, {"key": "model_name", "value": "NousResearch/Meta-Llama-3-8B-Instruct"}], "recipe_replica_count": 1, "recipe_container_port": "8000", "recipe_nvidia_gpu_count": 2, "recipe_node_pool_size": 1, "recipe_node_boot_volume_size_in_gbs": 200, "recipe_container_command_args": ["--model", "$(model_name)", "--tensor-parallel-size", "$(tensor_parallel_size)"], "recipe_ephemeral_storage_size": 100, "recipe_shared_memory_volume_size_limit_in_mb": 200}, 
{"pre_filled_sample_name": "NousResearch/Meta-Llama-3-8B-Instruct (Open Model) from Hugging Face on VM.GPU.A10.2 with vLLM and Endpoint API Key", "recipe_id": "llm_inference_nvidia", "recipe_mode": "service", "deployment_name": "vllm-open-hf-model-api-key-functionality", "recipe_image_uri": "docker.io/vllm/vllm-openai:v0.9.1", "recipe_node_shape": "VM.GPU.A10.2", "recipe_prometheus_enabled": true, "recipe_container_env": [{"key": "VLLM_API_KEY", "value": ""}], "recipe_replica_count": 1, "recipe_container_port": "8000", "recipe_nvidia_gpu_count": 2, "recipe_node_pool_size": 1, "recipe_node_boot_volume_size_in_gbs": 200, "recipe_container_command_args": ["--model", "NousResearch/Meta-Llama-3-8B-Instruct", "--tensor-parallel-size", "2"], "recipe_ephemeral_storage_size": 100, "recipe_shared_memory_volume_size_limit_in_mb": 200}]}, {"blueprint_type": "oci_blueprint", "blueprint_category": "workload_blueprint", "blueprint_title": "Multi-Instance GPU (MIG)", "blueprint_short_description": "Partition GPUs into multiple isolated instances for efficient resource sharing and concurrent workloads", "blueprint_long_description": "Multi-Instance GPU (MIG) is a feature of NVIDIA GPUs that allows a single physical GPU to be partitioned into multiple isolated instances, each acting as an independent GPU with dedicated compute, memory, and cache resources. This enables multiple users or workloads to run concurrently on a single GPU without interfering with each other and without virtualization overhead.\n\nMIG is particularly useful when running multiple smaller models that do not require an entire GPU, such as hosting multiple smaller LLMs (Llama-7B, Mistral-7B, or Gemma-2B) on an A100 or H100 GPU. It ensures resource allocation is optimized, preventing one model from monopolizing the entire GPU while maintaining high throughput. This approach is incredibly well-suited for autoscaling scenarios because many more pods can be scheduled onto a single node depending on the MIG configuration.\n\nCurrently, OCI AI Blueprints supports MIG for H100, H200, and B200s with various slice configurations ranging from 7 mini GPUs to full instances. 
The system supports creating MIG-enabled shared node pools, deploying inference workloads to specific MIG slices, and updating MIG configurations on existing nodes.\n\nTo see supported configurations and resource requests, go to [Mig Configurations](./README.md#mig-configurations).", "pre_filled_samples": [{"pre_filled_sample_name": "MIG-Enabled H100 Shared Node Pool", "deployment_name": "H100_pool_mig", "recipe_mode": "shared_node_pool", "shared_node_pool_size": 1, "shared_node_pool_shape": "BM.GPU.H100.8", "shared_node_pool_boot_volume_size_in_gbs": 1000, "shared_node_pool_mig_config": "all-1g.20gb"}, {"pre_filled_sample_name": "MIG Inference with Multiple Replicas and Autoscaling", "recipe_id": "llm_inference_nvidia", "recipe_mode": "service", "deployment_name": "autoscale_mig", "recipe_image_uri": "docker.io/vllm/vllm-openai:v0.9.1", "recipe_prometheus_enabled": true, "recipe_node_shape": "BM.GPU.H100.8", "recipe_container_env": [{"key": "tensor_parallel_size", "value": "1"}, {"key": "HF_TOKEN", "value": ""}], "recipe_replica_count": 5, "recipe_container_port": "8000", "recipe_nvidia_gpu_count": 1, "recipe_use_shared_node_pool": true, "mig_resource_request": "1g.10gb", "recipe_container_command_args": ["--model", "meta-llama/Llama-3.2-3B-Instruct", "--dtype", "bfloat16", "--tensor-parallel-size", "$(tensor_parallel_size)", "--gpu-memory-utilization", "0.99", "--max-model-len", "16384"], "recipe_ephemeral_storage_size": 30, "recipe_node_boot_volume_size_in_gbs": 300, "recipe_shared_memory_volume_size_limit_in_mb": 1000, "recipe_startup_probe_params": {"failure_threshold": 30, "endpoint_path": "/health", "port": 8000, "scheme": "HTTP", "initial_delay_seconds": 10, "period_seconds": 2, "success_threshold": 1, "timeout_seconds": 1}, "recipe_liveness_probe_params": {"failure_threshold": 3, "endpoint_path": "/health", "port": 8000, "scheme": "HTTP", "initial_delay_seconds": 65, "period_seconds": 600, "success_threshold": 1, "timeout_seconds": 1}, "recipe_pod_autoscaling_params": {"min_replicas": 5, "max_replicas": 10}}, {"pre_filled_sample_name": "MIG Inference Single Replica (20GB Slice)", "recipe_id": "llm_inference_nvidia", "recipe_mode": "service", "deployment_name": "autoscale_mig", "recipe_image_uri": "docker.io/vllm/vllm-openai:v0.9.1", "recipe_node_shape": "BM.GPU.H100.8", "recipe_prometheus_enabled": true, "recipe_container_env": [{"key": "tensor_parallel_size", "value": "1"}, {"key": "HF_TOKEN", "value": ""}], "recipe_replica_count": 1, "recipe_container_port": "8000", "recipe_nvidia_gpu_count": 1, "recipe_use_shared_node_pool": true, "mig_resource_request": "1g.20gb", "recipe_container_command_args": ["--model", "meta-llama/Llama-3.2-3B-Instruct", "--dtype", "bfloat16", "--tensor-parallel-size", "$(tensor_parallel_size)", "--gpu-memory-utilization", "0.99", "--max-model-len", "16384"], "recipe_ephemeral_storage_size": 30, "recipe_node_boot_volume_size_in_gbs": 300, "recipe_shared_memory_volume_size_limit_in_mb": 1000, "recipe_startup_probe_params": {"failure_threshold": 30, "endpoint_path": "/health", "port": 8000, "scheme": "HTTP", "initial_delay_seconds": 10, "period_seconds": 2, "success_threshold": 1, "timeout_seconds": 1}, "recipe_liveness_probe_params": {"failure_threshold": 3, "endpoint_path": "/health", "port": 8000, "scheme": "HTTP", "initial_delay_seconds": 65, "period_seconds": 600, "success_threshold": 1, "timeout_seconds": 1}, "recipe_pod_autoscaling_params": {"min_replicas": 1, "max_replicas": 50}}, {"pre_filled_sample_name": "MIG Inference Single Replica (10GB 
Slice)", "recipe_id": "llm_inference_nvidia", "recipe_mode": "service", "deployment_name": "autoscale_mig", "recipe_image_uri": "docker.io/vllm/vllm-openai:v0.9.1", "recipe_node_shape": "BM.GPU.H100.8", "recipe_prometheus_enabled": true, "recipe_container_env": [{"key": "tensor_parallel_size", "value": "1"}, {"key": "HF_TOKEN", "value": ""}], "recipe_replica_count": 1, "recipe_container_port": "8000", "recipe_nvidia_gpu_count": 1, "recipe_use_shared_node_pool": true, "mig_resource_request": "1g.10gb", "recipe_container_command_args": ["--model", "meta-llama/Llama-3.2-3B-Instruct", "--dtype", "bfloat16", "--tensor-parallel-size", "$(tensor_parallel_size)", "--gpu-memory-utilization", "0.99", "--max-model-len", "16384"], "recipe_ephemeral_storage_size": 30, "recipe_node_boot_volume_size_in_gbs": 300, "recipe_shared_memory_volume_size_limit_in_mb": 1000, "recipe_startup_probe_params": {"failure_threshold": 30, "endpoint_path": "/health", "port": 8000, "scheme": "HTTP", "initial_delay_seconds": 10, "period_seconds": 2, "success_threshold": 1, "timeout_seconds": 1}, "recipe_liveness_probe_params": {"failure_threshold": 3, "endpoint_path": "/health", "port": 8000, "scheme": "HTTP", "initial_delay_seconds": 65, "period_seconds": 600, "success_threshold": 1, "timeout_seconds": 1}, "recipe_pod_autoscaling_params": {"min_replicas": 1, "max_replicas": 50}}, {"pre_filled_sample_name": "Update MIG Configuration by Node Name", "recipe_mode": "update", "deployment_name": "all-1g10gb", "recipe_node_name": "10.0.10.138", "shared_node_pool_mig_config": "all-1g.10gb"}, {"pre_filled_sample_name": "Update MIG Configuration by Node Pool Name", "recipe_mode": "update", "deployment_name": "all-2g-20gb", "recipe_node_pool_name": "h100migpool", "shared_node_pool_mig_config": "all-2g.20gb"}]}, {"blueprint_type": "oci_blueprint", "blueprint_category": "workload_blueprint", "blueprint_title": "Multi-Node Inference", "blueprint_short_description": "Scale large language model inference across multiple GPU nodes using tensor and pipeline parallelism", "blueprint_long_description": "Multi-node inference enables deploying very large language models that cannot fit within the GPU memory of a single node by distributing the workload across multiple computing nodes. This approach combines tensor parallelism (splitting operations across GPUs within a node) and pipeline parallelism (distributing sequential stages across nodes) to efficiently utilize available hardware resources.\n\nThis blueprint is essential when serving models like Llama-3.3-70B-Instruct that require approximately 150GB of GPU memory, exceeding the capacity of single-node configurations. The system uses vLLM and Ray with LeaderWorkerSet (LWS) to manage distributed state across nodes, creating a cluster with one head node and multiple worker nodes.\n\nThe multi-node approach significantly reduces processing time and improves throughput for both real-time and batch predictions. It requires careful planning to determine the appropriate node shapes and GPU requirements based on model size, precision, and available compute shapes. 
The system supports shared node pools and optional RDMA connectivity for enhanced performance.\n\nKey benefits include the ability to serve models that exceed single-node memory limits, improved inference throughput through parallel processing, and efficient resource utilization across distributed GPU infrastructure.", "pre_filled_samples": [{"pre_filled_sample_name": "Multi-Node Inference on VM.GPU.A10 Cluster", "recipe_id": "vllm_multinode_inference", "recipe_mode": "service", "deployment_name": "multinode_inference", "recipe_node_shape": "VM.GPU.A10.2", "recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:ray2430_vllmv083", "input_object_storage": [{"par": "https://objectstorage.us-ashburn-1.oraclecloud.com/p/IFknABDAjiiF5LATogUbRCcVQ9KL6aFUC1j-P5NSeUcaB2lntXLaR935rxa-E-u1/n/iduyx1qnmway/b/corrino_hf_oss_models/o/", "mount_location": "/models", "volume_size_in_gbs": 500, "include": ["NousResearch/Meta-Llama-3.1-8B-Instruct"]}], "recipe_replica_count": 1, "recipe_nvidia_gpu_count": 2, "recipe_ephemeral_storage_size": 150, "recipe_shared_memory_volume_size_limit_in_mb": 10000, "recipe_container_port": "8000", "recipe_use_shared_node_pool": true, "multinode_num_nodes_to_use_from_shared_pool": 2, "recipe_container_command_args": ["--port", "8000", "--model", "/models", "--tensor-parallel-size", "2", "--pipeline-parallel-size", "2", "--gpu-memory-utilization", "0.90", "--distributed-executor-backend", "ray"], "recipe_readiness_probe_params": {"endpoint_path": "/health", "port": 8000, "initial_delay_seconds": 20, "period_seconds": 10}}, {"pre_filled_sample_name": "Multi-Node Inference on BM.GPU.A10 Cluster", "recipe_id": "vllm_multinode_inference", "recipe_mode": "service", "deployment_name": "multinode_inference", "recipe_node_shape": "BM.GPU.A10.4", "recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:ray2430_vllmv083", "input_object_storage": [{"par": "https://objectstorage.us-ashburn-1.oraclecloud.com/p/IFknABDAjiiF5LATogUbRCcVQ9KL6aFUC1j-P5NSeUcaB2lntXLaR935rxa-E-u1/n/iduyx1qnmway/b/corrino_hf_oss_models/o/", "mount_location": "/models", "volume_size_in_gbs": 500, "include": ["NousResearch/Meta-Llama-3.1-8B-Instruct"]}], "recipe_replica_count": 1, "recipe_nvidia_gpu_count": 4, "recipe_ephemeral_storage_size": 150, "recipe_shared_memory_volume_size_limit_in_mb": 10000, "recipe_container_port": "8000", "recipe_use_shared_node_pool": true, "multinode_num_nodes_to_use_from_shared_pool": 2, "recipe_container_command_args": ["--port", "8000", "--model", "/models", "--tensor-parallel-size", "4", "--pipeline-parallel-size", "2", "--gpu-memory-utilization", "0.90", "--distributed-executor-backend", "ray"], "recipe_readiness_probe_params": {"endpoint_path": "/health", "port": 8000, "initial_delay_seconds": 20, "period_seconds": 10}}]}, {"blueprint_type": "oci_blueprint", "blueprint_category": "workload_blueprint", "blueprint_title": "Install OCI AI Blueprints onto an Existing OKE Cluster", "blueprint_short_description": "Deploy OCI AI Blueprints on your existing OKE cluster without creating new infrastructure", "blueprint_long_description": "This guide helps you install and use **OCI AI Blueprints** on an existing OKE cluster that was created outside of blueprints and already has workflows running on it. 
Rather than installing blueprints onto a new cluster, you can leverage an existing cluster with node pools and tools already installed.\n\nThe installation process involves ensuring you have the correct IAM policies in place, retrieving existing cluster OKE and VCN information from the console, deploying the OCI AI Blueprints application onto the existing cluster, and optionally adding existing nodes to be used by blueprints. You can then deploy sample recipes to test functionality.\n\nKey considerations include managing existing tooling like Prometheus, Grafana, or the GPU operator that may already be installed on your cluster. The blueprint installation process can detect and work around these existing components. Additionally, if you have the nvidia-gpu-operator installed and plan to use Multi-Instance GPUs with H100 nodes, special configuration steps are available.\n\nThis approach allows you to:\n\n- Leverage existing cluster resources and configurations\n\n- Add blueprints capabilities without disrupting current workloads\n\n- Utilize existing node pools for blueprint deployments\n\n- Maintain compatibility with pre-installed cluster tools", "pre_filled_samples": [{"pre_filled_sample_name": "Add Existing Node to Control Plane", "recipe_mode": "update", "deployment_name": "startupaddnode", "recipe_node_name": "10.0.10.164", "recipe_node_labels": {"corrino": "a10pool", "corrino/pool-shared-any": "true"}}]}, {"blueprint_type": "oci_blueprint", "blueprint_category": "workload_blueprint", "blueprint_title": "Llama Stack on OCI", "blueprint_short_description": "Pre-packaged GenAI runtime — vLLM + ChromaDB + Postgres (optional Jaeger) ready for one-click deployment", "blueprint_long_description": "Deploy Llama Stack on OCI via OCI AI Blueprints. For more information on Llama Stack: https://github.com/meta-llama/llama-stack\n\nWe are using Postgres for the backend store, chromaDB for the vector database, Jaeger for tracing and vLLM for inference serving.", "pre_filled_samples": [{"pre_filled_sample_name": "Llama 3.1 8B Model with vLLM", "deployment_group": {"name": "group", "deployments": [{"name": "postgres", "recipe": {"recipe_id": "postgres", "deployment_name": "postgres", "recipe_mode": "service", "recipe_node_pool_size": 1, "recipe_node_shape": "VM.Standard.E4.Flex", "recipe_flex_shape_ocpu_count": 2, "recipe_flex_shape_memory_size_in_gbs": 16, "recipe_node_boot_volume_size_in_gbs": 200, "recipe_ephemeral_storage_size": 100, "recipe_image_uri": "docker.io/library/postgres:latest", "recipe_container_port": "5432", "recipe_host_port": "5432", "recipe_container_env": [{"key": "POSTGRES_USER", "value": "llamastack"}, {"key": "POSTGRES_PASSWORD", "value": "llamastack"}, {"key": "POSTGRES_DB", "value": "llamastack"}], "recipe_replica_count": 1}, "exports": ["internal_dns_name"]}, {"name": "chroma", "recipe": {"recipe_id": "chromadb", "deployment_name": "chroma", "recipe_mode": "service", "recipe_node_pool_size": 1, "recipe_node_shape": "VM.Standard.E4.Flex", "recipe_flex_shape_ocpu_count": 2, "recipe_flex_shape_memory_size_in_gbs": 16, "recipe_node_boot_volume_size_in_gbs": 200, "recipe_ephemeral_storage_size": 100, "recipe_image_uri": "docker.io/chromadb/chroma:latest", "recipe_container_port": "8000", "recipe_host_port": "8000", "recipe_container_env": [{"key": "IS_PERSISTENT", "value": "TRUE"}, {"key": "ANONYMIZED_TELEMETRY", "value": "FALSE"}], "recipe_replica_count": 1, "output_object_storage": [{"bucket_name": "chromadb", "mount_location": "/chroma/chroma", "volume_size_in_gbs": 500}]}, 
"exports": ["internal_dns_name"]}, {"name": "vllm", "recipe": {"recipe_id": "llm_inference_nvidia", "deployment_name": "vllm", "recipe_mode": "service", "recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:vllmv0.6.6.pos1", "recipe_node_shape": "VM.GPU.A10.2", "input_object_storage": [{"par": "https://objectstorage.us-ashburn-1.oraclecloud.com/p/IFknABDAjiiF5LATogUbRCcVQ9KL6aFUC1j-P5NSeUcaB2lntXLaR935rxa-E-u1/n/iduyx1qnmway/b/corrino_hf_oss_models/o/", "mount_location": "/models", "volume_size_in_gbs": 500, "include": ["NousResearch/Meta-Llama-3.1-8B-Instruct"]}], "recipe_container_env": [{"key": "tensor_parallel_size", "value": "2"}, {"key": "model_name", "value": "NousResearch/Meta-Llama-3.1-8B-Instruct"}, {"key": "Model_Path", "value": "/models/NousResearch/Meta-Llama-3.1-8B-Instruct"}], "recipe_replica_count": 1, "recipe_container_port": "8000", "recipe_nvidia_gpu_count": 2, "recipe_node_pool_size": 1, "recipe_node_boot_volume_size_in_gbs": 200, "recipe_container_command_args": ["--model", "$(Model_Path)", "--tensor-parallel-size", "$(tensor_parallel_size)"], "recipe_ephemeral_storage_size": 100, "recipe_shared_memory_volume_size_limit_in_mb": 200}, "exports": ["internal_dns_name"]}, {"name": "jaeger", "recipe": {"recipe_id": "jaeger", "deployment_name": "jaeger", "recipe_mode": "service", "recipe_node_pool_size": 1, "recipe_node_shape": "VM.Standard.E4.Flex", "recipe_flex_shape_ocpu_count": 2, "recipe_flex_shape_memory_size_in_gbs": 16, "recipe_node_boot_volume_size_in_gbs": 200, "recipe_ephemeral_storage_size": 100, "recipe_image_uri": "docker.io/jaegertracing/jaeger:latest", "recipe_container_port": "16686", "recipe_additional_ingress_ports": [{"name": "jaeger", "port": 4318, "path": "/jaeger"}], "recipe_replica_count": 1}, "exports": ["internal_dns_name"]}, {"name": "llamastack_app", "recipe": {"recipe_id": "llamastack_app", "deployment_name": "llamastack_app", "recipe_mode": "service", "recipe_node_pool_size": 1, "recipe_node_shape": "VM.Standard.E4.Flex", "recipe_flex_shape_ocpu_count": 2, "recipe_flex_shape_memory_size_in_gbs": 16, "recipe_node_boot_volume_size_in_gbs": 200, "recipe_ephemeral_storage_size": 100, "recipe_image_uri": "docker.io/llamastack/distribution-postgres-demo:latest", "recipe_container_port": "8321", "recipe_container_env": [{"key": "INFERENCE_MODEL", "value": "/models/NousResearch/Meta-Llama-3.1-8B-Instruct"}, {"key": "VLLM_URL", "value": "http://${vllm.internal_dns_name}/v1"}, {"key": "ENABLE_CHROMADB", "value": "1"}, {"key": "CHROMADB_URL", "value": "http://${chroma.internal_dns_name}:8000"}, {"key": "POSTGRES_HOST", "value": "${postgres.internal_dns_name}"}, {"key": "POSTGRES_PORT", "value": "5432"}, {"key": "POSTGRES_DB", "value": "llamastack"}, {"key": "POSTGRES_USER", "value": "llamastack"}, {"key": "POSTGRES_PASSWORD", "value": "llamastack"}, {"key": "TELEMETRY_SINKS", "value": "console,otel_trace"}, {"key": "OTEL_TRACE_ENDPOINT", "value": "http://${jaeger.internal_dns_name}/jaeger/v1/traces"}], "output_object_storage": [{"bucket_name": "llamastack", "mount_location": "/root/.llama", "volume_size_in_gbs": 100}], "recipe_replica_count": 1}, "depends_on": ["postgres", "chroma", "vllm", "jaeger"]}]}}]}, {"blueprint_type": "oci_blueprint", "blueprint_category": "workload_blueprint", "blueprint_title": "Model Storage", "blueprint_short_description": "Download and store models from HuggingFace to OCI Object Storage for efficient blueprint deployment", "blueprint_long_description": "Model storage is a critical component for AI/ML 
workloads, providing efficient access to large language models and other AI assets. OCI AI Blueprints supports storing models in OCI Object Storage, which offers faster loading times and better resource management compared to downloading models directly from HuggingFace during container startup.\n\nThis blueprint provides automated workflows to download models from HuggingFace (both open and gated models) and store them in OCI Object Storage buckets. Once stored, these models can be efficiently accessed by inference blueprints through pre-authenticated requests (PARs) or direct bucket access, significantly reducing deployment times and improving reliability.\n\nThe system supports both open-source models that require no authentication and closed/gated models that require HuggingFace tokens for access. Models are downloaded using optimized parallel workers and stored with appropriate volume sizing to accommodate large model files.", "pre_filled_samples": [{"pre_filled_sample_name": "Download Closed HuggingFace Model to Object Storage", "recipe_id": "example", "recipe_mode": "job", "deployment_name": "model_to_object", "recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:hf_downloader_v1", "recipe_container_command_args": ["meta-llama/Llama-3.2-90B-Vision-Instruct", "--local-dir", "/models", "--max-workers", "4", "--token", ""], "recipe_container_port": "5678", "recipe_node_shape": "VM.Standard.E4.Flex", "recipe_node_pool_size": 1, "recipe_flex_shape_ocpu_count": 4, "recipe_flex_shape_memory_size_in_gbs": 64, "recipe_node_boot_volume_size_in_gbs": 500, "recipe_ephemeral_storage_size": 450, "output_object_storage": [{"bucket_name": "llama3290Bvisioninstruct", "mount_location": "/models", "volume_size_in_gbs": 450}]}, {"pre_filled_sample_name": "Download Open HuggingFace Model to Object Storage", "recipe_id": "example", "recipe_mode": "job", "deployment_name": "model_to_object", "recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:hf_downloader_v1", "recipe_container_command_args": ["NousResearch/Meta-Llama-3.1-405B-FP8", "--local-dir", "/models", "--max-workers", "16"], "recipe_container_port": "5678", "recipe_node_shape": "VM.Standard.E4.Flex", "recipe_node_pool_size": 1, "recipe_flex_shape_ocpu_count": 16, "recipe_flex_shape_memory_size_in_gbs": 256, "recipe_node_boot_volume_size_in_gbs": 1000, "recipe_ephemeral_storage_size": 900, "output_object_storage": [{"bucket_name": "nousllama31405bfp8", "mount_location": "/models", "volume_size_in_gbs": 800}]}]}, {"blueprint_type": "oci_blueprint", "blueprint_category": "workload_blueprint", "blueprint_title": "Using RDMA Enabled Node Pools", "blueprint_short_description": "Enable high-performance inter-node communication using Remote Direct Memory Access for large-scale AI workloads", "blueprint_long_description": "Remote Direct Memory Access (RDMA) is a protocol that enables one node to read from or write to the memory of another node without involving either machine's CPU or operating system, enabling true zero-copy data transfers and dramatically reducing latency and CPU overhead. In large-scale AI workloads such as multi-node training with AllReduce or disaggregated LLM inference, RDMA can yield tremendous performance gains by significantly reducing communication and copy overhead between nodes.\n\nOCI AI Blueprints uses OCI cluster networks with instance pools to provision RDMA-enabled node pools, supporting high-performance compute shapes including BM.GPU.H100.8, BM.GPU.H200.8, and BM.GPU.B4.8. 
The system requires custom node images with proper drivers and libraries for RDMA connectivity, which must be imported from the oci-hpc-oke quickstart repository.\n\nRDMA-enabled deployments are particularly valuable for distributing very large language models (like Llama-3.1-405B-Instruct) that exceed single-node GPU memory capacity, requiring distributed inference across multiple nodes with high-bandwidth, low-latency communication. The technology enables efficient tensor and pipeline parallelism by eliminating traditional network communication bottlenecks.\n\nThe implementation supports both creating new RDMA-enabled shared node pools and integrating OCI AI Blueprints with existing RDMA-enabled clusters, providing flexibility for various deployment scenarios and infrastructure configurations.", "pre_filled_samples": [{"pre_filled_sample_name": "RDMA-Enabled H100 Shared Node Pool", "deployment_name": "H100_rdma_pool", "recipe_mode": "shared_node_pool", "shared_node_pool_size": 2, "shared_node_pool_shape": "BM.GPU.H100.8", "shared_node_pool_boot_volume_size_in_gbs": 1000, "recipe_availability_domain": "TrcQ:EU-FRANKFURT-1-AD-3", "recipe_node_image_ocid": "ocid1.image.oc1.eu-frankfurt-1.aaaaaaaakhpy5kt3p6gjmeqbasnndemp6aetlnbkm57hohrkgksuh4476llq", "multinode_rdma_enabled_in_shared_pool": true}, {"pre_filled_sample_name": "RDMA Distributed Inference (405B Model)", "recipe_id": "llm_inference_nvidia", "recipe_mode": "service", "deployment_name": "405b", "recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:ray2430_vllmv083", "recipe_node_shape": "BM.GPU.H100.8", "recipe_replica_count": 1, "recipe_container_port": "8000", "recipe_nvidia_gpu_count": 8, "recipe_use_shared_node_pool": true, "multinode_rdma_enabled_in_shared_pool": true, "multinode_num_nodes_to_use_from_shared_pool": 2, "input_object_storage": [{"par": "https://iduyx1qnmway.objectstorage.eu-frankfurt-1.oci.customer-oci.com/p/7N2O5JFirNX_CG70t-HPILzHvlTMP4FC9f_eauJVECosqNafIYxwcDwhItQHvaDK/n/iduyx1qnmway/b/llama31405binstruct/o/", "mount_location": "/models", "volume_size_in_gbs": 500}], "recipe_container_env": [{"key": "NCCL_DEBUG", "value": "INFO"}, {"key": "NCCL_DEBUG_SUBSYS", "value": "INIT,NET,ENV"}], "recipe_readiness_probe_params": {"endpoint_path": "/health", "port": 8000, "initial_delay_seconds": 20, "period_seconds": 10}, "recipe_container_command_args": ["--port", "8000", "--model", "/models", "--tensor-parallel-size", "8", "--gpu-memory-utilization", "0.90", "--pipeline-parallel-size", "2", "--distributed-executor-backend", "ray"], "recipe_ephemeral_storage_size": 100, "recipe_shared_memory_volume_size_limit_in_mb": 10000}, {"pre_filled_sample_name": "Update Nodes for RDMA Support", "recipe_mode": "update", "deployment_name": "startupaddnode1", "recipe_node_name": "10.0.10.164", "recipe_node_labels": {"corrino": "h100pool", "corrino/pool-shared-any": "true", "corrino/rdma": "true"}}]}, {"blueprint_type": "oci_blueprint", "blueprint_category": "workload_blueprint", "blueprint_title": "Whisper Transcription API", "blueprint_short_description": "This blueprint provides a complete solution for running **audio/video transcription**, **speaker diarization**, and **summarization** via a RESTful API. It integrates [Faster-Whisper](https://github.com/guillaumekln/faster-whisper) for efficient transcription, [pyannote.audio](https://github.com/pyannote/pyannote-audio) for diarization, and Hugging Face instruction-tuned LLMs (e.g., Mistral-7B) for summarization. 
It supports multi-GPU acceleration, real-time streaming logs, and JSON/text output formats.", "blueprint_long_description": "---", "pre_filled_samples": [{"pre_filled_sample_name": "Deploy Whisper transcription on A10 GPU for real-time speech-to-text", "recipe_id": "whisper transcription", "recipe_mode": "service", "deployment_name": "whisper-transcription-a10", "recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:whisper_transcription_v8", "recipe_node_shape": "VM.GPU.A10.2", "recipe_replica_count": 1, "recipe_container_port": "8000", "recipe_nvidia_gpu_count": 2, "recipe_node_pool_size": 1, "recipe_node_boot_volume_size_in_gbs": 200, "recipe_ephemeral_storage_size": 100, "recipe_shared_memory_volume_size_limit_in_mb": 1000}, {"pre_filled_sample_name": "Deploy Whisper transcription on A100 GPU for high-speed processing", "recipe_id": "whisper transcription", "recipe_mode": "service", "deployment_name": "whisper-transcription-a100", "recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:whisper_transcription_v8", "recipe_node_shape": "BM.GPU.A100.8", "recipe_replica_count": 1, "recipe_container_port": "8000", "recipe_nvidia_gpu_count": 8, "recipe_node_pool_size": 1, "recipe_node_boot_volume_size_in_gbs": 200, "recipe_ephemeral_storage_size": 100, "recipe_shared_memory_volume_size_limit_in_mb": 1000}, {"pre_filled_sample_name": "Deploy Whisper transcription on H100 GPU for next-gen AI workloads", "recipe_id": "whisper transcription", "recipe_mode": "service", "deployment_name": "whisper-transcription-h100", "recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:whisper_transcription_v8", "recipe_node_shape": "BM.GPU.H100.8", "recipe_replica_count": 1, "recipe_container_port": "8000", "recipe_nvidia_gpu_count": 8, "recipe_node_pool_size": 1, "recipe_node_boot_volume_size_in_gbs": 200, "recipe_ephemeral_storage_size": 100, "recipe_shared_memory_volume_size_limit_in_mb": 1000}]}, {"blueprint_type": "oci_blueprint", "blueprint_category": "workload_blueprint", "blueprint_title": "Deployment Groups", "blueprint_short_description": "Connected multi-container deployments in a single blueprint", "blueprint_long_description": "Deployment Groups let you spin up several deployments — each derived from its own blueprint — in a single `POST /deployment` request and treat them as one cohesive application. OCI AI Blueprints automatically sequences those member deployments according to the depends_on relationships you declare, publishes each deployment’s outputs (such as service URLs or internal dns name) for easy discovery, and then injects those outputs wherever you reference the placeholder `${deployment_name.export_key}` inside downstream blueprints. 
What once required a series of separate API calls stitched together with hard-coded endpoints can now be expressed declaratively in one step, with OCI AI Blueprints resolving every cross-service connection at runtime.", "pre_filled_samples": [{"pre_filled_sample_name": "Deployment Groups Showcase: Llama Stack", "deployment_group": {"name": "group", "deployments": [{"name": "postgres", "recipe": {"recipe_id": "postgres", "deployment_name": "postgres", "recipe_mode": "service", "recipe_node_pool_size": 1, "recipe_node_shape": "VM.Standard.E4.Flex", "recipe_flex_shape_ocpu_count": 2, "recipe_flex_shape_memory_size_in_gbs": 16, "recipe_node_boot_volume_size_in_gbs": 200, "recipe_ephemeral_storage_size": 100, "recipe_image_uri": "docker.io/library/postgres:latest", "recipe_container_port": "5432", "recipe_host_port": "5432", "recipe_container_env": [{"key": "POSTGRES_USER", "value": "llamastack"}, {"key": "POSTGRES_PASSWORD", "value": "llamastack"}, {"key": "POSTGRES_DB", "value": "llamastack"}], "recipe_replica_count": 1}, "exports": ["internal_dns_name"]}, {"name": "chroma", "recipe": {"recipe_id": "chromadb", "deployment_name": "chroma", "recipe_mode": "service", "recipe_node_pool_size": 1, "recipe_node_shape": "VM.Standard.E4.Flex", "recipe_flex_shape_ocpu_count": 2, "recipe_flex_shape_memory_size_in_gbs": 16, "recipe_node_boot_volume_size_in_gbs": 200, "recipe_ephemeral_storage_size": 100, "recipe_image_uri": "docker.io/chromadb/chroma:latest", "recipe_container_port": "8000", "recipe_host_port": "8000", "recipe_container_env": [{"key": "IS_PERSISTENT", "value": "TRUE"}, {"key": "ANONYMIZED_TELEMETRY", "value": "FALSE"}], "recipe_replica_count": 1, "output_object_storage": [{"bucket_name": "chromadb", "mount_location": "/chroma/chroma", "volume_size_in_gbs": 500}]}, "exports": ["internal_dns_name"]}, {"name": "vllm", "recipe": {"recipe_id": "llm_inference_nvidia", "deployment_name": "vllm", "recipe_mode": "service", "recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:vllmv0.6.6.pos1", "recipe_node_shape": "VM.GPU.A10.2", "input_object_storage": [{"par": "https://objectstorage.us-ashburn-1.oraclecloud.com/p/IFknABDAjiiF5LATogUbRCcVQ9KL6aFUC1j-P5NSeUcaB2lntXLaR935rxa-E-u1/n/iduyx1qnmway/b/corrino_hf_oss_models/o/", "mount_location": "/models", "volume_size_in_gbs": 500, "include": ["NousResearch/Meta-Llama-3.1-8B-Instruct"]}], "recipe_container_env": [{"key": "tensor_parallel_size", "value": "2"}, {"key": "model_name", "value": "NousResearch/Meta-Llama-3.1-8B-Instruct"}, {"key": "Model_Path", "value": "/models/NousResearch/Meta-Llama-3.1-8B-Instruct"}], "recipe_replica_count": 1, "recipe_container_port": "8000", "recipe_nvidia_gpu_count": 2, "recipe_node_pool_size": 1, "recipe_node_boot_volume_size_in_gbs": 200, "recipe_container_command_args": ["--model", "$(Model_Path)", "--tensor-parallel-size", "$(tensor_parallel_size)"], "recipe_ephemeral_storage_size": 100, "recipe_shared_memory_volume_size_limit_in_mb": 200}, "exports": ["internal_dns_name"]}, {"name": "jaeger", "recipe": {"recipe_id": "jaeger", "deployment_name": "jaeger", "recipe_mode": "service", "recipe_node_pool_size": 1, "recipe_node_shape": "VM.Standard.E4.Flex", "recipe_flex_shape_ocpu_count": 2, "recipe_flex_shape_memory_size_in_gbs": 16, "recipe_node_boot_volume_size_in_gbs": 200, "recipe_ephemeral_storage_size": 100, "recipe_image_uri": "docker.io/jaegertracing/jaeger:latest", "recipe_container_port": "16686", "recipe_additional_ingress_ports": [{"name": "jaeger", "port": 4318, "path": "/jaeger"}], 
"recipe_replica_count": 1}, "exports": ["internal_dns_name"]}, {"name": "llamastack_app", "recipe": {"recipe_id": "llamastack_app", "deployment_name": "llamastack_app", "recipe_mode": "service", "recipe_node_pool_size": 1, "recipe_node_shape": "VM.Standard.E4.Flex", "recipe_flex_shape_ocpu_count": 2, "recipe_flex_shape_memory_size_in_gbs": 16, "recipe_node_boot_volume_size_in_gbs": 200, "recipe_ephemeral_storage_size": 100, "recipe_image_uri": "docker.io/llamastack/distribution-postgres-demo:latest", "recipe_container_port": "8321", "recipe_container_env": [{"key": "INFERENCE_MODEL", "value": "/models/NousResearch/Meta-Llama-3.1-8B-Instruct"}, {"key": "VLLM_URL", "value": "http://${vllm.internal_dns_name}/v1"}, {"key": "ENABLE_CHROMADB", "value": "1"}, {"key": "CHROMADB_URL", "value": "http://${chroma.internal_dns_name}:8000"}, {"key": "POSTGRES_HOST", "value": "${postgres.internal_dns_name}"}, {"key": "POSTGRES_PORT", "value": "5432"}, {"key": "POSTGRES_DB", "value": "llamastack"}, {"key": "POSTGRES_USER", "value": "llamastack"}, {"key": "POSTGRES_PASSWORD", "value": "llamastack"}, {"key": "TELEMETRY_SINKS", "value": "console,otel_trace"}, {"key": "OTEL_TRACE_ENDPOINT", "value": "http://${jaeger.internal_dns_name}/jaeger/v1/traces"}], "output_object_storage": [{"bucket_name": "llamastack", "mount_location": "/root/.llama", "volume_size_in_gbs": 100}], "recipe_replica_count": 1}, "depends_on": ["postgres", "chroma", "vllm", "jaeger"]}]}}]}, {"blueprint_type": "oci_blueprint", "blueprint_category": "workload_blueprint", "blueprint_title": "Shared Node Pools", "blueprint_short_description": "Create persistent node pools for efficient blueprint deployment without infrastructure recycling", "blueprint_long_description": "Shared node pools enable you to launch infrastructure independent of individual blueprints, allowing multiple blueprints to deploy and undeploy on the same underlying infrastructure without the overhead of spinning up new node pools for each deployment. This approach eliminates the time-consuming process of infrastructure provisioning and teardown, particularly beneficial for bare metal shapes that require longer recycle times.\n\nWhen you deploy a standard blueprint, OCI AI Blueprints creates a separate node pool for each blueprint and destroys it upon undeployment. Shared node pools solve this inefficiency by providing persistent infrastructure that can host multiple blueprints simultaneously or sequentially. This is especially valuable when you want to deploy multiple blueprints on the same hardware (e.g., two blueprints each using 2 GPUs on a 4-GPU shape) or need rapid deployment cycles.\n\nThe system supports both selector-based and non-selector deployment strategies. With selectors, you can use naming conventions to ensure specific blueprints land on designated shared node pools, providing precise control over resource allocation. Without selectors, blueprints will deploy to any available shared node pool matching the required shape.\n\nShared node pools are compatible with any blueprint and support all OCI compute shapes, with special considerations for bare metal configurations that require boot volume size specifications.\n\n**Note**: The list of shapes below are supported by Blueprints, but not yet supported by OKE, requiring blueprints to treat them as self-managed nodes. These require:\n\n1. Specifying the Availability Domain of the instance type\n\n2. 
Specifying the custom image OCID to use for the node\n\nAdditional required fields:\n\n```json\n\n\"recipe_availability_domain\": \"\",\n\n\"recipe_node_image_ocid\": \"\"\n\n```\n\nSee [this recipe](./shared_node_pool_B200_BM.json) as an example for these parameters.\n\n[This document section](../using_rdma_enabled_node_pools/README.md#import-a-custom-image) describes how to import a custom image and provides links to import custom images for various shapes.", "pre_filled_samples": [{"pre_filled_sample_name": "Shared Node Pool for BM.GPU.A10", "deployment_name": "BM.GPU.A10.4 shared pool", "recipe_mode": "shared_node_pool", "shared_node_pool_size": 2, "shared_node_pool_shape": "BM.GPU.A10.4", "shared_node_pool_boot_volume_size_in_gbs": 500}, {"pre_filled_sample_name": "Shared Node Pool for VM.GPU.A10", "deployment_name": "VM.GPU.A10.2 shared pool", "recipe_mode": "shared_node_pool", "shared_node_pool_size": 2, "shared_node_pool_shape": "VM.GPU.A10.2", "shared_node_pool_boot_volume_size_in_gbs": 500}, {"pre_filled_sample_name": "vLLM Inference on Shared Pool", "recipe_id": "llm_inference_nvidia", "recipe_mode": "service", "deployment_name": "vLLM Inference Deployment", "recipe_image_uri": "docker.io/vllm/vllm-openai:v0.9.1", "recipe_node_shape": "BM.GPU.A10.4", "input_object_storage": [{"par": "https://objectstorage.us-ashburn-1.oraclecloud.com/p/IFknABDAjiiF5LATogUbRCcVQ9KL6aFUC1j-P5NSeUcaB2lntXLaR935rxa-E-u1/n/iduyx1qnmway/b/corrino_hf_oss_models/o/", "mount_location": "/models", "volume_size_in_gbs": 500, "include": ["NousResearch/Meta-Llama-3.1-8B-Instruct"]}], "recipe_container_env": [{"key": "tensor_parallel_size", "value": "2"}, {"key": "model_name", "value": "NousResearch/Meta-Llama-3.1-8B-Instruct"}, {"key": "Model_Path", "value": "/models/NousResearch/Meta-Llama-3.1-8B-Instruct"}], "recipe_replica_count": 1, "recipe_container_port": "8000", "recipe_nvidia_gpu_count": 2, "recipe_use_shared_node_pool": true, "recipe_node_boot_volume_size_in_gbs": 200, "recipe_container_command_args": ["--model", "$(Model_Path)", "--tensor-parallel-size", "$(tensor_parallel_size)"], "recipe_ephemeral_storage_size": 100, "recipe_shared_memory_volume_size_limit_in_mb": 1000}]}, {"blueprint_type": "oci_blueprint", "blueprint_category": "workload_blueprint", "blueprint_title": "Startup, Liveness, and Readiness Probes", "blueprint_short_description": "Configure application health monitoring and startup validation for reliable service deployment", "blueprint_long_description": "Startup, Liveness, and Readiness Probes are essential Kubernetes tools that ensure your applications are truly ready to serve traffic and remain healthy throughout their lifecycle. These probes are particularly critical for LLM inference services that require time to load model weights before becoming ready to serve requests.\n\nThis blueprint demonstrates how to configure these probes with any OCI AI Blueprint deployment to improve service reliability and prevent traffic routing to unhealthy containers. 
The probes can be applied to any blueprint type - inference, training, or custom workloads - providing consistent health monitoring across your AI infrastructure.", "pre_filled_samples": [{"pre_filled_sample_name": "vLLM Autoscaling with Health Probes", "recipe_id": "llm_inference_nvidia", "recipe_mode": "service", "deployment_name": "autoscale_with_fss", "recipe_image_uri": "docker.io/vllm/vllm-openai:v0.9.1", "recipe_node_shape": "VM.GPU.A10.2", "recipe_container_env": [{"key": "tensor_parallel_size", "value": "1"}, {"key": "Model_Path", "value": "/models/models/meta-llama/Llama-3.2-1B-Instruct"}], "recipe_replica_count": 1, "recipe_container_port": "8000", "recipe_nvidia_gpu_count": 1, "recipe_container_command_args": ["--model", "$(Model_Path)", "--tensor-parallel-size", "$(tensor_parallel_size)", "--gpu-memory-utilization", "0.99", "--max-model-len", "1024"], "recipe_ephemeral_storage_size": 200, "recipe_node_boot_volume_size_in_gbs": 300, "recipe_node_pool_size": 1, "recipe_shared_memory_volume_size_limit_in_mb": 200, "recipe_startup_probe_params": {"failure_threshold": 30, "endpoint_path": "/health", "port": 8000, "scheme": "HTTP", "initial_delay_seconds": 10, "period_seconds": 2, "success_threshold": 1, "timeout_seconds": 1}, "recipe_liveness_probe_params": {"failure_threshold": 3, "endpoint_path": "/health", "port": 8000, "scheme": "HTTP", "initial_delay_seconds": 65, "period_seconds": 600, "success_threshold": 1, "timeout_seconds": 1}, "recipe_pod_autoscaling_params": {"min_replicas": 1, "max_replicas": 4}, "recipe_node_autoscaling_params": {"min_nodes": 1, "max_nodes": 2}, "input_file_system": [{"file_system_ocid": "ocid1.filesystem.oc1.iad.aaaaaaaaaaklirslnfqwillqojxwiotjmfsc2ylefuzqaaaa", "mount_target_ocid": "ocid1.mounttarget.oc1.iad.aaaaacvipp3o7rlwnfqwillqojxwiotjmfsc2ylefuzqaaaa", "mount_location": "/models", "volume_size_in_gbs": 50}]}]}, {"blueprint_type": "oci_blueprint", "blueprint_category": "workload_blueprint", "blueprint_title": "Teams", "blueprint_short_description": "Enforce resource quotas and fair sharing between teams using Kueue job queuing for efficient cluster utilization", "blueprint_long_description": "Teams in OCI AI Blueprints enables administrators to enforce resource quotas and fair sharing between different organizational units, ensuring efficient allocation of GPU and CPU resources across multiple teams within a shared cluster. The system leverages Kueue, a Kubernetes job queuing system, to manage AI/ML workloads with workload queuing, prioritization, and resource-aware scheduling.\n\nEach team functions as a logical grouping backed by a Kueue ClusterQueue and LocalQueue, with configurable nominal quotas (guaranteed resources), borrowing limits (extra resources when available), and lending limits (idle resources offered to other teams). This approach enables fair sharing, dynamic resource allocation, and improved utilization across workloads while maintaining strict resource boundaries.\n\nThe team system supports multi-tenant clusters where business units, research groups, or customers can be isolated while still sharing idle GPU/CPU capacity. Jobs are admitted based on available quotas and resource policies, with priority thresholds determining which teams can exceed their nominal quotas when extra resources are available.\n\nTeams are particularly valuable for capacity planning, expressing organizational-level GPU budgets in code, and tracking consumption across different groups. 
The system automatically handles resource borrowing and lending through a shared cohort, ensuring that resources never sit idle while respecting team boundaries and priorities.", "pre_filled_samples": [{"pre_filled_sample_name": "Create Team with Resource Quotas", "recipe_mode": "team", "deployment_name": "create_team", "team": {"team_name": "randomteam", "priority_threshold": 100, "quotas": [{"shape_name": "BM.GPU.H100.8", "cpu_nominal_quota": "10", "cpu_borrowing_limit": "4", "cpu_lending_limit": "4", "mem_nominal_quota": "10", "mem_borrowing_limit": "4", "mem_lending_limit": "4", "gpu_nominal_quota": "10", "gpu_borrowing_limit": "4", "gpu_lending_limit": "4"}, {"shape_name": "VM.GPU.A10.2", "cpu_nominal_quota": "10", "cpu_borrowing_limit": "4", "cpu_lending_limit": "4", "mem_nominal_quota": "10", "mem_borrowing_limit": "4", "mem_lending_limit": "4", "gpu_nominal_quota": "10", "gpu_borrowing_limit": "4", "gpu_lending_limit": "4"}]}}, {"pre_filled_sample_name": "Create Job with Team Assignment", "recipe_id": "healthcheck", "recipe_mode": "job", "deployment_name": "create_job_with_team", "recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:healthcheck_v0.3", "recipe_node_shape": "VM.GPU.A10.2", "recipe_use_shared_node_pool": true, "recipe_team_info": {"team_name": "randomteam"}, "output_object_storage": [{"bucket_name": "healthcheck2", "mount_location": "/healthcheck_results", "volume_size_in_gbs": 20}], "recipe_container_command_args": ["--dtype", "float16", "--output_dir", "/healthcheck_results", "--expected_gpus", "A10:2,A100:0,H100:0"], "recipe_replica_count": 1, "recipe_nvidia_gpu_count": 2, "recipe_node_pool_size": 1, "recipe_node_boot_volume_size_in_gbs": 200, "recipe_ephemeral_storage_size": 100, "recipe_shared_memory_volume_size_limit_in_mb": 1000, "recipe_container_cpu_count": 4, "recipe_container_memory_size": 20}]}] \ No newline at end of file diff --git a/docs/sample_blueprints/other/whisper_transcription/README.md b/docs/sample_blueprints/other/whisper_transcription/README.md index 99fa310..4ae11d7 100644 --- a/docs/sample_blueprints/other/whisper_transcription/README.md +++ b/docs/sample_blueprints/other/whisper_transcription/README.md @@ -1,42 +1,50 @@ # Whisper Transcription API -### Transcription + Summarization + Diarization Pipeline (FastAPI-powered) +#### Transcription + Summarization + Diarization Pipeline (FastAPI-powered) This blueprint provides a complete solution for running **audio/video transcription**, **speaker diarization**, and **summarization** via a RESTful API. It integrates [Faster-Whisper](https://github.com/guillaumekln/faster-whisper) for efficient transcription, [pyannote.audio](https://github.com/pyannote/pyannote-audio) for diarization, and Hugging Face instruction-tuned LLMs (e.g., Mistral-7B) for summarization. It supports multi-GPU acceleration, real-time streaming logs, and JSON/text output formats. --- + ## Pre-Filled Samples Below are pre-configured blueprints for deploying Whisper transcription using different GPU configurations on Oracle Cloud Infrastructure. 
-| Feature Showcase Title | Description | Blueprint File | -|----------------------------------------------------------------------|-----------------------------------------------------------------------|-----------------------------------| -| Deploy Whisper transcription on A10 GPU for real-time speech-to-text | Real-time audio transcription with Whisper on BM.GPU.A10.8 | [whisper-transcription-A10.json](whisper-transcription-A10.json) | -| Deploy Whisper transcription on A100 GPU for high-speed processing | High-performance Whisper transcription using BM.GPU.A100.8 | [whisper-transcription-A100.json](whisper-transcription-A100.json) | -| Deploy Whisper transcription on H100 GPU for next-gen AI workloads | Ultra-fast Whisper transcription with Whisper on BM.GPU.H100.8 | [whisper-transcription-H100.json](whisper-transcription-H100.json) | +## Pre-Filled Samples + +| Feature Showcase | Title | Description | Blueprint File | +| -------------------------------------------------------------------- | ------------------ | -------------------------------------------------------------- | ------------------------------------------------------------------ | +| Deploy Whisper transcription on A10 GPU for real-time speech-to-text | A10 Transcription | Real-time audio transcription with Whisper on BM.GPU.A10.8 | [whisper-transcription-A10.json](whisper-transcription-A10.json) | +| Deploy Whisper transcription on A100 GPU for high-speed processing | A100 Transcription | High-performance Whisper transcription using BM.GPU.A100.8 | [whisper-transcription-A100.json](whisper-transcription-A100.json) | +| Deploy Whisper transcription on H100 GPU for next-gen AI workloads | H100 Transcription | Ultra-fast Whisper transcription with Whisper on BM.GPU.H100.8 | [whisper-transcription-H100.json](whisper-transcription-H100.json) | -## Key Features +--- + +# In-Depth Feature Overview -| Capability | Description | -|------------------------|-----------------------------------------------------------------------------------------------| -| Transcription | Fast, multi-GPU inference with Faster-Whisper | -| Summarization | Uses Mistral-7B (or other HF models) to create summaries of long transcripts | -| Speaker Diarization | Global speaker labeling via pyannote.audio | -| Denoising | Hybrid removal of background noise using Demucs and noisereduce | -| Real-Time Streaming | Logs stream live via HTTP if enabled | -| Format Compatibility | Supports `.mp3`, `.wav`, `.flac`, `.aac`, `.m4a`, `.mp4`, `.webm`, `.mov`, `.mkv`, `.avi`, etc. | +| Capability | Description | +| -------------------- | ----------------------------------------------------------------------------------------------- | +| Transcription | Fast, multi-GPU inference with Faster-Whisper | +| Summarization | Uses Mistral-7B (or other HF models) to create summaries of long transcripts | +| Speaker Diarization | Global speaker labeling via pyannote.audio | +| Denoising | Hybrid removal of background noise using Demucs and noisereduce | +| Real-Time Streaming | Logs stream live via HTTP if enabled | +| Format Compatibility | Supports `.mp3`, `.wav`, `.flac`, `.aac`, `.m4a`, `.mp4`, `.webm`, `.mov`, `.mkv`, `.avi`, etc. 
| --- ## Deployment on OCI Blueprint ### Sample Recipe (Service Mode) -please look at this json file as an example [whisper-transcription-A10.json](whisper-transcription-A10.json) + +please look at this json file as an example [whisper-transcription-A10.json](whisper-transcription-A10.json) ### Endpoint + ``` POST https://.nip.io/transcribe ``` + **Example:** `https://whisper-transcription-a10-6666.130-162-199-33.nip.io/transcribe` @@ -44,23 +52,24 @@ POST https://.nip.io/transcribe ## API Parameters -| Name | Type | Description | -|-------------------|-----------|-----------------------------------------------------------------------------------------------------------------------| -| `audio_url` | string | URL to audio file in OCI Object Storage (requires PAR) | -| `model` | string | Whisper model to use: `base`, `medium`, `large`, `turbo`, etc. | -| `summary` | bool | Whether to generate a summary (default: false). Requires `hf_token` if model path not provided | -| `speaker` | bool | Whether to run diarization (default: false). Requires `hf_token` | -| `max_speakers` | int | (Optional) Maximum number of speakers expected for diarization | -| `denoise` | bool | Whether to apply noise reduction | -| `streaming` | bool | Enables real-time logs via /stream_log endpoint | -| `hf_token` | string | Hugging Face access token (required for diarization or HF-hosted summarizers) | -| `prop_decrease` | float | (Optional) Controls level of noise suppression. Range: 0.0–1.0 (default: 0.7) | -| `summarized_model`| string | (Optional) Path or HF model ID for summarizer. Default: `mistralai/Mistral-7B-Instruct-v0.1` | -| `ground_truth` | string | (Optional) Path to reference transcript file to compute WER | +| Name | Type | Description | +| ------------------ | ------ | ---------------------------------------------------------------------------------------------- | +| `audio_url` | string | URL to audio file in OCI Object Storage (requires PAR) | +| `model` | string | Whisper model to use: `base`, `medium`, `large`, `turbo`, etc. | +| `summary` | bool | Whether to generate a summary (default: false). Requires `hf_token` if model path not provided | +| `speaker` | bool | Whether to run diarization (default: false). Requires `hf_token` | +| `max_speakers` | int | (Optional) Maximum number of speakers expected for diarization | +| `denoise` | bool | Whether to apply noise reduction | +| `streaming` | bool | Enables real-time logs via /stream_log endpoint | +| `hf_token` | string | Hugging Face access token (required for diarization or HF-hosted summarizers) | +| `prop_decrease` | float | (Optional) Controls level of noise suppression. Range: 0.0–1.0 (default: 0.7) | +| `summarized_model` | string | (Optional) Path or HF model ID for summarizer. 
Default: `mistralai/Mistral-7B-Instruct-v0.1` | +| `ground_truth` | string | (Optional) Path to reference transcript file to compute WER | --- ## Example cURL Command + ```bash curl -k -N -L -X POST https://.nip.io/transcribe \ -F "audio_url=" \ @@ -88,13 +97,16 @@ Each processed audio generates the following: ## Streaming Logs If `streaming=true`, the response will contain a log filename: + ```json { "meta": "logfile_name", "logfile": "transcription_log_remote_audio_.log" } ``` + To stream logs in real-time: + ```bash curl -N https://.nip.io/stream_log/ ``` @@ -113,15 +125,15 @@ https://huggingface.co/settings/tokens ## Dependencies -| Package | Purpose | -|---------------------|----------------------------------| -| `faster-whisper` | Core transcription engine | -| `transformers` | Summarization via Hugging Face | -| `pyannote.audio` | Speaker diarization | -| `pydub`, `librosa` | Audio chunking and processing | -| `demucs` | Vocal separation / denoising | -| `fastapi`, `uvicorn`| REST API server | -| `jiwer` | WER evaluation | +| Package | Purpose | +| -------------------- | ------------------------------ | +| `faster-whisper` | Core transcription engine | +| `transformers` | Summarization via Hugging Face | +| `pyannote.audio` | Speaker diarization | +| `pydub`, `librosa` | Audio chunking and processing | +| `demucs` | Vocal separation / denoising | +| `fastapi`, `uvicorn` | REST API server | +| `jiwer` | WER evaluation | --- From 98da6830567098446960a325561880ca6d4ce7c0 Mon Sep 17 00:00:00 2001 From: grantneumanoracle Date: Fri, 18 Jul 2025 16:46:17 -0700 Subject: [PATCH 03/13] update the blueprint json schema to match the new blueprint categories --- .../blueprint_json_schema.json | 32 +++++++++++++++---- 1 file changed, 26 insertions(+), 6 deletions(-) diff --git a/docs/custom_blueprints/blueprint_json_schema.json b/docs/custom_blueprints/blueprint_json_schema.json index b888a8c..9a38519 100644 --- a/docs/custom_blueprints/blueprint_json_schema.json +++ b/docs/custom_blueprints/blueprint_json_schema.json @@ -599,14 +599,34 @@ "description": "Classifies the blueprint by intent.", "oneOf": [ { - "const": "workload_blueprint", - "title": "Workload blueprint", - "description": "End‑to‑end workloads such as inference, fine‑tuning, benchmarking, or health‑checking that deliver a runnable solution." + "const": "gpu_benchmarking", + "title": "GPU Benchmarking", + "description": "Benchmarks for measuring GPU performance, compute throughput, memory bandwidth, and hardware utilization across different workloads and configurations." }, { - "const": "platform_feature_blueprint", - "title": "Platform‑feature blueprint", - "description": "Demonstrates how to use a specific OCI AI Blueprints capability (autoscaling, shared pools, MIG, etc.) that users can copy into other blueprints." + "const": "gpu_health_check", + "title": "GPU Health Check", + "description": "Diagnostic tools and health monitoring solutions for validating GPU functionality, detecting hardware issues, and ensuring optimal GPU cluster operations." + }, + { + "const": "model_fine_tuning", + "title": "Model Fine-tuning", + "description": "End-to-end solutions for fine-tuning pre-trained machine learning models on custom datasets, including parameter-efficient methods like LoRA and full fine-tuning approaches." 
+ }, + { + "const": "model_serving", + "title": "Model Serving", + "description": "Inference and model serving solutions for deploying trained models as scalable services, including real-time inference, batch processing, and multi-model serving scenarios." + }, + { + "const": "platform_features", + "title": "Platform Features", + "description": "Demonstrations of specific OCI AI Blueprints platform capabilities such as autoscaling, shared node pools, MIG configurations, storage integrations, and networking features." + }, + { + "const": "other", + "title": "Other", + "description": "General-purpose blueprints and specialized use cases that don't fit into the standard categories, including experimental workflows and custom integrations." } ] }, From 079b1ea30267fca1a240a8f0498e7659d8201497 Mon Sep 17 00:00:00 2001 From: ssraghavan-oci Date: Mon, 21 Apr 2025 12:09:10 -0400 Subject: [PATCH 04/13] docs for offline inference --- .../offline-inference-infra/README.md | 120 ++++++++++++++++++ 1 file changed, 120 insertions(+) create mode 100644 docs/sample_blueprints/offline-inference-infra/README.md diff --git a/docs/sample_blueprints/offline-inference-infra/README.md b/docs/sample_blueprints/offline-inference-infra/README.md new file mode 100644 index 0000000..803e977 --- /dev/null +++ b/docs/sample_blueprints/offline-inference-infra/README.md @@ -0,0 +1,120 @@ +Here’s your rewritten `README.md`, styled similarly to the CPU inference blueprint but focused on **offline GPU inference using the SGLang backend**. + +# Offline Inference Blueprint - Infra (SGLang + vLLM) + +This blueprint provides a configurable framework to run **offline LLM inference benchmarks** using either the SGLang or vLLM backends. It is designed for cloud GPU environments and supports automated performance benchmarking with MLflow logging. + +This blueprint enables you to: +- Run inference locally on GPU nodes using pre-loaded models +- Benchmark token throughput, latency, and request performance +- Push results to MLflow for comparison and analysis + +--- + +## Pre-Filled Samples + +| Title | Description | +|------------------------------|-----------------------------------------------------------------------------| +|Offline inference with LLaMA 3|Benchmarks Meta-Llama-3.1-8B model using SGLang on VM.GPU.A10.2 with 2 GPUs. | + +You can access these pre-filled samples from the OCI AI Blueprint portal. + +--- +## When to use Offline inference + +Offline inference is ideal for: +- Accurate performance benchmarking (no API or network bottlenecks) +- Comparing GPU hardware performance (A10, A100, H100, MI300X) +- Evaluating backend frameworks like vLLM and SGLang + +--- + +## Supported Backends + +| Backend | Description | +|----------|--------------------------------------------------------------| +| sglang | Fast multi-modal LLM backend with optimized throughput | +| vllm | Token streaming inference engine for LLMs with speculative decoding | + +--- + +## Running the Benchmark + +This blueprint supports benchmark execution via a job-mode recipe using a YAML config file. The recipe mounts a model and config file from Object Storage, runs offline inference, and logs metrics. 
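Once a run finishes, the metrics it logged can be pulled back out of MLflow for side-by-side comparison. The sketch below is one way to do that over the MLflow REST API from a workstation; it assumes the tracking URI and experiment name used in the sample config further down, and that the MLflow server is reachable from wherever you run it.

```bash
# Minimal sketch: retrieve logged benchmark metrics from MLflow after the job completes.
# The tracking URI and experiment name below are taken from the sample config in this
# blueprint; adjust them to match your own run.
MLFLOW_URI="http://mlflow-benchmarking.corrino-oci.com:5000"
EXPERIMENT_NAME="sglang-bench-doc-test-new"

# Resolve the experiment ID from its name (MLflow REST API 2.0).
EXPERIMENT_ID=$(curl -s "$MLFLOW_URI/api/2.0/mlflow/experiments/get-by-name?experiment_name=$EXPERIMENT_NAME" \
  | python3 -c 'import json,sys; print(json.load(sys.stdin)["experiment"]["experiment_id"])')

# List runs in the experiment; each run carries metrics such as total_tokens_per_second.
curl -s -X POST "$MLFLOW_URI/api/2.0/mlflow/runs/search" \
  -H "Content-Type: application/json" \
  -d "{\"experiment_ids\": [\"$EXPERIMENT_ID\"]}"
```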
+ +--- + +### Sample Recipe (Job Mode for Offline SGLang Inference) + +```json +{ + "recipe_id": "offline_inference_sglang", + "recipe_mode": "job", + "deployment_name": "Offline Inference Benchmark", + "recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:llm-benchmark-0409-v2", + "recipe_node_shape": "VM.GPU.A10.2", + "input_object_storage": [ + { + "par": "https://objectstorage.ap-melbourne-1.oraclecloud.com/p/Z2q73uuLCAxCbGXJ99CIeTxnCTNipsE-1xHE9HYfCz0RBYPTcCbqi9KHViUEH-Wq/n/iduyx1qnmway/b/mymodels/o/", + "mount_location": "/models", + "volume_size_in_gbs": 500, + "include": [ + "example_sglang.yaml", + "NousResearch/Meta-Llama-3.1-8B" + ] + } + ], + "recipe_container_command_args": [ + "/models/example_sglang.yaml" + ], + "recipe_replica_count": 1, + "recipe_container_port": "8000", + "recipe_nvidia_gpu_count": 2, + "recipe_node_pool_size": 1, + "recipe_node_boot_volume_size_in_gbs": 200, + "recipe_ephemeral_storage_size": 100, + "recipe_shared_memory_volume_size_limit_in_mb": 200 +} +``` + +--- + +## Sample Config File (`example_sglang.yaml`) + +```yaml +benchmark_type: offline +offline_backend: sglang + +model_path: /models/NousResearch/Meta-Llama-3.1-8B +tokenizer_path: /models/NousResearch/Meta-Llama-3.1-8B +trust_remote_code: true +conv_template: llama-2 + +input_len: 128 +output_len: 128 +num_prompts: 64 +max_seq_len: 4096 +max_batch_size: 8 +dtype: auto +temperature: 0.7 +top_p: 0.9 + +mlflow_uri: http://mlflow-benchmarking.corrino-oci.com:5000 +experiment_name: "sglang-bench-doc-test-new" +run_name: "llama3-8b-sglang-test" +``` + +--- + +## Metrics Logged + +- `requests_per_second` +- `input_tokens_per_second` +- `output_tokens_per_second` +- `total_tokens_per_second` +- `elapsed_time` +- `total_input_tokens` +- `total_output_tokens` + +If a dataset is provided: +- `accuracy` From 774c9072009d359c6e1b3ab578ea876ea2fa09c7 Mon Sep 17 00:00:00 2001 From: ssraghavan-oci Date: Mon, 21 Apr 2025 12:18:56 -0400 Subject: [PATCH 05/13] removed edit line --- docs/sample_blueprints/offline-inference-infra/README.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/docs/sample_blueprints/offline-inference-infra/README.md b/docs/sample_blueprints/offline-inference-infra/README.md index 803e977..d0bb6d0 100644 --- a/docs/sample_blueprints/offline-inference-infra/README.md +++ b/docs/sample_blueprints/offline-inference-infra/README.md @@ -1,5 +1,3 @@ -Here’s your rewritten `README.md`, styled similarly to the CPU inference blueprint but focused on **offline GPU inference using the SGLang backend**. - # Offline Inference Blueprint - Infra (SGLang + vLLM) This blueprint provides a configurable framework to run **offline LLM inference benchmarks** using either the SGLang or vLLM backends. It is designed for cloud GPU environments and supports automated performance benchmarking with MLflow logging. 
From 76e53e0066a4c017472c4d6eeba97d8f8f993512 Mon Sep 17 00:00:00 2001 From: ssraghavan-oci Date: Mon, 21 Apr 2025 12:22:37 -0400 Subject: [PATCH 06/13] online inference readme --- .../online-inference-infra/README.md | 104 ++++++++++++++++++ 1 file changed, 104 insertions(+) create mode 100644 docs/sample_blueprints/online-inference-infra/README.md diff --git a/docs/sample_blueprints/online-inference-infra/README.md b/docs/sample_blueprints/online-inference-infra/README.md new file mode 100644 index 0000000..8b1f4bf --- /dev/null +++ b/docs/sample_blueprints/online-inference-infra/README.md @@ -0,0 +1,104 @@ +# Online Inference Blueprint (LLMPerf) + +This blueprint benchmarks **online inference performance** of large language models using **LLMPerf**, a standardized benchmarking tool. It is designed to evaluate LLM APIs served via platforms such as OpenAI-compatible interfaces, including self-hosted LLM inference endpoints. + +This blueprint helps: +- Simulate real-time request load on a running model server +- Measure end-to-end latency, throughput, and completion performance +- Push results to MLflow for visibility and tracking + +--- + +## Pre-Filled Samples + +| Title | Description | +|----------------------------------------|-----------------------------------------------------------------------------| +|Online inference on LLaMA 3 using LLMPerf|Benchmark of meta/llama3-8b-instruct via a local OpenAI-compatible endpoint | + +These can be accessed directly from the OCI AI Blueprint portal. + +--- + +## Prerequisites + +Before running this blueprint: +- You **must have an inference server already running**, compatible with the OpenAI API format. +- Ensure the endpoint and model name match what’s defined in the config. + +--- + +## Supported Scenarios + +| Use Case | Description | +|-----------------------|-------------------------------------------------------| +| Local LLM APIs | Benchmark your own self-hosted models (e.g., vLLM) | +| Remote OpenAI API | Benchmark OpenAI deployments for throughput analysis | +| Multi-model endpoints | Test latency/throughput across different configurations | + +--- + +### Sample Recipe (Job Mode for Online Benchmarking) + +```json +{ + "recipe_id": "online_inference_benchmark", + "recipe_mode": "job", + "deployment_name": "Online Inference Benchmark", + "recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:llm-benchmark-0409-v2", + "recipe_node_shape": "VM.GPU.A10.2", + "input_object_storage": [ + { + "par": "https://objectstorage.ap-melbourne-1.oraclecloud.com/p/Z2q73uuLCAxCbGXJ99CIeTxnCTNipsE-1xHE9HYfCz0RBYPTcCbqi9KHViUEH-Wq/n/iduyx1qnmway/b/mymodels/o/", + "mount_location": "/models", + "volume_size_in_gbs": 100, + "include": [ + "example_online.yaml" + ] + } + ], + "recipe_container_command_args": [ + "/models/example_online.yaml" + ], + "recipe_replica_count": 1, + "recipe_container_port": "8000", + "recipe_node_pool_size": 1, + "recipe_node_boot_volume_size_in_gbs": 200, + "recipe_ephemeral_storage_size": 100 +} +``` + +--- + +## Sample Config File (`example_online.yaml`) + +```yaml +benchmark_type: online + +model: meta/llama3-8b-instruct +input_len: 64 +output_len: 32 +max_requests: 5 +timeout: 300 +num_concurrent: 1 +results_dir: /workspace/results_on +llm_api: openai +llm_api_key: dummy-key +llm_api_base: http://localhost:8001/v1 + +experiment_name: local-bench +run_name: llama3-test +mlflow_uri: http://mlflow-benchmarking.corrino-oci.com:5000 +llmperf_path: /opt/llmperf-src +metadata: test=localhost +``` + +--- + +## 
Metrics Logged + +- `output_tokens_per_second` +- `requests_per_minute` +- `overall_output_throughput` +- All raw metrics from the `_summary.json` output of LLMPerf + +--- From bad0fde07086d2fd8518ee9bddecab434c80294b Mon Sep 17 00:00:00 2001 From: ssraghavan-oci Date: Mon, 21 Apr 2025 23:16:40 -0400 Subject: [PATCH 07/13] better readme with extra pre-filled samples for offline inference --- .../offline-inference-infra/README.md | 192 +++++++++++++++--- 1 file changed, 165 insertions(+), 27 deletions(-) diff --git a/docs/sample_blueprints/offline-inference-infra/README.md b/docs/sample_blueprints/offline-inference-infra/README.md index d0bb6d0..f67861b 100644 --- a/docs/sample_blueprints/offline-inference-infra/README.md +++ b/docs/sample_blueprints/offline-inference-infra/README.md @@ -14,6 +14,7 @@ This blueprint enables you to: | Title | Description | |------------------------------|-----------------------------------------------------------------------------| |Offline inference with LLaMA 3|Benchmarks Meta-Llama-3.1-8B model using SGLang on VM.GPU.A10.2 with 2 GPUs. | +|Offline inference with LLAMA 3- vLLM| Benchmarks Meta-Llama-3.1-8B model using vLLM on VM.GPU.A10.2 with 2 GPUs.| You can access these pre-filled samples from the OCI AI Blueprint portal. @@ -46,33 +47,41 @@ This blueprint supports benchmark execution via a job-mode recipe using a YAML c ```json { - "recipe_id": "offline_inference_sglang", - "recipe_mode": "job", - "deployment_name": "Offline Inference Benchmark", - "recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:llm-benchmark-0409-v2", - "recipe_node_shape": "VM.GPU.A10.2", - "input_object_storage": [ - { - "par": "https://objectstorage.ap-melbourne-1.oraclecloud.com/p/Z2q73uuLCAxCbGXJ99CIeTxnCTNipsE-1xHE9HYfCz0RBYPTcCbqi9KHViUEH-Wq/n/iduyx1qnmway/b/mymodels/o/", - "mount_location": "/models", - "volume_size_in_gbs": 500, - "include": [ - "example_sglang.yaml", - "NousResearch/Meta-Llama-3.1-8B" - ] - } - ], - "recipe_container_command_args": [ - "/models/example_sglang.yaml" - ], - "recipe_replica_count": 1, - "recipe_container_port": "8000", - "recipe_nvidia_gpu_count": 2, - "recipe_node_pool_size": 1, - "recipe_node_boot_volume_size_in_gbs": 200, - "recipe_ephemeral_storage_size": 100, - "recipe_shared_memory_volume_size_limit_in_mb": 200 -} + "recipe_id": "offline_inference_sglang", + "recipe_mode": "job", + "deployment_name": "Offline Inference Benchmark", + "recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:llm-benchmark-0409-v4", + "recipe_node_shape": "VM.GPU.A10.2", + "input_object_storage": [ + { + "par": "https://objectstorage.ap-melbourne-1.oraclecloud.com/p/0T99iRADcM08aVpumM6smqMIcnIJTFtV2D8ZIIWidUP9eL8GSRyDMxOb9Va9rmRc/n/iduyx1qnmway/b/mymodels/o/", + "mount_location": "/models", + "volume_size_in_gbs": 500, + "include": [ + "new_example_sglang.yaml", + "NousResearch/Meta-Llama-3.1-8B" + ] + } + ], + "output_object_storage": [ + { + "bucket_name": "inference_output", + "mount_location": "/mlcommons_output", + "volume_size_in_gbs": 200 + } + ], + "recipe_container_command_args": [ + "/models/new_example_sglang.yaml" + ], + "recipe_replica_count": 1, + "recipe_container_port": "8000", + "recipe_nvidia_gpu_count": 2, + "recipe_node_pool_size": 1, + "recipe_node_boot_volume_size_in_gbs": 200, + "recipe_ephemeral_storage_size": 100, + "recipe_shared_memory_volume_size_limit_in_mb": 200 + } + ``` --- @@ -100,6 +109,43 @@ top_p: 0.9 mlflow_uri: http://mlflow-benchmarking.corrino-oci.com:5000 experiment_name: 
"sglang-bench-doc-test-new" run_name: "llama3-8b-sglang-test" + + +save_metrics_path: /mlcommons_output/benchmark_output_llama3_sglang.json + +``` + +```yaml +benchmark_type: offline +model: /models/NousResearch/Meta-Llama-3.1-8B +tokenizer: /models/NousResearch/Meta-Llama-3.1-8B + +input_len: 12 +output_len: 12 +num_prompts: 2 +seed: 42 +tensor_parallel_size: 8 + +# vLLM-specific +#quantization: awq +dtype: half +gpu_memory_utilization: 0.99 +num_scheduler_steps: 10 +device: cuda +enforce_eager: true +kv_cache_dtype: auto +enable_prefix_caching: true +distributed_executor_backend: mp + +# Output +#output_json: ./128_128.json + +# MLflow +mlflow_uri: http://mlflow-benchmarking.corrino-oci.com:5000 +experiment_name: test-bm-suite-doc +run_name: llama3-vllm-test +save_metrics_path: /mlcommons_output/benchmark_output_llama3_vllm.json + ``` --- @@ -116,3 +162,95 @@ run_name: "llama3-8b-sglang-test" If a dataset is provided: - `accuracy` + + +### Top-level Deployment Keys + +| Key | Description | +|-----|-------------| +| `recipe_id` | Identifier of the recipe to run; here, it's an offline SGLang benchmark job. | +| `recipe_mode` | Specifies this is a `job`, meaning it runs to completion and exits. | +| `deployment_name` | Human-readable name for the job. | +| `recipe_image_uri` | Docker image containing the benchmark code and dependencies. | +| `recipe_node_shape` | Shape of the VM or GPU node to run the job (e.g., VM.GPU.A10.2). | + +### Input Object Storage + +| Key | Description | +|-----|-------------| +| `input_object_storage` | List of inputs to mount from Object Storage. | +| `par` | Pre-Authenticated Request (PAR) link to a bucket/folder. | +| `mount_location` | Files are mounted to this path inside the container. | +| `volume_size_in_gbs` | Size of the mount volume. | +| `include` | Only these files/folders from the bucket are mounted (e.g., model + config). | + +### Output Object Storage + +| Key | Description | +|-----|-------------| +| `output_object_storage` | Where to store outputs like benchmark logs or results. | +| `bucket_name` | Name of the output bucket in OCI Object Storage. | +| `mount_location` | Mount point inside container where outputs are written. | +| `volume_size_in_gbs` | Size of this volume in GBs. | + +### Runtime & Infra Settings + +| Key | Description | +|-----|-------------| +| `recipe_container_command_args` | Path to the YAML config that defines benchmark parameters. | +| `recipe_replica_count` | Number of job replicas to run (usually 1 for inference). | +| `recipe_container_port` | Port (optional for offline mode; required if API is exposed). | +| `recipe_nvidia_gpu_count` | Number of GPUs allocated to this job. | +| `recipe_node_pool_size` | Number of nodes in the pool (1 means 1 VM). | +| `recipe_node_boot_volume_size_in_gbs` | Disk size for OS + dependencies. | +| `recipe_ephemeral_storage_size` | Local scratch space in GBs. | +| `recipe_shared_memory_volume_size_limit_in_mb` | Shared memory (used by some inference engines). | + +--- + +## **Sample Config File (`example_sglang.yaml`)** + +This file is consumed by the container during execution to configure the benchmark run. + +### Inference Setup + +| Key | Description | +|-----|-------------| +| `benchmark_type` | Set to `offline` to indicate local execution with no HTTP server. | +| `offline_backend` | Backend engine to use (`sglang` or `vllm`). | +| `model_path` | Path to the model directory (already mounted via Object Storage). 
| +| `tokenizer_path` | Path to the tokenizer (usually same as model path). | +| `trust_remote_code` | Enables loading models that require custom code (Hugging Face). | +| `conv_template` | Prompt formatting template to use (e.g., `llama-2`). | + +### Benchmark Parameters + +| Key | Description | +|-----|-------------| +| `input_len` | Number of tokens in the input prompt. | +| `output_len` | Number of tokens to generate. | +| `num_prompts` | Number of total prompts to run (e.g., 64 prompts x 128 output tokens). | +| `max_seq_len` | Max sequence length supported by the model (e.g., 4096). | +| `max_batch_size` | Max batch size per inference run (depends on GPU memory). | +| `dtype` | Precision (e.g., float16, bfloat16, auto). | + +### Sampling Settings + +| Key | Description | +|-----|-------------| +| `temperature` | Controls randomness in generation (lower = more deterministic). | +| `top_p` | Top-p sampling for diversity (0.9 keeps most probable tokens). | + +### MLflow Logging + +| Key | Description | +|-----|-------------| +| `mlflow_uri` | MLflow server to log performance metrics. | +| `experiment_name` | Experiment name to group runs in MLflow UI. | +| `run_name` | Custom name to identify this particular run. | + +### Output + +| Key | Description | +|-----|-------------| +| `save_metrics_path` | Path inside the container where metrics will be saved as JSON. | From eb05014f3c38387a9cf0635d6cb78068db6cf01a Mon Sep 17 00:00:00 2001 From: ssraghavan-oci Date: Tue, 22 Apr 2025 12:14:21 -0400 Subject: [PATCH 08/13] added sample json files --- .../offline-inference-infra/README.md | 1 + .../new_example_sglang.yaml | 24 +++++++++++++++ .../offline_vllm_example.yaml | 29 +++++++++++++++++++ .../online_example.yaml | 16 ++++++++++ 4 files changed, 70 insertions(+) create mode 100644 docs/sample_blueprints/offline-inference-infra/new_example_sglang.yaml create mode 100644 docs/sample_blueprints/offline-inference-infra/offline_vllm_example.yaml create mode 100644 docs/sample_blueprints/online-inference-infra/online_example.yaml diff --git a/docs/sample_blueprints/offline-inference-infra/README.md b/docs/sample_blueprints/offline-inference-infra/README.md index f67861b..256f63d 100644 --- a/docs/sample_blueprints/offline-inference-infra/README.md +++ b/docs/sample_blueprints/offline-inference-infra/README.md @@ -41,6 +41,7 @@ Offline inference is ideal for: This blueprint supports benchmark execution via a job-mode recipe using a YAML config file. The recipe mounts a model and config file from Object Storage, runs offline inference, and logs metrics. +Notes : Make sure your output object storage is in the same tenancy as your stack. 
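Because the output bucket is mounted directly rather than through a PAR, it has to exist in the same tenancy as the stack before the job is submitted. A minimal sketch with the OCI CLI, assuming the bucket name used by the sample recipes and a placeholder compartment OCID:

```bash
# Minimal sketch: create the output bucket referenced by output_object_storage in the
# sample recipe. <compartment-ocid> is a placeholder -- use the compartment that hosts
# your OCI AI Blueprints stack, in the same tenancy and region.
oci os bucket create \
  --compartment-id <compartment-ocid> \
  --name inference_output
```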
--- ### Sample Recipe (Job Mode for Offline SGLang Inference) diff --git a/docs/sample_blueprints/offline-inference-infra/new_example_sglang.yaml b/docs/sample_blueprints/offline-inference-infra/new_example_sglang.yaml new file mode 100644 index 0000000..1649e7a --- /dev/null +++ b/docs/sample_blueprints/offline-inference-infra/new_example_sglang.yaml @@ -0,0 +1,24 @@ +benchmark_type: offline +offline_backend: sglang + +model_path: /models/NousResearch/Meta-Llama-3.1-8B +tokenizer_path: /models/NousResearch/Meta-Llama-3.1-8B +trust_remote_code: true +conv_template: llama-2 + +input_len: 128 +output_len: 128 +num_prompts: 64 +max_seq_len: 4096 +max_batch_size: 8 +dtype: auto +temperature: 0.7 +top_p: 0.9 + +mlflow_uri: http://mlflow-benchmarking.corrino-oci.com:5000 +experiment_name: "sglang-bench-doc-test-new" +run_name: "llama3-8b-sglang-test" + + +save_metrics_path: /mlcommons_output/benchmark_output_llama3_sglang.json + diff --git a/docs/sample_blueprints/offline-inference-infra/offline_vllm_example.yaml b/docs/sample_blueprints/offline-inference-infra/offline_vllm_example.yaml new file mode 100644 index 0000000..7734c14 --- /dev/null +++ b/docs/sample_blueprints/offline-inference-infra/offline_vllm_example.yaml @@ -0,0 +1,29 @@ +benchmark_type: offline +model: /models/NousResearch/Meta-Llama-3.1-8B +tokenizer: /models/NousResearch/Meta-Llama-3.1-8B + +input_len: 12 +output_len: 12 +num_prompts: 2 +seed: 42 +tensor_parallel_size: 8 + +# vLLM-specific +#quantization: awq +dtype: half +gpu_memory_utilization: 0.99 +num_scheduler_steps: 10 +device: cuda +enforce_eager: true +kv_cache_dtype: auto +enable_prefix_caching: true +distributed_executor_backend: mp + +# Output +#output_json: ./128_128.json + +# MLflow +mlflow_uri: http://mlflow-benchmarking.corrino-oci.com:5000 +experiment_name: test-bm-suite-doc +run_name: llama3-vllm-test +save_metrics_path: /mlcommons_output/benchmark_output_llama3_vllm.json diff --git a/docs/sample_blueprints/online-inference-infra/online_example.yaml b/docs/sample_blueprints/online-inference-infra/online_example.yaml new file mode 100644 index 0000000..d4d0fe3 --- /dev/null +++ b/docs/sample_blueprints/online-inference-infra/online_example.yaml @@ -0,0 +1,16 @@ +benchmark_type: online +model: meta/llama3-8b-instruct +input_len: 64 +output_len: 32 +max_requests: 5 +timeout: 300 +num_concurrent: 1 +results_dir: /workspace/results_on +llm_api: openai +llm_api_key: dummy-key +llm_api_base: http://localhost:8001/v1 +experiment_name: local-bench +run_name: llama3-test +mlflow_uri: http://mlflow-benchmarking.corrino-oci.com:5000 +llmperf_path: /opt/llmperf-src +metadata: test=localhost From 001875bef9bcbba564fb6c96513a1436fb858d78 Mon Sep 17 00:00:00 2001 From: ssraghavan-oci Date: Tue, 22 Apr 2025 12:18:56 -0400 Subject: [PATCH 09/13] added deployment json files --- .../offline_deployment_sglang.json | 36 +++++++++++++++++++ .../offline_deployment_vllm.json | 36 +++++++++++++++++++ 2 files changed, 72 insertions(+) create mode 100644 docs/sample_blueprints/offline-inference-infra/offline_deployment_sglang.json create mode 100644 docs/sample_blueprints/offline-inference-infra/offline_deployment_vllm.json diff --git a/docs/sample_blueprints/offline-inference-infra/offline_deployment_sglang.json b/docs/sample_blueprints/offline-inference-infra/offline_deployment_sglang.json new file mode 100644 index 0000000..e3b988a --- /dev/null +++ b/docs/sample_blueprints/offline-inference-infra/offline_deployment_sglang.json @@ -0,0 +1,36 @@ +{ + "recipe_id": 
"offline_inference_sglang", + "recipe_mode": "job", + "deployment_name": "Offline Inference Benchmark", + "recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:llm-benchmark-0409-v4", + "recipe_node_shape": "VM.GPU.A10.2", + "input_object_storage": [ + { + "par": "https://objectstorage.ap-melbourne-1.oraclecloud.com/p/0T99iRADcM08aVpumM6smqMIcnIJTFtV2D8ZIIWidUP9eL8GSRyDMxOb9Va9rmRc/n/iduyx1qnmway/b/mymodels/o/", + "mount_location": "/models", + "volume_size_in_gbs": 500, + "include": [ + "new_example_sglang.yaml", + "NousResearch/Meta-Llama-3.1-8B" + ] + } + ], + "output_object_storage": [ + { + "bucket_name": "inference_output", + "mount_location": "/mlcommons_output", + "volume_size_in_gbs": 200 + } + ], + "recipe_container_command_args": [ + "/models/new_example_sglang.yaml" + ], + "recipe_replica_count": 1, + "recipe_container_port": "8000", + "recipe_nvidia_gpu_count": 2, + "recipe_node_pool_size": 1, + "recipe_node_boot_volume_size_in_gbs": 200, + "recipe_ephemeral_storage_size": 100, + "recipe_shared_memory_volume_size_limit_in_mb": 200 + } + \ No newline at end of file diff --git a/docs/sample_blueprints/offline-inference-infra/offline_deployment_vllm.json b/docs/sample_blueprints/offline-inference-infra/offline_deployment_vllm.json new file mode 100644 index 0000000..e920f38 --- /dev/null +++ b/docs/sample_blueprints/offline-inference-infra/offline_deployment_vllm.json @@ -0,0 +1,36 @@ +{ + "recipe_id": "offline_inference_vllm", + "recipe_mode": "job", + "deployment_name": "Offline Inference Benchmark vllm", + "recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:llm-benchmark-0409-v4", + "recipe_node_shape": "VM.GPU.A10.2", + "input_object_storage": [ + { + "par": "https://objectstorage.ap-melbourne-1.oraclecloud.com/p/0T99iRADcM08aVpumM6smqMIcnIJTFtV2D8ZIIWidUP9eL8GSRyDMxOb9Va9rmRc/n/iduyx1qnmway/b/mymodels/o/", + "mount_location": "/models", + "volume_size_in_gbs": 500, + "include": [ + "new_example_sglang.yaml", + "NousResearch/Meta-Llama-3.1-8B" + ] + } + ], + "output_object_storage": [ + { + "bucket_name": "inference_output", + "mount_location": "/mlcommons_output", + "volume_size_in_gbs": 200 + } + ], + "recipe_container_command_args": [ + "/models/offline_vllm_example.yaml" + ], + "recipe_replica_count": 1, + "recipe_container_port": "8000", + "recipe_nvidia_gpu_count": 2, + "recipe_node_pool_size": 1, + "recipe_node_boot_volume_size_in_gbs": 200, + "recipe_ephemeral_storage_size": 100, + "recipe_shared_memory_volume_size_limit_in_mb": 200 + } + \ No newline at end of file From e73cbdfc87c7f520654cb33a6552591bd4b4ae55 Mon Sep 17 00:00:00 2001 From: ssraghavan-oci Date: Tue, 22 Apr 2025 22:57:36 -0400 Subject: [PATCH 10/13] addressed PR comments --- .../offline-inference-infra/README.md | 55 ++++++++++++++++++- 1 file changed, 52 insertions(+), 3 deletions(-) diff --git a/docs/sample_blueprints/offline-inference-infra/README.md b/docs/sample_blueprints/offline-inference-infra/README.md index 256f63d..bb3b882 100644 --- a/docs/sample_blueprints/offline-inference-infra/README.md +++ b/docs/sample_blueprints/offline-inference-infra/README.md @@ -38,13 +38,20 @@ Offline inference is ideal for: --- ## Running the Benchmark +- Things need to run the benchmark + - Model checkpoints pre-downloaded and stored in an object storage. + - Make sure to get a PAR for the object storage where the models are saved. With listing, write and read perimissions + - A Bucket to save the outputs. 
This does not take a PAR, so should be a bucket in the same tenancy as to where you have your OCI blueprints stack + - Config `.yaml` file that has all the parameters required to run the benhcmark. This includes input_len, output_len, gpu_utilization value etc. + - Deployment `.json` to deploy your blueprint. + - Sample deployment and config files are provided below along with links. This blueprint supports benchmark execution via a job-mode recipe using a YAML config file. The recipe mounts a model and config file from Object Storage, runs offline inference, and logs metrics. Notes : Make sure your output object storage is in the same tenancy as your stack. --- -### Sample Recipe (Job Mode for Offline SGLang Inference) +### [Sample Blueprint (Job Mode for Offline SGLang Inference)](dhttps://github.com/oracle-quickstart/oci-ai-blueprints/blob/offline-inference-benchmark/docs/sample_blueprints/offline-inference-infra/offline_deployment_sglang.json) ```json { @@ -86,8 +93,50 @@ Notes : Make sure your output object storage is in the same tenancy as your stac ``` --- +### [Sample Blueprint (Job Mode for Offline vLLM Inference)](dhttps://github.com/oracle-quickstart/oci-ai-blueprints/blob/offline-inference-benchmark/docs/sample_blueprints/offline-inference-infra/offline_deployment_sglang.json) -## Sample Config File (`example_sglang.yaml`) +```json +{ + "recipe_id": "offline_inference_vllm", + "recipe_mode": "job", + "deployment_name": "Offline Inference Benchmark vllm", + "recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:llm-benchmark-0409-v4", + "recipe_node_shape": "VM.GPU.A10.2", + "input_object_storage": [ + { + "par": "https://objectstorage.ap-melbourne-1.oraclecloud.com/p/0T99iRADcM08aVpumM6smqMIcnIJTFtV2D8ZIIWidUP9eL8GSRyDMxOb9Va9rmRc/n/iduyx1qnmway/b/mymodels/o/", + "mount_location": "/models", + "volume_size_in_gbs": 500, + "include": [ + "offline_vllm_example.yaml", + "NousResearch/Meta-Llama-3.1-8B" + ] + } + ], + "output_object_storage": [ + { + "bucket_name": "inference_output", + "mount_location": "/mlcommons_output", + "volume_size_in_gbs": 200 + } + ], + "recipe_container_command_args": [ + "/models/offline_vllm_example.yaml" + ], + "recipe_replica_count": 1, + "recipe_container_port": "8000", + "recipe_nvidia_gpu_count": 2, + "recipe_node_pool_size": 1, + "recipe_node_boot_volume_size_in_gbs": 200, + "recipe_ephemeral_storage_size": 100, + "recipe_shared_memory_volume_size_limit_in_mb": 200 + } + +``` + +--- + +## [Sample Config File SGlang - 1 (`new_example_sglang.yaml`)](https://github.com/oracle-quickstart/oci-ai-blueprints/blob/offline-inference-benchmark/docs/sample_blueprints/offline-inference-infra/new_example_sglang.yaml) ```yaml benchmark_type: offline @@ -115,7 +164,7 @@ run_name: "llama3-8b-sglang-test" save_metrics_path: /mlcommons_output/benchmark_output_llama3_sglang.json ``` - +## [Sample Config File - 2 vLLM (`offline_vllm_example.yaml`)](https://github.com/oracle-quickstart/oci-ai-blueprints/blob/offline-inference-benchmark/docs/sample_blueprints/offline-inference-infra/offline_vllm_example.yaml) ```yaml benchmark_type: offline model: /models/NousResearch/Meta-Llama-3.1-8B From 41ea8dd4d49d2ca3db12ff07cba04f899264c230 Mon Sep 17 00:00:00 2001 From: ssraghavan-oci Date: Thu, 24 Apr 2025 10:05:14 -0400 Subject: [PATCH 11/13] changed file names to indiciate the workload, addressed comments on the PR for offline inference --- .../offline-inference-infra/README.md | 14 ++++---- ...glang.yaml => offline_sglang_example.yaml} | 2 +- 
.../llama3_public_online.yaml | 17 +++++++++ .../online_deployment.json | 35 +++++++++++++++++++ .../online_example.yaml | 16 --------- 5 files changed, 60 insertions(+), 24 deletions(-) rename docs/sample_blueprints/offline-inference-infra/{new_example_sglang.yaml => offline_sglang_example.yaml} (86%) create mode 100644 docs/sample_blueprints/online-inference-infra/llama3_public_online.yaml create mode 100644 docs/sample_blueprints/online-inference-infra/online_deployment.json delete mode 100644 docs/sample_blueprints/online-inference-infra/online_example.yaml diff --git a/docs/sample_blueprints/offline-inference-infra/README.md b/docs/sample_blueprints/offline-inference-infra/README.md index bb3b882..b7b91b4 100644 --- a/docs/sample_blueprints/offline-inference-infra/README.md +++ b/docs/sample_blueprints/offline-inference-infra/README.md @@ -66,7 +66,7 @@ Notes : Make sure your output object storage is in the same tenancy as your stac "mount_location": "/models", "volume_size_in_gbs": 500, "include": [ - "new_example_sglang.yaml", + "offline_sglang_example.yaml", "NousResearch/Meta-Llama-3.1-8B" ] } @@ -74,12 +74,12 @@ Notes : Make sure your output object storage is in the same tenancy as your stac "output_object_storage": [ { "bucket_name": "inference_output", - "mount_location": "/mlcommons_output", + "mount_location": "/benchmarking_output", "volume_size_in_gbs": 200 } ], "recipe_container_command_args": [ - "/models/new_example_sglang.yaml" + "/models/offline_sglang_example.yaml" ], "recipe_replica_count": 1, "recipe_container_port": "8000", @@ -93,7 +93,7 @@ Notes : Make sure your output object storage is in the same tenancy as your stac ``` --- -### [Sample Blueprint (Job Mode for Offline vLLM Inference)](dhttps://github.com/oracle-quickstart/oci-ai-blueprints/blob/offline-inference-benchmark/docs/sample_blueprints/offline-inference-infra/offline_deployment_sglang.json) +### [Sample Blueprint (Job Mode for Offline vLLM Inference)](dhttps://github.com/oracle-quickstart/oci-ai-blueprints/blob/offline-inference-benchmark/docs/sample_blueprints/offline-inference-infra/offline_deployment_vllm.json) ```json { @@ -116,7 +116,7 @@ Notes : Make sure your output object storage is in the same tenancy as your stac "output_object_storage": [ { "bucket_name": "inference_output", - "mount_location": "/mlcommons_output", + "mount_location": "/benchmarking_output", "volume_size_in_gbs": 200 } ], @@ -161,7 +161,7 @@ experiment_name: "sglang-bench-doc-test-new" run_name: "llama3-8b-sglang-test" -save_metrics_path: /mlcommons_output/benchmark_output_llama3_sglang.json +save_metrics_path: /benchmarking_output/benchmark_output_llama3_sglang.json ``` ## [Sample Config File - 2 vLLM (`offline_vllm_example.yaml`)](https://github.com/oracle-quickstart/oci-ai-blueprints/blob/offline-inference-benchmark/docs/sample_blueprints/offline-inference-infra/offline_vllm_example.yaml) @@ -194,7 +194,7 @@ distributed_executor_backend: mp mlflow_uri: http://mlflow-benchmarking.corrino-oci.com:5000 experiment_name: test-bm-suite-doc run_name: llama3-vllm-test -save_metrics_path: /mlcommons_output/benchmark_output_llama3_vllm.json +save_metrics_path: /benchmarking_output/benchmark_output_llama3_vllm.json ``` diff --git a/docs/sample_blueprints/offline-inference-infra/new_example_sglang.yaml b/docs/sample_blueprints/offline-inference-infra/offline_sglang_example.yaml similarity index 86% rename from docs/sample_blueprints/offline-inference-infra/new_example_sglang.yaml rename to 
docs/sample_blueprints/offline-inference-infra/offline_sglang_example.yaml index 1649e7a..a1ccf27 100644 --- a/docs/sample_blueprints/offline-inference-infra/new_example_sglang.yaml +++ b/docs/sample_blueprints/offline-inference-infra/offline_sglang_example.yaml @@ -20,5 +20,5 @@ experiment_name: "sglang-bench-doc-test-new" run_name: "llama3-8b-sglang-test" -save_metrics_path: /mlcommons_output/benchmark_output_llama3_sglang.json +save_metrics_path: /benchmarking_output/benchmark_output_llama3_sglang.json diff --git a/docs/sample_blueprints/online-inference-infra/llama3_public_online.yaml b/docs/sample_blueprints/online-inference-infra/llama3_public_online.yaml new file mode 100644 index 0000000..967b5c8 --- /dev/null +++ b/docs/sample_blueprints/online-inference-infra/llama3_public_online.yaml @@ -0,0 +1,17 @@ +benchmark_type: online +model: /models/NousResearch/Meta-Llama-3.1-8B-Instruct # Updated model path +input_len: 64 +output_len: 32 +max_requests: 5 +timeout: 300 +num_concurrent: 1 +results_dir: /online_output +llm_api: openai +llm_api_key: dummy-key +llm_api_base: https://llama8bobjvllm.129-80-16-111.nip.io/v1 # Updated to HTTPS +experiment_name: local-bench +run_name: llama3-test +mlflow_uri: http://mlflow-benchmarking.corrino-oci.com:5000 +llmperf_path: /opt/llmperf-src +metadata: test=public-endpoint +save_metrics_path: /online_output/benchmark_output_llama3_online_public.json \ No newline at end of file diff --git a/docs/sample_blueprints/online-inference-infra/online_deployment.json b/docs/sample_blueprints/online-inference-infra/online_deployment.json new file mode 100644 index 0000000..daeca81 --- /dev/null +++ b/docs/sample_blueprints/online-inference-infra/online_deployment.json @@ -0,0 +1,35 @@ +{ + "recipe_id": "online_infernece_llmperf", + "recipe_mode": "job", + "deployment_name": "a1", + "recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:llm-benchmark-0409-v4", + "recipe_node_shape": "VM.Standard.E4.Flex", + "recipe_node_pool_size": 1, + "recipe_flex_shape_ocpu_count": 32, + "recipe_flex_shape_memory_size_in_gbs": 256, + "recipe_node_boot_volume_size_in_gbs": 200, + "recipe_ephemeral_storage_size": 150, + "input_object_storage": [ + { + "par": "https://objectstorage.ap-melbourne-1.oraclecloud.com/p/0T99iRADcM08aVpumM6smqMIcnIJTFtV2D8ZIIWidUP9eL8GSRyDMxOb9Va9rmRc/n/iduyx1qnmway/b/mymodels/o/", + "mount_location": "/models", + "volume_size_in_gbs": 500, + "include": [ + "llama3_public_online.yaml" + ] + } + ], + "output_object_storage": [ + { + "bucket_name": "inference_output", + "mount_location": "/online_output", + "volume_size_in_gbs": 200 + } + ], + "recipe_container_command_args": [ + "/models/llama3_public_online.yaml" + ], + "recipe_replica_count": 1, + "recipe_container_port": "5678" + } + \ No newline at end of file diff --git a/docs/sample_blueprints/online-inference-infra/online_example.yaml b/docs/sample_blueprints/online-inference-infra/online_example.yaml deleted file mode 100644 index d4d0fe3..0000000 --- a/docs/sample_blueprints/online-inference-infra/online_example.yaml +++ /dev/null @@ -1,16 +0,0 @@ -benchmark_type: online -model: meta/llama3-8b-instruct -input_len: 64 -output_len: 32 -max_requests: 5 -timeout: 300 -num_concurrent: 1 -results_dir: /workspace/results_on -llm_api: openai -llm_api_key: dummy-key -llm_api_base: http://localhost:8001/v1 -experiment_name: local-bench -run_name: llama3-test -mlflow_uri: http://mlflow-benchmarking.corrino-oci.com:5000 -llmperf_path: /opt/llmperf-src -metadata: test=localhost From 
a7fe7fdca317fb0b82c153a6532053e157348284 Mon Sep 17 00:00:00 2001 From: ssraghavan-oci Date: Thu, 24 Apr 2025 10:18:58 -0400 Subject: [PATCH 12/13] minor edit - offline readme --- docs/sample_blueprints/offline-inference-infra/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/sample_blueprints/offline-inference-infra/README.md b/docs/sample_blueprints/offline-inference-infra/README.md index b7b91b4..a45c426 100644 --- a/docs/sample_blueprints/offline-inference-infra/README.md +++ b/docs/sample_blueprints/offline-inference-infra/README.md @@ -136,7 +136,7 @@ Notes : Make sure your output object storage is in the same tenancy as your stac --- -## [Sample Config File SGlang - 1 (`new_example_sglang.yaml`)](https://github.com/oracle-quickstart/oci-ai-blueprints/blob/offline-inference-benchmark/docs/sample_blueprints/offline-inference-infra/new_example_sglang.yaml) +## [Sample Config File SGlang - 1 (`new_example_sglang.yaml`)](https://github.com/oracle-quickstart/oci-ai-blueprints/blob/offline-inference-benchmark/docs/sample_blueprints/offline-inference-infra/offline_sglang_example.yaml) ```yaml benchmark_type: offline From edb14d8426b11c71e0689b06c19cd582109d8b9a Mon Sep 17 00:00:00 2001 From: grantneumanoracle Date: Fri, 18 Jul 2025 17:06:52 -0700 Subject: [PATCH 13/13] Add offline and online inference blueprints and configuration files - Introduced new JSON and YAML files for offline inference benchmarks using SGLang and vLLM backends. - Added README documentation for both offline and online inference blueprints, detailing usage, supported scenarios, and sample configurations. - Removed outdated README files for offline and online inference to streamline documentation. --- .../offline-inference-infra/README.md | 171 ++++++++++ .../offline_deployment_sglang.json | 0 .../offline_deployment_vllm.json | 0 .../offline_sglang_example.yaml | 0 .../offline_vllm_example.yaml | 0 .../online-inference-infra/README.md | 58 ++++ .../example_online.yaml | 18 ++ .../llama3_public_online.yaml | 0 .../online_deployment.json | 0 .../online_inference_job.json | 21 ++ .../offline-inference-infra/README.md | 306 ------------------ .../online-inference-infra/README.md | 104 ------ 12 files changed, 268 insertions(+), 410 deletions(-) create mode 100644 docs/sample_blueprints/model_serving/offline-inference-infra/README.md rename docs/sample_blueprints/{ => model_serving}/offline-inference-infra/offline_deployment_sglang.json (100%) rename docs/sample_blueprints/{ => model_serving}/offline-inference-infra/offline_deployment_vllm.json (100%) rename docs/sample_blueprints/{ => model_serving}/offline-inference-infra/offline_sglang_example.yaml (100%) rename docs/sample_blueprints/{ => model_serving}/offline-inference-infra/offline_vllm_example.yaml (100%) create mode 100644 docs/sample_blueprints/model_serving/online-inference-infra/README.md create mode 100644 docs/sample_blueprints/model_serving/online-inference-infra/example_online.yaml rename docs/sample_blueprints/{ => model_serving}/online-inference-infra/llama3_public_online.yaml (100%) rename docs/sample_blueprints/{ => model_serving}/online-inference-infra/online_deployment.json (100%) create mode 100644 docs/sample_blueprints/model_serving/online-inference-infra/online_inference_job.json delete mode 100644 docs/sample_blueprints/offline-inference-infra/README.md delete mode 100644 docs/sample_blueprints/online-inference-infra/README.md diff --git a/docs/sample_blueprints/model_serving/offline-inference-infra/README.md 
b/docs/sample_blueprints/model_serving/offline-inference-infra/README.md new file mode 100644 index 0000000..4bc98ac --- /dev/null +++ b/docs/sample_blueprints/model_serving/offline-inference-infra/README.md @@ -0,0 +1,171 @@ +# Offline Inference Blueprint - Infra (SGLang + vLLM) + +#### Run offline LLM inference benchmarks using SGLang or vLLM backends with automated performance tracking and MLflow logging. + +This blueprint provides a configurable framework to run **offline LLM inference benchmarks** using either the SGLang or vLLM backends. It is designed for cloud GPU environments and supports automated performance benchmarking with MLflow logging. + +This blueprint enables you to: + +- Run inference locally on GPU nodes using pre-loaded models +- Benchmark token throughput, latency, and request performance +- Push results to MLflow for comparison and analysis + +--- + +## Pre-Filled Samples + +| Feature Showcase | Title | Description | Blueprint File | +| ---------------------------------------------------------------------------------------------------------- | ------------------------------------ | ---------------------------------------------------------------------------- | ---------------------------------------------------------------- | +| Benchmark LLM performance using SGLang backend with offline inference for accurate performance measurement | Offline inference with LLaMA 3 | Benchmarks Meta-Llama-3.1-8B model using SGLang on VM.GPU.A10.2 with 2 GPUs. | [offline_deployment_sglang.json](offline_deployment_sglang.json) | +| Benchmark LLM performance using vLLM backend with offline inference for token throughput analysis | Offline inference with LLAMA 3- vLLM | Benchmarks Meta-Llama-3.1-8B model using vLLM on VM.GPU.A10.2 with 2 GPUs. | [offline_deployment_vllm.json](offline_deployment_vllm.json) | + +You can access these pre-filled samples from the OCI AI Blueprint portal. + +--- + +## When to use Offline inference + +Offline inference is ideal for: + +- Accurate performance benchmarking (no API or network bottlenecks) +- Comparing GPU hardware performance (A10, A100, H100, MI300X) +- Evaluating backend frameworks like vLLM and SGLang + +--- + +## Supported Backends + +| Backend | Description | +| ------- | ------------------------------------------------------------------- | +| sglang | Fast multi-modal LLM backend with optimized throughput | +| vllm | Token streaming inference engine for LLMs with speculative decoding | + +--- + +## Running the Benchmark + +- Things need to run the benchmark + - Model checkpoints pre-downloaded and stored in an object storage. + - Make sure to get a PAR for the object storage where the models are saved. With listing, write and read perimissions + - A Bucket to save the outputs. This does not take a PAR, so should be a bucket in the same tenancy as to where you have your OCI blueprints stack + - Config `.yaml` file that has all the parameters required to run the benhcmark. This includes input_len, output_len, gpu_utilization value etc. + - Deployment `.json` to deploy your blueprint. + - Sample deployment and config files are provided below along with links. + +This blueprint supports benchmark execution via a job-mode recipe using a YAML config file. The recipe mounts a model and config file from Object Storage, runs offline inference, and logs metrics. + +### Notes : Make sure your output object storage is in the same tenancy as your stack. 
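With the model PAR, output bucket, config `.yaml`, and deployment `.json` in place, the blueprint is submitted like any other deployment. The sketch below is an assumption-laden example of posting one of the sample deployment files linked in the next section to the OCI AI Blueprints API portal; the portal URL is a placeholder and the `/deployment` path should be confirmed against the API documentation for your installation.

```bash
# Hedged sketch: submit a sample deployment JSON to the OCI AI Blueprints API portal.
# <blueprints-api-portal-url> is a placeholder, and the /deployment path is an assumption --
# verify both (and the required authentication) in the API documentation for your install.
API_URL="https://<blueprints-api-portal-url>"
curl -X POST "$API_URL/deployment" \
  -H "Content-Type: application/json" \
  -d @offline_deployment_sglang.json
```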
+ +## Sample Blueprints + +[Sample Blueprint (Job Mode for Offline SGLang Inference)](offline_deployment_sglang.json) +[Sample Blueprint (Job Mode for Offline vLLM Inference)](offline_deployment_vllm.json) +[Sample Config File SGlang ](offline_sglang_example.yaml) +[Sample Config File - vLLM ](offline_vllm_example.yaml) + +--- + +## Metrics Logged + +- `requests_per_second` +- `input_tokens_per_second` +- `output_tokens_per_second` +- `total_tokens_per_second` +- `elapsed_time` +- `total_input_tokens` +- `total_output_tokens` + +If a dataset is provided: + +- `accuracy` + +### Top-level Deployment Keys + +| Key | Description | +| ------------------- | ---------------------------------------------------------------------------- | +| `recipe_id` | Identifier of the recipe to run; here, it's an offline SGLang benchmark job. | +| `recipe_mode` | Specifies this is a `job`, meaning it runs to completion and exits. | +| `deployment_name` | Human-readable name for the job. | +| `recipe_image_uri` | Docker image containing the benchmark code and dependencies. | +| `recipe_node_shape` | Shape of the VM or GPU node to run the job (e.g., VM.GPU.A10.2). | + +### Input Object Storage + +| Key | Description | +| ---------------------- | ---------------------------------------------------------------------------- | +| `input_object_storage` | List of inputs to mount from Object Storage. | +| `par` | Pre-Authenticated Request (PAR) link to a bucket/folder. | +| `mount_location` | Files are mounted to this path inside the container. | +| `volume_size_in_gbs` | Size of the mount volume. | +| `include` | Only these files/folders from the bucket are mounted (e.g., model + config). | + +### Output Object Storage + +| Key | Description | +| ----------------------- | ------------------------------------------------------- | +| `output_object_storage` | Where to store outputs like benchmark logs or results. | +| `bucket_name` | Name of the output bucket in OCI Object Storage. | +| `mount_location` | Mount point inside container where outputs are written. | +| `volume_size_in_gbs` | Size of this volume in GBs. | + +### Runtime & Infra Settings + +| Key | Description | +| ---------------------------------------------- | ------------------------------------------------------------- | +| `recipe_container_command_args` | Path to the YAML config that defines benchmark parameters. | +| `recipe_replica_count` | Number of job replicas to run (usually 1 for inference). | +| `recipe_container_port` | Port (optional for offline mode; required if API is exposed). | +| `recipe_nvidia_gpu_count` | Number of GPUs allocated to this job. | +| `recipe_node_pool_size` | Number of nodes in the pool (1 means 1 VM). | +| `recipe_node_boot_volume_size_in_gbs` | Disk size for OS + dependencies. | +| `recipe_ephemeral_storage_size` | Local scratch space in GBs. | +| `recipe_shared_memory_volume_size_limit_in_mb` | Shared memory (used by some inference engines). | + +--- + +## **Sample Config File (`example_sglang.yaml`)** + +This file is consumed by the container during execution to configure the benchmark run. + +### Inference Setup + +| Key | Description | +| ------------------- | ----------------------------------------------------------------- | +| `benchmark_type` | Set to `offline` to indicate local execution with no HTTP server. | +| `offline_backend` | Backend engine to use (`sglang` or `vllm`). | +| `model_path` | Path to the model directory (already mounted via Object Storage). 
| +| `tokenizer_path` | Path to the tokenizer (usually same as model path). | +| `trust_remote_code` | Enables loading models that require custom code (Hugging Face). | +| `conv_template` | Prompt formatting template to use (e.g., `llama-2`). | + +### Benchmark Parameters + +| Key | Description | +| ---------------- | ---------------------------------------------------------------------- | +| `input_len` | Number of tokens in the input prompt. | +| `output_len` | Number of tokens to generate. | +| `num_prompts` | Number of total prompts to run (e.g., 64 prompts x 128 output tokens). | +| `max_seq_len` | Max sequence length supported by the model (e.g., 4096). | +| `max_batch_size` | Max batch size per inference run (depends on GPU memory). | +| `dtype` | Precision (e.g., float16, bfloat16, auto). | + +### Sampling Settings + +| Key | Description | +| ------------- | --------------------------------------------------------------- | +| `temperature` | Controls randomness in generation (lower = more deterministic). | +| `top_p` | Top-p sampling for diversity (0.9 keeps most probable tokens). | + +### MLflow Logging + +| Key | Description | +| ----------------- | -------------------------------------------- | +| `mlflow_uri` | MLflow server to log performance metrics. | +| `experiment_name` | Experiment name to group runs in MLflow UI. | +| `run_name` | Custom name to identify this particular run. | + +### Output + +| Key | Description | +| ------------------- | -------------------------------------------------------------- | +| `save_metrics_path` | Path inside the container where metrics will be saved as JSON. | diff --git a/docs/sample_blueprints/offline-inference-infra/offline_deployment_sglang.json b/docs/sample_blueprints/model_serving/offline-inference-infra/offline_deployment_sglang.json similarity index 100% rename from docs/sample_blueprints/offline-inference-infra/offline_deployment_sglang.json rename to docs/sample_blueprints/model_serving/offline-inference-infra/offline_deployment_sglang.json diff --git a/docs/sample_blueprints/offline-inference-infra/offline_deployment_vllm.json b/docs/sample_blueprints/model_serving/offline-inference-infra/offline_deployment_vllm.json similarity index 100% rename from docs/sample_blueprints/offline-inference-infra/offline_deployment_vllm.json rename to docs/sample_blueprints/model_serving/offline-inference-infra/offline_deployment_vllm.json diff --git a/docs/sample_blueprints/offline-inference-infra/offline_sglang_example.yaml b/docs/sample_blueprints/model_serving/offline-inference-infra/offline_sglang_example.yaml similarity index 100% rename from docs/sample_blueprints/offline-inference-infra/offline_sglang_example.yaml rename to docs/sample_blueprints/model_serving/offline-inference-infra/offline_sglang_example.yaml diff --git a/docs/sample_blueprints/offline-inference-infra/offline_vllm_example.yaml b/docs/sample_blueprints/model_serving/offline-inference-infra/offline_vllm_example.yaml similarity index 100% rename from docs/sample_blueprints/offline-inference-infra/offline_vllm_example.yaml rename to docs/sample_blueprints/model_serving/offline-inference-infra/offline_vllm_example.yaml diff --git a/docs/sample_blueprints/model_serving/online-inference-infra/README.md b/docs/sample_blueprints/model_serving/online-inference-infra/README.md new file mode 100644 index 0000000..84be75d --- /dev/null +++ b/docs/sample_blueprints/model_serving/online-inference-infra/README.md @@ -0,0 +1,58 @@ +# Online Inference Blueprint (LLMPerf) + +#### 
diff --git a/docs/sample_blueprints/offline-inference-infra/offline_deployment_sglang.json b/docs/sample_blueprints/model_serving/offline-inference-infra/offline_deployment_sglang.json
similarity index 100%
rename from docs/sample_blueprints/offline-inference-infra/offline_deployment_sglang.json
rename to docs/sample_blueprints/model_serving/offline-inference-infra/offline_deployment_sglang.json
diff --git a/docs/sample_blueprints/offline-inference-infra/offline_deployment_vllm.json b/docs/sample_blueprints/model_serving/offline-inference-infra/offline_deployment_vllm.json
similarity index 100%
rename from docs/sample_blueprints/offline-inference-infra/offline_deployment_vllm.json
rename to docs/sample_blueprints/model_serving/offline-inference-infra/offline_deployment_vllm.json
diff --git a/docs/sample_blueprints/offline-inference-infra/offline_sglang_example.yaml b/docs/sample_blueprints/model_serving/offline-inference-infra/offline_sglang_example.yaml
similarity index 100%
rename from docs/sample_blueprints/offline-inference-infra/offline_sglang_example.yaml
rename to docs/sample_blueprints/model_serving/offline-inference-infra/offline_sglang_example.yaml
diff --git a/docs/sample_blueprints/offline-inference-infra/offline_vllm_example.yaml b/docs/sample_blueprints/model_serving/offline-inference-infra/offline_vllm_example.yaml
similarity index 100%
rename from docs/sample_blueprints/offline-inference-infra/offline_vllm_example.yaml
rename to docs/sample_blueprints/model_serving/offline-inference-infra/offline_vllm_example.yaml
diff --git a/docs/sample_blueprints/model_serving/online-inference-infra/README.md b/docs/sample_blueprints/model_serving/online-inference-infra/README.md
new file mode 100644
index 0000000..84be75d
--- /dev/null
+++ b/docs/sample_blueprints/model_serving/online-inference-infra/README.md
@@ -0,0 +1,58 @@
+# Online Inference Blueprint (LLMPerf)
+
+#### Benchmark online inference performance of large language models using LLMPerf, a standardized benchmarking tool.
+
+This blueprint benchmarks **online inference performance** of large language models using **LLMPerf**, a standardized benchmarking tool. It is designed to evaluate LLM APIs served via platforms such as OpenAI-compatible interfaces, including self-hosted LLM inference endpoints.
+
+This blueprint helps:
+
+- Simulate real-time request load on a running model server
+- Measure end-to-end latency, throughput, and completion performance
+- Push results to MLflow for visibility and tracking
+
+---
+
+## Pre-Filled Samples
+
+| Feature Showcase                                                                                     | Title                                     | Description                                                                  | Blueprint File                                   |
+| ----------------------------------------------------------------------------------------------------- | ----------------------------------------- | ------------------------------------------------------------------------------ | --------------------------------------------------- |
+| Benchmark live LLM API endpoints using LLMPerf to measure real-time performance and latency metrics | Online inference on LLaMA 3 using LLMPerf | Benchmark of meta/llama3-8b-instruct via a local OpenAI-compatible endpoint | [online_deployment.json](online_deployment.json) |
+
+These can be accessed directly from the OCI AI Blueprint portal.
+
+---
+
+## Prerequisites
+
+Before running this blueprint:
+
+- You **must have an inference server already running**, compatible with the OpenAI API format.
+- Ensure the endpoint and model name match what’s defined in the config.
+
+---
+
+## Supported Scenarios
+
+| Use Case              | Description                                              |
+| --------------------- | -------------------------------------------------------- |
+| Local LLM APIs        | Benchmark your own self-hosted models (e.g., vLLM)       |
+| Remote OpenAI API     | Benchmark OpenAI deployments for throughput analysis     |
+| Multi-model endpoints | Test latency/throughput across different configurations  |
+
+---
+
+## Sample Blueprints
+
+[Sample Blueprint (Job Mode for Online Benchmarking)](online_inference_job.json)
+[Sample Config File](example_online.yaml)
+
+---
+
+## Metrics Logged
+
+- `output_tokens_per_second`
+- `requests_per_minute`
+- `overall_output_throughput`
+- All raw metrics from the `_summary.json` output of LLMPerf
+
+---
diff --git a/docs/sample_blueprints/model_serving/online-inference-infra/example_online.yaml b/docs/sample_blueprints/model_serving/online-inference-infra/example_online.yaml
new file mode 100644
index 0000000..ea06d10
--- /dev/null
+++ b/docs/sample_blueprints/model_serving/online-inference-infra/example_online.yaml
@@ -0,0 +1,18 @@
+benchmark_type: online
+
+model: meta/llama3-8b-instruct
+input_len: 64
+output_len: 32
+max_requests: 5
+timeout: 300
+num_concurrent: 1
+results_dir: /workspace/results_on
+llm_api: openai
+llm_api_key: dummy-key
+llm_api_base: http://localhost:8001/v1
+
+experiment_name: local-bench
+run_name: llama3-test
+mlflow_uri: http://mlflow-benchmarking.corrino-oci.com:5000
+llmperf_path: /opt/llmperf-src
+metadata: test=localhost
\ No newline at end of file
diff --git a/docs/sample_blueprints/online-inference-infra/llama3_public_online.yaml b/docs/sample_blueprints/model_serving/online-inference-infra/llama3_public_online.yaml
similarity index 100%
rename from docs/sample_blueprints/online-inference-infra/llama3_public_online.yaml
rename to docs/sample_blueprints/model_serving/online-inference-infra/llama3_public_online.yaml
diff --git a/docs/sample_blueprints/online-inference-infra/online_deployment.json
b/docs/sample_blueprints/model_serving/online-inference-infra/online_deployment.json similarity index 100% rename from docs/sample_blueprints/online-inference-infra/online_deployment.json rename to docs/sample_blueprints/model_serving/online-inference-infra/online_deployment.json diff --git a/docs/sample_blueprints/model_serving/online-inference-infra/online_inference_job.json b/docs/sample_blueprints/model_serving/online-inference-infra/online_inference_job.json new file mode 100644 index 0000000..8522fb7 --- /dev/null +++ b/docs/sample_blueprints/model_serving/online-inference-infra/online_inference_job.json @@ -0,0 +1,21 @@ +{ + "recipe_id": "online_inference_benchmark", + "recipe_mode": "job", + "deployment_name": "Online Inference Benchmark", + "recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:llm-benchmark-0409-v2", + "recipe_node_shape": "VM.GPU.A10.2", + "input_object_storage": [ + { + "par": "https://objectstorage.ap-melbourne-1.oraclecloud.com/p/Z2q73uuLCAxCbGXJ99CIeTxnCTNipsE-1xHE9HYfCz0RBYPTcCbqi9KHViUEH-Wq/n/iduyx1qnmway/b/mymodels/o/", + "mount_location": "/models", + "volume_size_in_gbs": 100, + "include": ["example_online.yaml"] + } + ], + "recipe_container_command_args": ["/models/example_online.yaml"], + "recipe_replica_count": 1, + "recipe_container_port": "8000", + "recipe_node_pool_size": 1, + "recipe_node_boot_volume_size_in_gbs": 200, + "recipe_ephemeral_storage_size": 100 +} diff --git a/docs/sample_blueprints/offline-inference-infra/README.md b/docs/sample_blueprints/offline-inference-infra/README.md deleted file mode 100644 index a45c426..0000000 --- a/docs/sample_blueprints/offline-inference-infra/README.md +++ /dev/null @@ -1,306 +0,0 @@ -# Offline Inference Blueprint - Infra (SGLang + vLLM) - -This blueprint provides a configurable framework to run **offline LLM inference benchmarks** using either the SGLang or vLLM backends. It is designed for cloud GPU environments and supports automated performance benchmarking with MLflow logging. - -This blueprint enables you to: -- Run inference locally on GPU nodes using pre-loaded models -- Benchmark token throughput, latency, and request performance -- Push results to MLflow for comparison and analysis - ---- - -## Pre-Filled Samples - -| Title | Description | -|------------------------------|-----------------------------------------------------------------------------| -|Offline inference with LLaMA 3|Benchmarks Meta-Llama-3.1-8B model using SGLang on VM.GPU.A10.2 with 2 GPUs. | -|Offline inference with LLAMA 3- vLLM| Benchmarks Meta-Llama-3.1-8B model using vLLM on VM.GPU.A10.2 with 2 GPUs.| - -You can access these pre-filled samples from the OCI AI Blueprint portal. - ---- -## When to use Offline inference - -Offline inference is ideal for: -- Accurate performance benchmarking (no API or network bottlenecks) -- Comparing GPU hardware performance (A10, A100, H100, MI300X) -- Evaluating backend frameworks like vLLM and SGLang - ---- - -## Supported Backends - -| Backend | Description | -|----------|--------------------------------------------------------------| -| sglang | Fast multi-modal LLM backend with optimized throughput | -| vllm | Token streaming inference engine for LLMs with speculative decoding | - ---- - -## Running the Benchmark -- Things need to run the benchmark - - Model checkpoints pre-downloaded and stored in an object storage. - - Make sure to get a PAR for the object storage where the models are saved. With listing, write and read perimissions - - A Bucket to save the outputs. 
This does not take a PAR, so should be a bucket in the same tenancy as to where you have your OCI blueprints stack - - Config `.yaml` file that has all the parameters required to run the benhcmark. This includes input_len, output_len, gpu_utilization value etc. - - Deployment `.json` to deploy your blueprint. - - Sample deployment and config files are provided below along with links. - -This blueprint supports benchmark execution via a job-mode recipe using a YAML config file. The recipe mounts a model and config file from Object Storage, runs offline inference, and logs metrics. - -Notes : Make sure your output object storage is in the same tenancy as your stack. ---- - -### [Sample Blueprint (Job Mode for Offline SGLang Inference)](dhttps://github.com/oracle-quickstart/oci-ai-blueprints/blob/offline-inference-benchmark/docs/sample_blueprints/offline-inference-infra/offline_deployment_sglang.json) - -```json -{ - "recipe_id": "offline_inference_sglang", - "recipe_mode": "job", - "deployment_name": "Offline Inference Benchmark", - "recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:llm-benchmark-0409-v4", - "recipe_node_shape": "VM.GPU.A10.2", - "input_object_storage": [ - { - "par": "https://objectstorage.ap-melbourne-1.oraclecloud.com/p/0T99iRADcM08aVpumM6smqMIcnIJTFtV2D8ZIIWidUP9eL8GSRyDMxOb9Va9rmRc/n/iduyx1qnmway/b/mymodels/o/", - "mount_location": "/models", - "volume_size_in_gbs": 500, - "include": [ - "offline_sglang_example.yaml", - "NousResearch/Meta-Llama-3.1-8B" - ] - } - ], - "output_object_storage": [ - { - "bucket_name": "inference_output", - "mount_location": "/benchmarking_output", - "volume_size_in_gbs": 200 - } - ], - "recipe_container_command_args": [ - "/models/offline_sglang_example.yaml" - ], - "recipe_replica_count": 1, - "recipe_container_port": "8000", - "recipe_nvidia_gpu_count": 2, - "recipe_node_pool_size": 1, - "recipe_node_boot_volume_size_in_gbs": 200, - "recipe_ephemeral_storage_size": 100, - "recipe_shared_memory_volume_size_limit_in_mb": 200 - } - -``` - ---- -### [Sample Blueprint (Job Mode for Offline vLLM Inference)](dhttps://github.com/oracle-quickstart/oci-ai-blueprints/blob/offline-inference-benchmark/docs/sample_blueprints/offline-inference-infra/offline_deployment_vllm.json) - -```json -{ - "recipe_id": "offline_inference_vllm", - "recipe_mode": "job", - "deployment_name": "Offline Inference Benchmark vllm", - "recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:llm-benchmark-0409-v4", - "recipe_node_shape": "VM.GPU.A10.2", - "input_object_storage": [ - { - "par": "https://objectstorage.ap-melbourne-1.oraclecloud.com/p/0T99iRADcM08aVpumM6smqMIcnIJTFtV2D8ZIIWidUP9eL8GSRyDMxOb9Va9rmRc/n/iduyx1qnmway/b/mymodels/o/", - "mount_location": "/models", - "volume_size_in_gbs": 500, - "include": [ - "offline_vllm_example.yaml", - "NousResearch/Meta-Llama-3.1-8B" - ] - } - ], - "output_object_storage": [ - { - "bucket_name": "inference_output", - "mount_location": "/benchmarking_output", - "volume_size_in_gbs": 200 - } - ], - "recipe_container_command_args": [ - "/models/offline_vllm_example.yaml" - ], - "recipe_replica_count": 1, - "recipe_container_port": "8000", - "recipe_nvidia_gpu_count": 2, - "recipe_node_pool_size": 1, - "recipe_node_boot_volume_size_in_gbs": 200, - "recipe_ephemeral_storage_size": 100, - "recipe_shared_memory_volume_size_limit_in_mb": 200 - } - -``` - ---- - -## [Sample Config File SGlang - 1 
(`new_example_sglang.yaml`)](https://github.com/oracle-quickstart/oci-ai-blueprints/blob/offline-inference-benchmark/docs/sample_blueprints/offline-inference-infra/offline_sglang_example.yaml) - -```yaml -benchmark_type: offline -offline_backend: sglang - -model_path: /models/NousResearch/Meta-Llama-3.1-8B -tokenizer_path: /models/NousResearch/Meta-Llama-3.1-8B -trust_remote_code: true -conv_template: llama-2 - -input_len: 128 -output_len: 128 -num_prompts: 64 -max_seq_len: 4096 -max_batch_size: 8 -dtype: auto -temperature: 0.7 -top_p: 0.9 - -mlflow_uri: http://mlflow-benchmarking.corrino-oci.com:5000 -experiment_name: "sglang-bench-doc-test-new" -run_name: "llama3-8b-sglang-test" - - -save_metrics_path: /benchmarking_output/benchmark_output_llama3_sglang.json - -``` -## [Sample Config File - 2 vLLM (`offline_vllm_example.yaml`)](https://github.com/oracle-quickstart/oci-ai-blueprints/blob/offline-inference-benchmark/docs/sample_blueprints/offline-inference-infra/offline_vllm_example.yaml) -```yaml -benchmark_type: offline -model: /models/NousResearch/Meta-Llama-3.1-8B -tokenizer: /models/NousResearch/Meta-Llama-3.1-8B - -input_len: 12 -output_len: 12 -num_prompts: 2 -seed: 42 -tensor_parallel_size: 8 - -# vLLM-specific -#quantization: awq -dtype: half -gpu_memory_utilization: 0.99 -num_scheduler_steps: 10 -device: cuda -enforce_eager: true -kv_cache_dtype: auto -enable_prefix_caching: true -distributed_executor_backend: mp - -# Output -#output_json: ./128_128.json - -# MLflow -mlflow_uri: http://mlflow-benchmarking.corrino-oci.com:5000 -experiment_name: test-bm-suite-doc -run_name: llama3-vllm-test -save_metrics_path: /benchmarking_output/benchmark_output_llama3_vllm.json - -``` - ---- - -## Metrics Logged - -- `requests_per_second` -- `input_tokens_per_second` -- `output_tokens_per_second` -- `total_tokens_per_second` -- `elapsed_time` -- `total_input_tokens` -- `total_output_tokens` - -If a dataset is provided: -- `accuracy` - - -### Top-level Deployment Keys - -| Key | Description | -|-----|-------------| -| `recipe_id` | Identifier of the recipe to run; here, it's an offline SGLang benchmark job. | -| `recipe_mode` | Specifies this is a `job`, meaning it runs to completion and exits. | -| `deployment_name` | Human-readable name for the job. | -| `recipe_image_uri` | Docker image containing the benchmark code and dependencies. | -| `recipe_node_shape` | Shape of the VM or GPU node to run the job (e.g., VM.GPU.A10.2). | - -### Input Object Storage - -| Key | Description | -|-----|-------------| -| `input_object_storage` | List of inputs to mount from Object Storage. | -| `par` | Pre-Authenticated Request (PAR) link to a bucket/folder. | -| `mount_location` | Files are mounted to this path inside the container. | -| `volume_size_in_gbs` | Size of the mount volume. | -| `include` | Only these files/folders from the bucket are mounted (e.g., model + config). | - -### Output Object Storage - -| Key | Description | -|-----|-------------| -| `output_object_storage` | Where to store outputs like benchmark logs or results. | -| `bucket_name` | Name of the output bucket in OCI Object Storage. | -| `mount_location` | Mount point inside container where outputs are written. | -| `volume_size_in_gbs` | Size of this volume in GBs. | - -### Runtime & Infra Settings - -| Key | Description | -|-----|-------------| -| `recipe_container_command_args` | Path to the YAML config that defines benchmark parameters. | -| `recipe_replica_count` | Number of job replicas to run (usually 1 for inference). 
| -| `recipe_container_port` | Port (optional for offline mode; required if API is exposed). | -| `recipe_nvidia_gpu_count` | Number of GPUs allocated to this job. | -| `recipe_node_pool_size` | Number of nodes in the pool (1 means 1 VM). | -| `recipe_node_boot_volume_size_in_gbs` | Disk size for OS + dependencies. | -| `recipe_ephemeral_storage_size` | Local scratch space in GBs. | -| `recipe_shared_memory_volume_size_limit_in_mb` | Shared memory (used by some inference engines). | - ---- - -## **Sample Config File (`example_sglang.yaml`)** - -This file is consumed by the container during execution to configure the benchmark run. - -### Inference Setup - -| Key | Description | -|-----|-------------| -| `benchmark_type` | Set to `offline` to indicate local execution with no HTTP server. | -| `offline_backend` | Backend engine to use (`sglang` or `vllm`). | -| `model_path` | Path to the model directory (already mounted via Object Storage). | -| `tokenizer_path` | Path to the tokenizer (usually same as model path). | -| `trust_remote_code` | Enables loading models that require custom code (Hugging Face). | -| `conv_template` | Prompt formatting template to use (e.g., `llama-2`). | - -### Benchmark Parameters - -| Key | Description | -|-----|-------------| -| `input_len` | Number of tokens in the input prompt. | -| `output_len` | Number of tokens to generate. | -| `num_prompts` | Number of total prompts to run (e.g., 64 prompts x 128 output tokens). | -| `max_seq_len` | Max sequence length supported by the model (e.g., 4096). | -| `max_batch_size` | Max batch size per inference run (depends on GPU memory). | -| `dtype` | Precision (e.g., float16, bfloat16, auto). | - -### Sampling Settings - -| Key | Description | -|-----|-------------| -| `temperature` | Controls randomness in generation (lower = more deterministic). | -| `top_p` | Top-p sampling for diversity (0.9 keeps most probable tokens). | - -### MLflow Logging - -| Key | Description | -|-----|-------------| -| `mlflow_uri` | MLflow server to log performance metrics. | -| `experiment_name` | Experiment name to group runs in MLflow UI. | -| `run_name` | Custom name to identify this particular run. | - -### Output - -| Key | Description | -|-----|-------------| -| `save_metrics_path` | Path inside the container where metrics will be saved as JSON. | diff --git a/docs/sample_blueprints/online-inference-infra/README.md b/docs/sample_blueprints/online-inference-infra/README.md deleted file mode 100644 index 8b1f4bf..0000000 --- a/docs/sample_blueprints/online-inference-infra/README.md +++ /dev/null @@ -1,104 +0,0 @@ -# Online Inference Blueprint (LLMPerf) - -This blueprint benchmarks **online inference performance** of large language models using **LLMPerf**, a standardized benchmarking tool. It is designed to evaluate LLM APIs served via platforms such as OpenAI-compatible interfaces, including self-hosted LLM inference endpoints. - -This blueprint helps: -- Simulate real-time request load on a running model server -- Measure end-to-end latency, throughput, and completion performance -- Push results to MLflow for visibility and tracking - ---- - -## Pre-Filled Samples - -| Title | Description | -|----------------------------------------|-----------------------------------------------------------------------------| -|Online inference on LLaMA 3 using LLMPerf|Benchmark of meta/llama3-8b-instruct via a local OpenAI-compatible endpoint | - -These can be accessed directly from the OCI AI Blueprint portal. 
- ---- - -## Prerequisites - -Before running this blueprint: -- You **must have an inference server already running**, compatible with the OpenAI API format. -- Ensure the endpoint and model name match what’s defined in the config. - ---- - -## Supported Scenarios - -| Use Case | Description | -|-----------------------|-------------------------------------------------------| -| Local LLM APIs | Benchmark your own self-hosted models (e.g., vLLM) | -| Remote OpenAI API | Benchmark OpenAI deployments for throughput analysis | -| Multi-model endpoints | Test latency/throughput across different configurations | - ---- - -### Sample Recipe (Job Mode for Online Benchmarking) - -```json -{ - "recipe_id": "online_inference_benchmark", - "recipe_mode": "job", - "deployment_name": "Online Inference Benchmark", - "recipe_image_uri": "iad.ocir.io/iduyx1qnmway/corrino-devops-repository:llm-benchmark-0409-v2", - "recipe_node_shape": "VM.GPU.A10.2", - "input_object_storage": [ - { - "par": "https://objectstorage.ap-melbourne-1.oraclecloud.com/p/Z2q73uuLCAxCbGXJ99CIeTxnCTNipsE-1xHE9HYfCz0RBYPTcCbqi9KHViUEH-Wq/n/iduyx1qnmway/b/mymodels/o/", - "mount_location": "/models", - "volume_size_in_gbs": 100, - "include": [ - "example_online.yaml" - ] - } - ], - "recipe_container_command_args": [ - "/models/example_online.yaml" - ], - "recipe_replica_count": 1, - "recipe_container_port": "8000", - "recipe_node_pool_size": 1, - "recipe_node_boot_volume_size_in_gbs": 200, - "recipe_ephemeral_storage_size": 100 -} -``` - ---- - -## Sample Config File (`example_online.yaml`) - -```yaml -benchmark_type: online - -model: meta/llama3-8b-instruct -input_len: 64 -output_len: 32 -max_requests: 5 -timeout: 300 -num_concurrent: 1 -results_dir: /workspace/results_on -llm_api: openai -llm_api_key: dummy-key -llm_api_base: http://localhost:8001/v1 - -experiment_name: local-bench -run_name: llama3-test -mlflow_uri: http://mlflow-benchmarking.corrino-oci.com:5000 -llmperf_path: /opt/llmperf-src -metadata: test=localhost -``` - ---- - -## Metrics Logged - -- `output_tokens_per_second` -- `requests_per_minute` -- `overall_output_throughput` -- All raw metrics from the `_summary.json` output of LLMPerf - ----