|
| 1 | +{ |
| 2 | + "cells": [ |
| 3 | + { |
| 4 | + "cell_type": "markdown", |
| 5 | + "metadata": {}, |
| 6 | + "source": [ |
| 7 | + "# 5G Network Operations Insights with Fine Tuning of GPT2 (This is the smallest version of GPT-2, with 124M parameters.)\n", |
| 8 | + "## Project Overview\n", |
| 9 | + "Author: Fatih E. NAR\n", |
| 10 | + "This project aims to deliver a 5g network insight with fine tuning a network performant LLM\n", |
| 11 | + "Model card: https://huggingface.co/openai-community/gpt2" |
| 12 | + ] |
| 13 | + }, |
| 14 | + { |
| 15 | + "cell_type": "code", |
| 16 | + "execution_count": null, |
| 17 | + "metadata": {}, |
| 18 | + "outputs": [], |
| 19 | + "source": [ |
| 20 | + "%pip install -r requirements.txt\n", |
| 21 | + "%pip install --pre torch torchvision torchaudio --extra-index-url https://download.pytorch.org/whl/nightly/cpu" |
| 22 | + ] |
| 23 | + }, |
| 24 | + { |
| 25 | + "cell_type": "code", |
| 26 | + "execution_count": null, |
| 27 | + "metadata": {}, |
| 28 | + "outputs": [], |
| 29 | + "source": [ |
| 30 | + "import lzma\n", |
| 31 | + "import shutil\n", |
| 32 | + "import pandas as pd\n", |
| 33 | + "import os\n", |
| 34 | + "import torch\n", |
| 35 | + "import psutil\n", |
| 36 | + "import threading\n", |
| 37 | + "import sys\n", |
| 38 | + "import time\n", |
| 39 | + "import gc\n", |
| 40 | + "from datasets import Dataset\n", |
| 41 | + "from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments, get_linear_schedule_with_warmup\n", |
| 42 | + "\n", |
| 43 | + "from peft import get_peft_model, LoraConfig, TaskType\n", |
| 44 | + "\n", |
| 45 | + "# Save the model and tokenizer\n", |
| 46 | + "model_save_path = \"models/5g_oss_model\"\n", |
| 47 | + "#model_name = \"distilgpt2\"\n", |
| 48 | + "model_name = \"gpt2\"\n", |
| 49 | + "\n", |
| 50 | + "# Set TOKENIZERS_PARALLELISM to false to avoid warnings\n", |
| 51 | + "os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\"\n", |
| 52 | + "\n", |
| 53 | + "# Clear GPU cache before starting\n", |
| 54 | + "torch.cuda.empty_cache()\n", |
| 55 | + "gc.collect()\n", |
| 56 | + "\n", |
| 57 | + "# Cap memory usage to a specific size (e.g., 8 GB) for cuda\n", |
| 58 | + "max_memory_gb = 8\n", |
| 59 | + "max_memory_mb = max_memory_gb * 1024\n", |
| 60 | + "os.environ['PYTORCH_CUDA_ALLOC_CONF'] = f'max_split_size_mb:{max_memory_mb}'\n", |
| 61 | + "\n", |
| 62 | + "# Check if any accelerator is available \n", |
| 63 | + "if torch.cuda.is_available():\n", |
| 64 | + " print(\"Using CUDA (NVIDIA GPU)\")\n", |
| 65 | + " os.environ[\"CUDA_LAUNCH_BLOCKING\"] = \"1\"\n", |
| 66 | + " # Leverage multi-gpu if available\n", |
| 67 | + " device1 = torch.device(\"cuda:0\")\n", |
| 68 | + " device2 = torch.device(\"cuda:1\") if torch.cuda.device_count() > 1 else torch.device(\"cuda:0\")\n", |
| 69 | + " print(\"Using CUDA\")\n", |
| 70 | + " # Clear GPU cache before starting\n", |
| 71 | + " torch.cuda.empty_cache()\n", |
| 72 | + "# Check if MPS (Apple Silicon GPU) is available\n", |
| 73 | + "elif torch.backends.mps.is_available():\n", |
| 74 | + " os.environ[\"PYTORCH_MPS_HIGH_WATERMARK_RATIO\"] = \"0.0\"\n", |
| 75 | + " os.environ[\"PYTORCH_ENABLE_MPS_FALLBACK\"] = \"1\"\n", |
| 76 | + " # Leverage multi-gpu if available\n", |
| 77 | + " device1 = torch.device(\"mps:0\")\n", |
| 78 | + " device2 = torch.device(\"mps:1\") \n", |
| 79 | + " print(\"Using MPS\")\n", |
| 80 | + "else:\n", |
| 81 | + " device1 = torch.device(\"cpu\")\n", |
| 82 | + " device2 = torch.device(\"cpu\")\n", |
| 83 | + " print(\"Using CPU\")\n", |
| 84 | + "\n", |
| 85 | + "# Extract the .xz file\n", |
| 86 | + "with lzma.open('data/5G_netops_data.csv.xz', 'rb') as f_in:\n", |
| 87 | + " with open('data/5G_netops_data.csv', 'wb') as f_out:\n", |
| 88 | + " shutil.copyfileobj(f_in, f_out)\n", |
| 89 | + "\n", |
| 90 | + "# Load the synthetic telecom data\n", |
| 91 | + "data_path = \"data/5G_netops_data.csv\"\n", |
| 92 | + "data = pd.read_csv(data_path)\n", |
| 93 | + "\n", |
| 94 | + "# Display basic information about the full dataset\n", |
| 95 | + "data.info()\n", |
| 96 | + "data.head()" |
| 97 | + ] |
| 98 | + }, |
| 99 | + { |
| 100 | + "cell_type": "code", |
| 101 | + "execution_count": null, |
| 102 | + "metadata": {}, |
| 103 | + "outputs": [], |
| 104 | + "source": [ |
| 105 | + "# Fill NaN values and prepare input and target texts\n", |
| 106 | + "# Ensure all NaN values are filled with empty strings\n", |
| 107 | + "data = data.fillna('')\n", |
| 108 | + "\n", |
| 109 | + "# Ensure 'Zip' column is treated as a string\n", |
| 110 | + "data['Zip'] = data['Zip'].astype(str)\n", |
| 111 | + "\n", |
| 112 | + "# Create the input_text column\n", |
| 113 | + "data['input_text'] = data.apply(lambda row: f\"Date: {row['Date']} Cell Availability: {row['Cell Availability (%)']} MTTR: {row['MTTR (hours)']} Throughput: {row['Throughput (Mbps)']} Latency: {row['Latency (ms)']} Packet Loss Rate: {row['Packet Loss Rate (%)']} Call Drop Rate: {row['Call Drop Rate (%)']} Handover Success Rate: {row['Handover Success Rate (%)']} Alarm Count: {row['Alarm Count']} Critical Alarm Count: {row['Critical Alarm Count']} Parameter Changes: {row['Parameter Changes']} Successful Configuration Changes: {row['Successful Configuration Changes (%)']} Data Usage: {row['Data Usage (GB)']} User Count: {row['User Count']} Signal Strength: {row['Signal Strength (dBm)']} Jitter: {row['Jitter (ms)']} Connection Setup Success Rate: {row['Connection Setup Success Rate (%)']} Security Incidents: {row['Security Incidents']} Authentication Failures: {row['Authentication Failures']} Temperature: {row['Temperature (°C)']} Humidity: {row['Humidity (%)']} Weather: {row['Weather']} Issue Reported: {row['Issue Reported']} City: {row['City']} State: {row['State']} Zip: {row['Zip']}\", axis=1)\n", |
| 114 | + "\n", |
| 115 | + "# Create the target_text column\n", |
| 116 | + "data['target_text'] = data['Fault Occurrence Rate'].astype(str)\n", |
| 117 | + "\n", |
| 118 | + "# Convert to HuggingFace Dataset\n", |
| 119 | + "dataset = Dataset.from_pandas(data)\n", |
| 120 | + "\n", |
| 121 | + "# Split the dataset into training and evaluation\n", |
| 122 | + "train_test_split = dataset.train_test_split(test_size=0.1)\n", |
| 123 | + "train_dataset = train_test_split['train']\n", |
| 124 | + "eval_dataset = train_test_split['test']\n", |
| 125 | + "\n", |
| 126 | + "# Check the loaded dataset\n", |
| 127 | + "print(f\"Training Dataset size: {len(train_dataset)}\")\n", |
| 128 | + "print(f\"Evaluation Dataset size: {len(eval_dataset)}\")\n", |
| 129 | + "print(train_dataset[0])" |
| 130 | + ] |
| 131 | + }, |
| 132 | + { |
| 133 | + "cell_type": "code", |
| 134 | + "execution_count": null, |
| 135 | + "metadata": {}, |
| 136 | + "outputs": [], |
| 137 | + "source": [ |
| 138 | + "# Load the tokenizer from the pretrained model\n", |
| 139 | + "tokenizer = GPT2Tokenizer.from_pretrained(model_name)\n", |
| 140 | + "\n", |
| 141 | + "# Add the pad token if it doesn't exist\n", |
| 142 | + "if tokenizer.pad_token is None:\n", |
| 143 | + " tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})\n", |
| 144 | + "else:\n", |
| 145 | + " tokenizer.pad_token = tokenizer.eos_token\n", |
| 146 | + "\n", |
| 147 | + "# Save the tokenizer\n", |
| 148 | + "tokenizer.save_pretrained(model_save_path)\n", |
| 149 | + "\n", |
| 150 | + "model = GPT2LMHeadModel.from_pretrained(model_name)\n", |
| 151 | + "model.resize_token_embeddings(len(tokenizer))\n", |
| 152 | + "# Save the new model\n", |
| 153 | + "model.save_pretrained(model_save_path)\n", |
| 154 | + "\n", |
| 155 | + "# Define preprocessing function\n", |
| 156 | + "def preprocess_function(examples):\n", |
| 157 | + " inputs = examples['input_text']\n", |
| 158 | + " targets = examples['target_text']\n", |
| 159 | + " model_inputs = tokenizer(inputs, max_length=512, truncation=True, padding='max_length')\n", |
| 160 | + " with tokenizer.as_target_tokenizer():\n", |
| 161 | + " labels = tokenizer(targets, max_length=512, truncation=True, padding='max_length')\n", |
| 162 | + " model_inputs['labels'] = labels['input_ids']\n", |
| 163 | + " return model_inputs\n", |
| 164 | + "\n", |
| 165 | + "# Apply preprocessing\n", |
| 166 | + "train_dataset = train_dataset.map(preprocess_function, batched=True)\n", |
| 167 | + "eval_dataset = eval_dataset.map(preprocess_function, batched=True)\n", |
| 168 | + "\n", |
| 169 | + "columns = ['input_ids', 'attention_mask', 'labels']\n", |
| 170 | + "train_dataset.set_format(type='torch', columns=columns)\n", |
| 171 | + "eval_dataset.set_format(type='torch', columns=columns)\n", |
| 172 | + "\n", |
| 173 | + "# Check the tokenized dataset\n", |
| 174 | + "print(f\"Tokenized Training Dataset size: {len(train_dataset)}\")\n", |
| 175 | + "print(f\"Tokenized Evaluation Dataset size: {len(eval_dataset)}\")\n", |
| 176 | + "print(train_dataset[0])" |
| 177 | + ] |
| 178 | + }, |
| 179 | + { |
| 180 | + "cell_type": "code", |
| 181 | + "execution_count": null, |
| 182 | + "metadata": {}, |
| 183 | + "outputs": [], |
| 184 | + "source": [ |
| 185 | + "# PEFT Part\n", |
| 186 | + "lora_config = LoraConfig(\n", |
| 187 | + " task_type=TaskType.CAUSAL_LM,\n", |
| 188 | + " inference_mode=False,\n", |
| 189 | + " r=2,\n", |
| 190 | + " lora_alpha=16,\n", |
| 191 | + " lora_dropout=0.05\n", |
| 192 | + ")\n", |
| 193 | + "\n", |
| 194 | + "model = get_peft_model(model, lora_config)\n", |
| 195 | + "\n", |
| 196 | + "# Manually split the model across the two GPUs\n", |
| 197 | + "model.transformer.h[:6].to(device1) # First half of layers to GPU 1\n", |
| 198 | + "model.transformer.h[6:].to(device2) # Second half of layers to GPU 2\n", |
| 199 | + "model.transformer.ln_f.to(device2) # Final normalization layer to GPU 2\n", |
| 200 | + "model.lm_head.to(device2) # Language modeling head to GPU 2" |
| 201 | + ] |
| 202 | + }, |
| 203 | + { |
| 204 | + "cell_type": "code", |
| 205 | + "execution_count": null, |
| 206 | + "metadata": {}, |
| 207 | + "outputs": [], |
| 208 | + "source": [ |
| 209 | + "# Set training arguments\n", |
| 210 | + "training_args = TrainingArguments(\n", |
| 211 | + " output_dir=\"./results\", # Output directory\n", |
| 212 | + " overwrite_output_dir=True, # Overwrite the content of the output directory\n", |
| 213 | + " num_train_epochs=10, # Number of training epochs\n", |
| 214 | + " per_device_train_batch_size=36, # Batch size per device during training\n", |
| 215 | + " gradient_accumulation_steps=12, # Accumulate gradients over multiple steps\n", |
| 216 | + " learning_rate=5e-5, # Learning rate\n", |
| 217 | + " save_steps=2000, # Save checkpoint every 2000 steps\n", |
| 218 | + " save_total_limit=2, # Limit the total amount of checkpoints\n", |
| 219 | + " evaluation_strategy=\"steps\", # Evaluate during training at each `logging_steps`\n", |
| 220 | + " logging_steps=500, # Log every 500 steps\n", |
| 221 | + " eval_steps=2000, # Evaluate every 2000 steps\n", |
| 222 | + " load_best_model_at_end=True, # Load the best model at the end of training\n", |
| 223 | + " metric_for_best_model=\"loss\", # Use loss to evaluate the best model\n", |
| 224 | + " fp16=False, # Disable mixed precision training for MPS\n", |
| 225 | + ")\n", |
| 226 | + "\n", |
| 227 | + "# Create the learning rate scheduler\n", |
| 228 | + "total_steps = len(train_dataset) // training_args.per_device_train_batch_size * training_args.num_train_epochs\n", |
| 229 | + "optimizer = torch.optim.AdamW(model.parameters(), lr=training_args.learning_rate)\n", |
| 230 | + "lr_scheduler = get_linear_schedule_with_warmup(\n", |
| 231 | + " optimizer,\n", |
| 232 | + " num_warmup_steps=total_steps // 10, # Warm-up for 10% of the total steps\n", |
| 233 | + " num_training_steps=total_steps\n", |
| 234 | + ")\n", |
| 235 | + "\n", |
| 236 | + "# Create Trainer instance\n", |
| 237 | + "trainer = Trainer(\n", |
| 238 | + " model=model,\n", |
| 239 | + " args=training_args,\n", |
| 240 | + " train_dataset=train_dataset,\n", |
| 241 | + " eval_dataset=eval_dataset,\n", |
| 242 | + " tokenizer=tokenizer,\n", |
| 243 | + " optimizers=(optimizer, lr_scheduler)\n", |
| 244 | + ")\n", |
| 245 | + "\n", |
| 246 | + "# Function to monitor system usage including GPU metrics\n", |
| 247 | + "def print_system_usage(stop_event):\n", |
| 248 | + " while not stop_event.is_set():\n", |
| 249 | + " cpu_usage = psutil.cpu_percent()\n", |
| 250 | + " memory_usage = psutil.virtual_memory().percent\n", |
| 251 | + " if torch.cuda.is_available():\n", |
| 252 | + " gpu_alloc_mem = torch.cuda.memory_allocated() / (1024 ** 2)\n", |
| 253 | + " gpu_cached = torch.cuda.memory_reserved() / (1024 ** 2)\n", |
| 254 | + " sys.stdout.write(f\"\\rCPU Usage: {cpu_usage}% | Memory Usage: {memory_usage}% | GPU-Allocated-Memory Usage: {gpu_alloc_mem:.2f}MB | GPU-Cached-Memory Usage: {gpu_cached:.2f}MB\")\n", |
| 255 | + " elif torch.backends.mps.is_available():\n", |
| 256 | + " gpu_alloc_mem = torch.mps.current_allocated_memory() / (1024 ** 2)\n", |
| 257 | + " gpu_driver_mem = torch.mps.driver_allocated_memory() / (1024 ** 2)\n", |
| 258 | + " sys.stdout.write(f\"\\rCPU Usage: {cpu_usage}% | Memory Usage: {memory_usage}% | GPU-Allocated-Memory Usage: {gpu_alloc_mem:.2f}MB | GPU-Driver-Memory Usage: {gpu_driver_mem:.2f}MB\")\n", |
| 259 | + " sys.stdout.flush()\n", |
| 260 | + " time.sleep(300) # Wait to remeasure system usage\n", |
| 261 | + "\n", |
| 262 | + "# Create an event to stop the thread\n", |
| 263 | + "stop_event = threading.Event()\n", |
| 264 | + "\n", |
| 265 | + "# Start the system usage monitoring thread\n", |
| 266 | + "monitoring_thread = threading.Thread(target=print_system_usage, args=(stop_event,))\n", |
| 267 | + "monitoring_thread.start()\n", |
| 268 | + "\n", |
| 269 | + "# Train the model\n", |
| 270 | + "try:\n", |
| 271 | + " trainer.train()\n", |
| 272 | + "except RuntimeError as e:\n", |
| 273 | + " if 'out of memory' in str(e):\n", |
| 274 | + " print(\"CUDA OutOfMemoryError: Out of memory during training. Try reducing the batch size or model size.\")\n", |
| 275 | + " else:\n", |
| 276 | + " raise\n", |
| 277 | + "finally:\n", |
| 278 | + " # Stop the monitoring thread\n", |
| 279 | + " stop_event.set()\n", |
| 280 | + " monitoring_thread.join()\n", |
| 281 | + " if torch.cuda.device_count() > 1:\n", |
| 282 | + " model.module.save_pretrained(model_save_path)\n", |
| 283 | + " else:\n", |
| 284 | + " model.save_pretrained(model_save_path)\n", |
| 285 | + " tokenizer.save_pretrained(model_save_path)\n", |
| 286 | + "\n", |
| 287 | + "print(\"Training complete and model saved.\")" |
| 288 | + ] |
| 289 | + }, |
| 290 | + { |
| 291 | + "cell_type": "code", |
| 292 | + "execution_count": null, |
| 293 | + "metadata": {}, |
| 294 | + "outputs": [], |
| 295 | + "source": [ |
| 296 | + "# Results\n", |
| 297 | + "results = trainer.evaluate(eval_dataset)\n", |
| 298 | + "print(\"Evaluation Results:\", results)" |
| 299 | + ] |
| 300 | + } |
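| | + , |
| | + { |
| | + "cell_type": "markdown", |
| | + "metadata": {}, |
| | + "source": [ |
| | + "### Sanity-Check Inference\n", |
| | + "A minimal sketch of reloading the saved LoRA adapter and generating a fault-occurrence prediction for one row. The `Fault Occurrence Rate:` cue mirrors the training format above; the generation settings are illustrative, not tuned." |
| | + ] |
| | + }, |
| | + { |
| | + "cell_type": "code", |
| | + "execution_count": null, |
| | + "metadata": {}, |
| | + "outputs": [], |
| | + "source": [ |
| | + "from peft import PeftModel\n", |
| | + "\n", |
| | + "# Reload the base model and attach the saved LoRA adapter\n", |
| | + "base = GPT2LMHeadModel.from_pretrained(model_name)\n", |
| | + "base.resize_token_embeddings(len(tokenizer))\n", |
| | + "ft_model = PeftModel.from_pretrained(base, model_save_path).to(device1)\n", |
| | + "ft_model.eval()\n", |
| | + "\n", |
| | + "# Prompt with one row's KPIs plus the cue used during training\n", |
| | + "prompt = data['input_text'].iloc[0] + \" Fault Occurrence Rate:\"\n", |
| | + "inputs = tokenizer(prompt, return_tensors=\"pt\", truncation=True, max_length=512).to(device1)\n", |
| | + "with torch.no_grad():\n", |
| | + "    out = ft_model.generate(**inputs, max_new_tokens=8, pad_token_id=tokenizer.eos_token_id)\n", |
| | + "# Decode only the newly generated tokens\n", |
| | + "print(tokenizer.decode(out[0][inputs['input_ids'].shape[1]:], skip_special_tokens=True))" |
| | + ] |
| | + } |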
| 301 | + ], |
| 302 | + "metadata": { |
| 303 | + "kernelspec": { |
| 304 | + "display_name": "Python 3", |
| 305 | + "language": "python", |
| 306 | + "name": "python3" |
| 307 | + }, |
| 308 | + "language_info": { |
| 309 | + "codemirror_mode": { |
| 310 | + "name": "ipython", |
| 311 | + "version": 3 |
| 312 | + }, |
| 313 | + "file_extension": ".py", |
| 314 | + "mimetype": "text/x-python", |
| 315 | + "name": "python", |
| 316 | + "nbconvert_exporter": "python", |
| 317 | + "pygments_lexer": "ipython3", |
| 318 | + "version": "3.9.6" |
| 319 | + } |
| 320 | + }, |
| 321 | + "nbformat": 4, |
| 322 | + "nbformat_minor": 2 |
| 323 | +} |