Skip to content

Commit 473ec22

Browse files
author
Hanbin Hu
committed
Add support for examples under one GPU without NCCL
1 parent 115b909 commit 473ec22

File tree

5 files changed

+14
-9
lines changed

5 files changed

+14
-9
lines changed

examples/pytorch_benchmark.py

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -69,7 +69,8 @@
6969
bf.init()
7070

7171
if args.cuda:
72-
torch.cuda.set_device(bf.local_rank())
72+
device_id = bf.local_rank() if bf.nccl_built() else bf.local_rank() % torch.cuda.device_count()
73+
torch.cuda.set_device(device_id)
7374
cudnn.benchmark = True
7475

7576
# Set up standard model.

examples/pytorch_mnist.py

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -79,7 +79,8 @@
7979

8080
if args.cuda:
8181
# Bluefog: pin GPU to local rank.
82-
torch.cuda.set_device(bf.local_rank())
82+
device_id = bf.local_rank() if bf.nccl_built() else bf.local_rank() % torch.cuda.device_count()
83+
torch.cuda.set_device(device_id)
8384
torch.cuda.manual_seed(args.seed)
8485

8586

examples/pytorch_resnet.py

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -103,7 +103,8 @@
103103
if args.cuda:
104104
print("using cuda.")
105105
# Bluefog: pin GPU to local rank.
106-
torch.cuda.set_device(bf.local_rank())
106+
device_id = bf.local_rank() if bf.nccl_built() else bf.local_rank() % torch.cuda.device_count()
107+
torch.cuda.set_device(device_id)
107108
torch.cuda.manual_seed(args.seed)
108109
else:
109110
print("using cpu")

test/torch_ops_test.py

Lines changed: 4 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -61,10 +61,11 @@ def convert_cpu_fp16_to_fp32(self, *values):
6161

6262
def cast_and_place(self, tensor, dtype):
6363
if dtype.is_cuda:
64-
if bf.local_size() > torch.cuda.device_count():
64+
if bf.nccl_built() and bf.local_size() > torch.cuda.device_count():
6565
raise EnvironmentError(
66-
"Cannot run number of processes in one machine are more than device count")
67-
return tensor.cuda(bf.local_rank()).type(dtype)
66+
"Cannot run number of processes in one machine more than GPU device count"
67+
" in NCCL environment")
68+
return tensor.cuda(bf.local_rank() % torch.cuda.device_count()).type(dtype)
6869
return tensor.type(dtype)
6970

7071
def test_broadcast(self):

test/torch_win_ops_test.py

Lines changed: 4 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -54,10 +54,11 @@ def tearDown(self):
5454
@staticmethod
5555
def cast_and_place(tensor, dtype):
5656
if dtype.is_cuda:
57-
if bf.local_size() > torch.cuda.device_count():
57+
if bf.nccl_built() and bf.local_size() > torch.cuda.device_count():
5858
raise EnvironmentError(
59-
"Cannot run number of processes in one machine are more than device count")
60-
return tensor.cuda(bf.local_rank()).type(dtype)
59+
"Cannot run number of processes in one machine more than GPU device count"
60+
" in NCCL environment")
61+
return tensor.cuda(bf.local_rank() % torch.cuda.device_count()).type(dtype)
6162
return tensor.type(dtype)
6263

6364
def test_win_create_and_sync_and_free(self):

0 commit comments

Comments (0)