From f3a5c3b39e57d979de3b6e4bd0ce640a5ef44372 Mon Sep 17 00:00:00 2001 From: JiabinYang Date: Tue, 25 Jun 2019 07:56:59 +0000 Subject: [PATCH 1/2] test=release/1.5, add mutigpu install check --- python/paddle/fluid/install_check.py | 155 ++++++++++++++++++++++----- 1 file changed, 131 insertions(+), 24 deletions(-) diff --git a/python/paddle/fluid/install_check.py b/python/paddle/fluid/install_check.py index ce21d575348bca..aca877caaee173 100644 --- a/python/paddle/fluid/install_check.py +++ b/python/paddle/fluid/install_check.py @@ -12,20 +12,48 @@ # See the License for the specific language governing permissions and # limitations under the License. -from .framework import Program, program_guard, unique_name, default_startup_program +import os +from .framework import Program, program_guard, unique_name from .param_attr import ParamAttr from .initializer import Constant from . import layers from . import backward from .dygraph import Layer, nn from . import executor - +from . import optimizer from . import core +from . import compiler +import logging import numpy as np __all__ = ['run_check'] +def process_env(): + env = os.environ + device_list = [] + if env.get('CUDA_VISIBLE_DEVICES') is not None: + cuda_devices = env['CUDA_VISIBLE_DEVICES'] + if core.get_cuda_device_count() == 0: + logging.warning( + "No CUDA Device Found! But your are using GPU version Paddle Fluid" + ) + device_list = [] + elif len(cuda_devices) == 1: + device_list.append(0) + elif len(cuda_devices) > 1: + for i in range(len(cuda_devices.split(","))): + device_list.append(i) + return device_list + else: + if core.get_cuda_device_count() > 1: + for i in range(core.get_cuda_device_count()): + device_list.append(i) + return device_list + else: + return [0] + + class SimpleLayer(Layer): def __init__(self, name_scope): super(SimpleLayer, self).__init__(name_scope) @@ -45,25 +73,104 @@ def run_check(): This func should not be called only if you need to verify installation ''' print("Running Verify Fluid Program ... ") - prog = Program() - startup_prog = Program() - scope = core.Scope() - with executor.scope_guard(scope): - with program_guard(prog, startup_prog): - with unique_name.guard(): - np_inp = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32) - inp = layers.data( - name="inp", shape=[2, 2], append_batch_size=False) - simple_layer = SimpleLayer("simple_layer") - out = simple_layer(inp) - param_grads = backward.append_backward( - out, parameter_list=[simple_layer._fc1._w.name])[0] - exe = executor.Executor(core.CPUPlace( - ) if not core.is_compiled_with_cuda() else core.CUDAPlace(0)) - exe.run(default_startup_program()) - exe.run(feed={inp.name: np_inp}, - fetch_list=[out.name, param_grads[1].name]) - - print( - "Your Paddle Fluid is installed successfully! Let's start deep Learning with Paddle Fluid now!" - ) + + device_list = [] + if core.is_compiled_with_cuda(): + try: + core.get_cuda_device_count() + except Exception as e: + logging.warning( + "You are using GPU version Paddle Fluid, But Your CUDA Device is not set properly" + "\n Original Error is {}".format(e)) + return 0 + device_list = process_env() + else: + device_list = [0, 1] # for CPU 0,1 + + use_cuda = False if not core.is_compiled_with_cuda() else True + np_inp_single = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32) + inp = [] + for i in range(len(device_list)): + inp.append(np_inp_single) + np_inp_muti = np.array(inp) + np_inp_muti = np_inp_muti.reshape(len(device_list), 2, 2) + + def test_parallerl_exe(): + train_prog = Program() + startup_prog = Program() + scope = core.Scope() + with executor.scope_guard(scope): + with program_guard(train_prog, startup_prog): + with unique_name.guard(): + places = [] + build_strategy = compiler.BuildStrategy() + build_strategy.enable_inplace = True + build_strategy.memory_optimize = True + inp = layers.data(name="inp", shape=[2, 2]) + simple_layer = SimpleLayer("simple_layer") + out = simple_layer(inp) + exe = executor.Executor( + core.CUDAPlace(0) if core.is_compiled_with_cuda() and + (core.get_cuda_device_count() > 0) else core.CPUPlace()) + if use_cuda: + for i in device_list: + places.append(core.CUDAPlace(i)) + else: + places = [core.CPUPlace(), core.CPUPlace()] + loss = layers.mean(out) + loss.persistable = True + optimizer.SGD(learning_rate=0.01).minimize(loss) + startup_prog.random_seed = 1 + compiled_prog = compiler.CompiledProgram( + train_prog).with_data_parallel( + build_strategy=build_strategy, + loss_name=loss.name, + places=places) + exe.run(startup_prog) + + exe.run(compiled_prog, + feed={inp.name: np_inp_muti}, + fetch_list=[loss.name]) + + def test_simple_exe(): + train_prog = Program() + startup_prog = Program() + scope = core.Scope() + with executor.scope_guard(scope): + with program_guard(train_prog, startup_prog): + with unique_name.guard(): + inp0 = layers.data( + name="inp", shape=[2, 2], append_batch_size=False) + simple_layer0 = SimpleLayer("simple_layer") + out0 = simple_layer0(inp0) + param_grads = backward.append_backward( + out0, parameter_list=[simple_layer0._fc1._w.name])[0] + exe0 = executor.Executor( + core.CUDAPlace(0) if core.is_compiled_with_cuda() and + (core.get_cuda_device_count() > 0) else core.CPUPlace()) + exe0.run(startup_prog) + exe0.run(feed={inp0.name: np_inp_single}, + fetch_list=[out0.name, param_grads[1].name]) + + test_simple_exe() + + print("Your Paddle Fluid works well on SINGLE GPU or CPU.") + try: + test_parallerl_exe() + print("Your Paddle Fluid works well on MUTIPLE GPU or CPU.") + print( + "Your Paddle Fluid is installed successfully! Let's start deep Learning with Paddle Fluid now" + ) + except Exception as e: + logging.warning( + "Your Paddle Fluid has some problem with multiple GPU. This may be caused by:" + "\n 1. There is only 1 GPU visible on your Device;" + "\n 2. No.1 or No.2 GPU or both of them are occupied now" + "\n 3. Wrong installation of NVIDIA-NCCL2, please follow instruction on https://github.com/NVIDIA/nccl-tests " + "\n to test your NCCL, or reinstall it following https://docs.nvidia.com/deeplearning/sdk/nccl-install-guide/index.html" + ) + + print("\n Original Error is: {}".format(e)) + print( + "Your Paddle Fluid is installed successfully ONLY for SINGLE GPU or CPU! " + "\n Let's start deep Learning with Paddle Fluid now") From 145ade7db5aa5cccab3476ef524a98e33fc24f58 Mon Sep 17 00:00:00 2001 From: JiabinYang Date: Tue, 25 Jun 2019 09:37:43 +0000 Subject: [PATCH 2/2] test=develop, refine code to use cuda_devices --- python/paddle/fluid/install_check.py | 42 ++++------------------------ 1 file changed, 5 insertions(+), 37 deletions(-) diff --git a/python/paddle/fluid/install_check.py b/python/paddle/fluid/install_check.py index aca877caaee173..05907562e5e955 100644 --- a/python/paddle/fluid/install_check.py +++ b/python/paddle/fluid/install_check.py @@ -13,7 +13,7 @@ # limitations under the License. import os -from .framework import Program, program_guard, unique_name +from .framework import Program, program_guard, unique_name, cuda_places, cpu_places from .param_attr import ParamAttr from .initializer import Constant from . import layers @@ -29,31 +29,6 @@ __all__ = ['run_check'] -def process_env(): - env = os.environ - device_list = [] - if env.get('CUDA_VISIBLE_DEVICES') is not None: - cuda_devices = env['CUDA_VISIBLE_DEVICES'] - if core.get_cuda_device_count() == 0: - logging.warning( - "No CUDA Device Found! But your are using GPU version Paddle Fluid" - ) - device_list = [] - elif len(cuda_devices) == 1: - device_list.append(0) - elif len(cuda_devices) > 1: - for i in range(len(cuda_devices.split(","))): - device_list.append(i) - return device_list - else: - if core.get_cuda_device_count() > 1: - for i in range(core.get_cuda_device_count()): - device_list.append(i) - return device_list - else: - return [0] - - class SimpleLayer(Layer): def __init__(self, name_scope): super(SimpleLayer, self).__init__(name_scope) @@ -83,11 +58,10 @@ def run_check(): "You are using GPU version Paddle Fluid, But Your CUDA Device is not set properly" "\n Original Error is {}".format(e)) return 0 - device_list = process_env() + device_list = cuda_places() else: - device_list = [0, 1] # for CPU 0,1 + device_list = [core.CPUPlace(), core.CPUPlace()] - use_cuda = False if not core.is_compiled_with_cuda() else True np_inp_single = np.array([[1.0, 2.0], [3.0, 4.0]], dtype=np.float32) inp = [] for i in range(len(device_list)): @@ -102,7 +76,6 @@ def test_parallerl_exe(): with executor.scope_guard(scope): with program_guard(train_prog, startup_prog): with unique_name.guard(): - places = [] build_strategy = compiler.BuildStrategy() build_strategy.enable_inplace = True build_strategy.memory_optimize = True @@ -112,11 +85,6 @@ def test_parallerl_exe(): exe = executor.Executor( core.CUDAPlace(0) if core.is_compiled_with_cuda() and (core.get_cuda_device_count() > 0) else core.CPUPlace()) - if use_cuda: - for i in device_list: - places.append(core.CUDAPlace(i)) - else: - places = [core.CPUPlace(), core.CPUPlace()] loss = layers.mean(out) loss.persistable = True optimizer.SGD(learning_rate=0.01).minimize(loss) @@ -125,7 +93,7 @@ def test_parallerl_exe(): train_prog).with_data_parallel( build_strategy=build_strategy, loss_name=loss.name, - places=places) + places=device_list) exe.run(startup_prog) exe.run(compiled_prog, @@ -164,7 +132,7 @@ def test_simple_exe(): except Exception as e: logging.warning( "Your Paddle Fluid has some problem with multiple GPU. This may be caused by:" - "\n 1. There is only 1 GPU visible on your Device;" + "\n 1. There is only 1 or 0 GPU visible on your Device;" "\n 2. No.1 or No.2 GPU or both of them are occupied now" "\n 3. Wrong installation of NVIDIA-NCCL2, please follow instruction on https://github.com/NVIDIA/nccl-tests " "\n to test your NCCL, or reinstall it following https://docs.nvidia.com/deeplearning/sdk/nccl-install-guide/index.html"