diff --git a/ds_config.json b/ds_config.json
new file mode 100644
index 000000000..293c6e853
--- /dev/null
+++ b/ds_config.json
@@ -0,0 +1,21 @@
+{
+  "train_micro_batch_size_per_gpu": 2,
+  "train_batch_size": 64,
+  "gradient_clipping": 1.0,
+  "zero_optimization": {
+    "stage": 0
+  },
+  "bf16": {
+    "enabled": true
+  },
+  "steps_per_print": 1,
+  "wall_clock_breakdown": false,
+  "flops_profiler": {
+    "enabled": true,
+    "profile_step": 1,
+    "module_depth": -1,
+    "top_modules": 1,
+    "detailed": true,
+    "output_file": null
+  }
+}
\ No newline at end of file
diff --git a/examples/azureml/deepspeed-BLOOM-AML-SDKv2.yaml b/examples/azureml/deepspeed-BLOOM-AML-SDKv2.yaml
new file mode 100644
index 000000000..b83901a8e
--- /dev/null
+++ b/examples/azureml/deepspeed-BLOOM-AML-SDKv2.yaml
@@ -0,0 +1,98 @@
+# Training job submission via AML CLI v2
+
+$schema: https://azuremlschemas.azureedge.net/latest/commandJob.schema.json
+
+command: >-
+  python pretrain_gpt.py
+  --tensor-model-parallel-size 8
+  --pipeline-model-parallel-size 10
+  --num-layers 80
+  --hidden-size 144400
+  --num-attention-heads 96
+  --seq-length 2048
+  --loss-scale 15
+  --save-interval 100
+  --max-position-embeddings 2048
+  --micro-batch-size 1
+  --global-batch-size 64
+  --lr 6.0e-5
+  --min-lr 6.0e-6
+  --lr-decay-style 'cosine'
+  --log-interval 1
+  --eval-iters 40
+  --eval-interval 1000
+  --clip-grad 1.0
+  --weight-decay 0.1
+  --adam-beta1 0.9
+  --adam-beta2 0.95
+  --init-method-std 0.006
+  --bf16
+  --train-iters 100000
+  --data-impl 'mmap'
+  --checkpoint-activations
+  --tensorboard-dir '/tmp/outputs/tensorboard'
+  --deepspeed
+  --deepspeed_config 'ds_config.json'
+  --zero-stage 0
+  --deepspeed-activation-checkpointing
+  --exit-interval 500
+  --optimizer adam
+  --embed-layernorm
+  --sync-tp-duplicated-parameters
+  --seed 42
+  --position-embedding-type alibi
+  --tokenizer-type PretrainedFromHF
+  --abort-on-unmet-fused-kernel-constraints
+  --aml-data-download-path ${{outputs.blobstore_datadir}}
+  --data-path 0.033178301 bigscience-catalogue-lm-data_bloom_bin/ar/ar_text_document 0.011279676 bigscience-catalogue-lm-data_bloom_bin/ca/ca_text_document 0.130708086 bigscience-catalogue-lm-data_bloom_bin/code/code_text_document 0.21911033 bigscience-catalogue-lm-data_bloom_bin/en/en_text_document 0.107035252 bigscience-catalogue-lm-data_bloom_bin/es/es_text_document 0.00156473 bigscience-catalogue-lm-data_bloom_bin/eu/eu_text_document 0.130973455 bigscience-catalogue-lm-data_bloom_bin/fr/fr_text_document 0.010954583 bigscience-catalogue-lm-data_bloom_bin/id/id_text_document 0.000110574 bigscience-catalogue-lm-data_bloom_bin/indic-as/indic-as_text_document 0.005510761 bigscience-catalogue-lm-data_bloom_bin/indic-bn/indic-bn_text_document 0.000403458 bigscience-catalogue-lm-data_bloom_bin/indic-gu/indic-gu_text_document 0.007495064 bigscience-catalogue-lm-data_bloom_bin/indic-hi/indic-hi_text_document 0.000621117 bigscience-catalogue-lm-data_bloom_bin/indic-kn/indic-kn_text_document 0.001036982 bigscience-catalogue-lm-data_bloom_bin/indic-ml/indic-ml_text_document 0.000502873 bigscience-catalogue-lm-data_bloom_bin/indic-mr/indic-mr_text_document 0.000669502 bigscience-catalogue-lm-data_bloom_bin/indic-ne/indic-ne_text_document 0.000360473 bigscience-catalogue-lm-data_bloom_bin/indic-or/indic-or_text_document 0.000510136 bigscience-catalogue-lm-data_bloom_bin/indic-pa/indic-pa_text_document 0.002120798 bigscience-catalogue-lm-data_bloom_bin/indic-ta/indic-ta_text_document 0.00091605 bigscience-catalogue-lm-data_bloom_bin/indic-te/indic-te_text_document 0.001249597 bigscience-catalogue-lm-data_bloom_bin/indic-ur/indic-ur_text_document 0.000316939 bigscience-catalogue-lm-data_bloom_bin/nigercongo-all/nigercongo-all_text_document 0.081644439 bigscience-catalogue-lm-data_bloom_bin/oscar-en/oscar-en_text_document 0.055479024 bigscience-catalogue-lm-data_bloom_bin/oscar-zhs/oscar-zhs_text_document 0.049707326 bigscience-catalogue-lm-data_bloom_bin/pt/pt_text_document 0.024698813 bigscience-catalogue-lm-data_bloom_bin/vi/vi_text_document 0.121322237 bigscience-catalogue-lm-data_bloom_bin/zhs/zhs_text_document 0.000519424 bigscience-catalogue-lm-data_bloom_bin/zht/zht_text_document
+  --split '90,5,5'
+  --tokenizer-name-or-path 'bigscience-catalogue-data-dev/byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles'
+  --adam-eps 1e-8
+  --pad-vocab-size-to 250880
+  --exit-duration-in-mins 5990
+  --pp-partition-method 'type:transformer|embedding'
+code: ../../
+experiment_name: LargeModel-DistributedJob
+environment: azureml:ACPTEnv@latest
+environment_variables:
+  NCCL_DEBUG: 'WARN'
+  NCCL_DEBUG_SUBSYS: 'WARN'
+  CUDA_DEVICE_ORDER: 'PCI_BUS_ID'
+  DATASET_MOUNT_CACHE_SIZE: '10GB'
+  NCCL_SOCKET_IFNAME: 'eth0'
+  NCCL_IB_PCI_RELAXED_ORDERING: '1'
+  CUDA_LAUNCH_BLOCKING: '1'
+  UCX_TLS: 'tcp'
+  UCX_NET_DEVICES: 'eth0'
+inputs:
+  train_file:
+    type: uri_file
+    mode: download
+    path: ../../train-splits.txt
+  vocab_file:
+    type: uri_file
+    mode: download
+    path: https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json
+  merge_file:
+    type: uri_file
+    mode: download
+    path: https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt
+  valid_file:
+    type: uri_file
+    mode: download
+    path: ../../valid-splits.txt
+outputs:
+  output:
+    type: uri_folder
+    mode: rw_mount
+    path: azureml://datastores/workspaceblobstore/paths/outputs/checkpoint
+  blobstore_datadir:
+    type: uri_folder
+    mode: rw_mount
+    path: azureml://datastores/bloomdatastore/paths/bloom-data
+compute: azureml:bloom
+distribution:
+  type: pytorch
+  process_count_per_instance: 8
+resources:
+  instance_count: 10
diff --git a/examples/azureml/environment/context/Dockerfile b/examples/azureml/environment/context/Dockerfile
new file mode 100644
index 000000000..40b30d15f
--- /dev/null
+++ b/examples/azureml/environment/context/Dockerfile
@@ -0,0 +1,5 @@
+FROM mcr.microsoft.com/azureml/curated/acpt-pytorch-1.11-py38-cuda11.3-gpu
+RUN pip install git+https://github.com/microsoft/DeepSpeed.git@olruwase/bf16-updates
+RUN pip install transformers
+RUN pip install setuptools_rust
+RUN pip install datasets
\ No newline at end of file
diff --git a/examples/azureml/environment/env.yml b/examples/azureml/environment/env.yml
new file mode 100644
index 000000000..ad5b64b56
--- /dev/null
+++ b/examples/azureml/environment/env.yml
@@ -0,0 +1,27 @@
+$schema: https://azuremlschemas.azureedge.net/latest/environment.schema.json
+name: ACPTEnv
+version: 1
+build:
+  path: ./context/
+tags:
+  os: ubuntu
+  os_version: 20.04
+  hpcx: 2.10
+  mpi: openmpi
+  mpi_version: 4.1.2rc4
+  ucx: 1.12.0
+  cuda: 11.7
+  cublas: 11.10.3.66
+  cudnn: 8.4.1
+  nccl: 2.12.10
+  rapids: 22.04
+  rdma_core: 36.0
+  hpc_x: 2.10
+  nsight_compute: 2022.2.1
+  nsight_systems: 2022.1.3.3
+  nccl_test: 2.11.0
+  azureml-defaults: 1.41.0
+  mlflow: 1.25.1
+  transformers: 4.18.0
+  torch: "1.13.0a0+340c412"
+  pynvml: 11.4.1
diff --git a/megatron/arguments.py b/megatron/arguments.py
index c18235a78..b086f650a 100644
--- a/megatron/arguments.py
+++ b/megatron/arguments.py
@@ -316,6 +316,14 @@ def parse_args(extra_args_provider=None, defaults={},
         except ModuleNotFoundError:
             raise ModuleNotFoundError("Please install bitsandbytes from https://github.com/facebookresearch/bitsandbytes.")
 
+    if args.aml_data_download_path is not None:
+        data_paths = []
+        if len(args.data_path) == 1:
+            data_paths.append(f"{args.aml_data_download_path}/{args.data_path[0]}")
+        else:
+            data_paths = [val if idx % 2 == 0 else f"{args.aml_data_download_path}/{val}" for idx, val in enumerate(args.data_path)]
+        args.data_path = data_paths
+
     _print_args(args)
     return args
 
@@ -767,6 +775,8 @@ def _add_data_args(parser):
     group = parser.add_argument_group(title='data and dataloader')
 
     # option 1 for data loading (mutually exclusive with option2)
+    group.add_argument('--aml-data-download-path', type=str, default=None,
+                       help='Path to mounted input dataset')
     group.add_argument('--data-path', nargs='*', default=None,
                        help='Path to the training dataset. Accepted format:'
                        '1) a single data path, 2) multiple datasets in the'
@@ -800,7 +810,6 @@ def __call__(self, parser, args, values, option_string=None):
             datasets = prefix.split(",")
             # check if each dataset is formatted like `WEIGHT START:END PATH`
             for d in datasets:
-                assert len(d.split()) == 3, err_message
                 start, end = d.split()[1].split(":")
                 assert float(start) < float(end), err_message
 
@@ -810,7 +819,6 @@ def __call__(self, parser, args, values, option_string=None):
         weights = [[d.split()[0] for d in p.split(",")] for p in prefixes]
         splits = [[d.split()[1] for d in p.split(",")] for p in prefixes]
         paths = [[d.split()[2] for d in p.split(",")] for p in prefixes]
-
         # # to keep consistency with Option 1 of data loading (through --data-path)
        # # paths will contain strings on the following form
        # # "WEIGHTS1 PATH1 WEIGHTS2 PATH2 WEIGHTS3 PATH3" for each dataset group
@@ -936,7 +944,6 @@ def __call__(self, parser, args, values, option_string=None):
     group.add_argument("--noise-density", type=float, default=None,
                        help="Span corruption noise density")
     group.add_argument("--mean-noise-span-length", type=int, default=None,
                        help="Span corruption mean noise span length")
-
     return parser
 
@@ -948,7 +955,6 @@ def _add_autoresume_args(parser):
     group.add_argument('--adlr-autoresume-interval', type=int, default=1000,
                        help='Intervals over which check for autoresume'
                        'termination signal')
-
     return parser
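Note on the parse_args change above: when --aml-data-download-path is set, the added post-processing simply prefixes every dataset prefix in --data-path with the mounted download directory, leaving the interleaved sampling weights untouched. A minimal standalone sketch of that behavior follows; the helper name and the /mnt/bloom-data mount point are illustrative, not part of the diff.

    def prefix_data_paths(aml_data_download_path, data_path):
        # Mirror of the parse_args logic above.
        if len(data_path) == 1:
            # single-path form: ['PATH']
            return [f"{aml_data_download_path}/{data_path[0]}"]
        # weighted form: ['WEIGHT1', 'PATH1', 'WEIGHT2', 'PATH2', ...];
        # even indices are weights and stay as-is, odd indices are paths
        return [val if idx % 2 == 0 else f"{aml_data_download_path}/{val}"
                for idx, val in enumerate(data_path)]

    print(prefix_data_paths(
        "/mnt/bloom-data",
        ["0.033178301", "bigscience-catalogue-lm-data_bloom_bin/ar/ar_text_document"]))
    # ['0.033178301', '/mnt/bloom-data/bigscience-catalogue-lm-data_bloom_bin/ar/ar_text_document']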
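The YAML above follows the Azure ML command-job schema, so it can be submitted with the CLI v2 (for example, az ml job create --file examples/azureml/deepspeed-BLOOM-AML-SDKv2.yaml) or with the Python SDK v2. Below is a minimal sketch using the SDK, assuming the azure-ai-ml and azure-identity packages and placeholder workspace coordinates; the ACPTEnv environment has to be registered first so that environment: azureml:ACPTEnv@latest resolves, and the azureml:bloom compute and bloomdatastore datastore are assumed to already exist in the workspace.

    from azure.ai.ml import MLClient, load_environment, load_job
    from azure.identity import DefaultAzureCredential

    # Placeholder workspace coordinates -- substitute your own.
    ml_client = MLClient(DefaultAzureCredential(),
                         subscription_id="<subscription-id>",
                         resource_group_name="<resource-group>",
                         workspace_name="<workspace>")

    # Register the ACPTEnv environment built from the Dockerfile context above.
    ml_client.environments.create_or_update(
        load_environment("examples/azureml/environment/env.yml"))

    # Submit the BLOOM pretraining job defined in deepspeed-BLOOM-AML-SDKv2.yaml.
    job = load_job("examples/azureml/deepspeed-BLOOM-AML-SDKv2.yaml")
    returned_job = ml_client.jobs.create_or_update(job)
    print(returned_job.studio_url)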