
Bloom model training with AML #365


Open · wants to merge 2 commits into base: main
21 changes: 21 additions & 0 deletions ds_config.json
@@ -0,0 +1,21 @@
{
  "train_micro_batch_size_per_gpu": 2,
  "train_batch_size": 64,
  "gradient_clipping": 1.0,
  "zero_optimization": {
    "stage": 0
  },
  "bf16": {
    "enabled": true
  },
  "steps_per_print": 1,
  "wall_clock_breakdown": false,
  "flops_profiler": {
    "enabled": true,
    "profile_step": 1,
    "module_depth": -1,
    "top_modules": 1,
    "detailed": true,
    "output_file": null
  }
}
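
A quick sanity check of the batch decomposition implied by this config, assuming the AML job layout below (10 instances × 8 GPUs, tensor-parallel 8, pipeline-parallel 10). Note that Megatron-DeepSpeed ultimately derives these values from its own --micro-batch-size/--global-batch-size flags, so this is only an illustrative Python sketch:

world_size = 10 * 8                    # instance_count * process_count_per_instance
tp, pp = 8, 10                         # model-parallel degrees from the job command
dp = world_size // (tp * pp)           # data-parallel degree -> 1
micro = 2                              # train_micro_batch_size_per_gpu
grad_accum = 64 // (micro * dp)        # gradient-accumulation steps -> 32
assert micro * grad_accum * dp == 64   # matches train_batch_size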
98 changes: 98 additions & 0 deletions examples/azureml/deepspeed-BLOOM-AML-SDKv2.yaml
@@ -0,0 +1,98 @@
# Training job submission via AML CLI v2

$schema: https://azuremlschemas.azureedge.net/latest/commandJob.schema.json

command: >-
  python pretrain_gpt.py
  --tensor-model-parallel-size 8
  --pipeline-model-parallel-size 10
  --num-layers 80
  --hidden-size 144400
  --num-attention-heads 96
  --seq-length 2048
  --loss-scale 15
  --save-interval 100
  --max-position-embeddings 2048
  --micro-batch-size 1
  --global-batch-size 64
  --lr 6.0e-5
  --min-lr 6.0e-6
  --lr-decay-style 'cosine'
  --log-interval 1
  --eval-iters 40
  --eval-interval 1000
  --clip-grad 1.0
  --weight-decay 0.1
  --adam-beta1 0.9
  --adam-beta2 0.95
  --init-method-std 0.006
  --bf16
  --train-iters 100000
  --data-impl 'mmap'
  --checkpoint-activations
  --tensorboard-dir '/tmp/outputs/tensorboard'
  --deepspeed
  --deepspeed_config 'ds_config.json'
  --zero-stage 0
  --deepspeed-activation-checkpointing
  --exit-interval 500
  --optimizer adam
  --embed-layernorm
  --sync-tp-duplicated-parameters
  --seed 42
  --position-embedding-type alibi
  --tokenizer-type PretrainedFromHF
  --abort-on-unmet-fused-kernel-constraints
  --aml-data-download-path ${{outputs.blobstore_datadir}}
  --data-path 0.033178301 bigscience-catalogue-lm-data_bloom_bin/ar/ar_text_document 0.011279676 bigscience-catalogue-lm-data_bloom_bin/ca/ca_text_document 0.130708086 bigscience-catalogue-lm-data_bloom_bin/code/code_text_document 0.21911033 bigscience-catalogue-lm-data_bloom_bin/en/en_text_document 0.107035252 bigscience-catalogue-lm-data_bloom_bin/es/es_text_document 0.00156473 bigscience-catalogue-lm-data_bloom_bin/eu/eu_text_document 0.130973455 bigscience-catalogue-lm-data_bloom_bin/fr/fr_text_document 0.010954583 bigscience-catalogue-lm-data_bloom_bin/id/id_text_document 0.000110574 bigscience-catalogue-lm-data_bloom_bin/indic-as/indic-as_text_document 0.005510761 bigscience-catalogue-lm-data_bloom_bin/indic-bn/indic-bn_text_document 0.000403458 bigscience-catalogue-lm-data_bloom_bin/indic-gu/indic-gu_text_document 0.007495064 bigscience-catalogue-lm-data_bloom_bin/indic-hi/indic-hi_text_document 0.000621117 bigscience-catalogue-lm-data_bloom_bin/indic-kn/indic-kn_text_document 0.001036982 bigscience-catalogue-lm-data_bloom_bin/indic-ml/indic-ml_text_document 0.000502873 bigscience-catalogue-lm-data_bloom_bin/indic-mr/indic-mr_text_document 0.000669502 bigscience-catalogue-lm-data_bloom_bin/indic-ne/indic-ne_text_document 0.000360473 bigscience-catalogue-lm-data_bloom_bin/indic-or/indic-or_text_document 0.000510136 bigscience-catalogue-lm-data_bloom_bin/indic-pa/indic-pa_text_document 0.002120798 bigscience-catalogue-lm-data_bloom_bin/indic-ta/indic-ta_text_document 0.00091605 bigscience-catalogue-lm-data_bloom_bin/indic-te/indic-te_text_document 0.001249597 bigscience-catalogue-lm-data_bloom_bin/indic-ur/indic-ur_text_document 0.000316939 bigscience-catalogue-lm-data_bloom_bin/nigercongo-all/nigercongo-all_text_document 0.081644439 bigscience-catalogue-lm-data_bloom_bin/oscar-en/oscar-en_text_document 0.055479024 bigscience-catalogue-lm-data_bloom_bin/oscar-zhs/oscar-zhs_text_document 0.049707326 bigscience-catalogue-lm-data_bloom_bin/pt/pt_text_document 0.024698813 bigscience-catalogue-lm-data_bloom_bin/vi/vi_text_document 0.121322237 bigscience-catalogue-lm-data_bloom_bin/zhs/zhs_text_document 0.000519424 bigscience-catalogue-lm-data_bloom_bin/zht/zht_text_document
  --split '90,5,5'
  --tokenizer-name-or-path 'bigscience-catalogue-data-dev/byte-level-bpe-tokenizer-no-norm-250k-whitespace-and-eos-regex-alpha-v3-dedup-lines-articles'
  --adam-eps 1e-8
  --pad-vocab-size-to 250880
  --exit-duration-in-mins 5990
  --pp-partition-method 'type:transformer|embedding'
code: ../../
experiment_name: LargeModel-DistributedJob
environment: azureml:ACPTEnv@latest
environment_variables:
  NCCL_DEBUG: 'WARN'
  NCCL_DEBUG_SUBSYS: 'WARN'
  CUDA_DEVICE_ORDER: 'PCI_BUS_ID'
  DATASET_MOUNT_CACHE_SIZE: '10GB'
  NCCL_SOCKET_IFNAME: 'eth0'
  NCCL_IB_PCI_RELAXED_ORDERING: '1'
  CUDA_LAUNCH_BLOCKING: '1'
  UCX_TLS: 'tcp'
  UCX_NET_DEVICES: 'eth0'
inputs:
  train_file:
    type: uri_file
    mode: download
    path: ../../train-splits.txt
  vocab_file:
    type: uri_file
    mode: download
    path: https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json
  merge_file:
    type: uri_file
    mode: download
    path: https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt
  valid_file:
    type: uri_file
    mode: download
    path: ../../valid-splits.txt
outputs:
  output:
    type: uri_folder
    mode: rw_mount
    path: azureml://datastores/workspaceblobstore/paths/outputs/checkpoint
  blobstore_datadir:
    type: uri_folder
    mode: rw_mount
    path: azureml://datastores/bloomdatastore/paths/bloom-data
compute: azureml:bloom
distribution:
  type: pytorch
  process_count_per_instance: 8
resources:
  instance_count: 10
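
For reference, a minimal submission sketch using the Azure ML Python SDK v2; the YAML above can equally be submitted with the CLI (az ml job create -f deepspeed-BLOOM-AML-SDKv2.yaml). Subscription, resource group, and workspace names are placeholders:

from azure.ai.ml import MLClient, load_job
from azure.identity import DefaultAzureCredential

ml_client = MLClient(
    DefaultAzureCredential(),
    subscription_id="<subscription-id>",
    resource_group_name="<resource-group>",
    workspace_name="<workspace>",
)
job = load_job("examples/azureml/deepspeed-BLOOM-AML-SDKv2.yaml")
print(ml_client.jobs.create_or_update(job).studio_url)  # link to the run in Studio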
5 changes: 5 additions & 0 deletions examples/azureml/environment/context/Dockerfile
@@ -0,0 +1,5 @@
FROM mcr.microsoft.com/azureml/curated/acpt-pytorch-1.11-py38-cuda11.3-gpu
RUN pip install git+https://github.com/microsoft/DeepSpeed.git@olruwase/bf16-updates
RUN pip install transformers
RUN pip install setuptools_rust
RUN pip install datasets
27 changes: 27 additions & 0 deletions examples/azureml/environment/env.yml
@@ -0,0 +1,27 @@
$schema: https://azuremlschemas.azureedge.net/latest/environment.schema.json
name: ACPTEnv
version: 1
build:
  path: ./context/
tags:
  os: ubuntu
  os_version: 20.04
  hpcx: 2.10
  mpi: openmpi
  mpi_version: 4.1.2rc4
  ucx: 1.12.0
  cuda: 11.7
  cublas: 11.10.3.66
  cudnn: 8.4.1
  nccl: 2.12.10
  rapids: 22.04
  rdma_core: 36.0
  nsight_compute: 2022.2.1
  nsight_systems: 2022.1.3.3
  nccl_test: 2.11.0
  azureml-defaults: 1.41.0
  mlflow: 1.25.1
  transformers: 4.18.0
  torch: "1.13.0a0+340c412"
  pynvml: 11.4.1
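
The job's environment: azureml:ACPTEnv@latest reference resolves only once this environment is registered, which also triggers the image build from ./context/ above. A minimal sketch, reusing the ml_client from the submission example (CLI equivalent: az ml environment create -f env.yml):

from azure.ai.ml import load_environment

env = load_environment("examples/azureml/environment/env.yml")
ml_client.environments.create_or_update(env)  # registers ACPTEnv and builds the image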
14 changes: 10 additions & 4 deletions megatron/arguments.py
@@ -316,6 +316,14 @@ def parse_args(extra_args_provider=None, defaults={},
    except ModuleNotFoundError:
        raise ModuleNotFoundError("Please install bitsandbytes from https://github.com/facebookresearch/bitsandbytes.")

    if args.aml_data_download_path is not None:
        data_paths = []
        if len(args.data_path) == 1:
            # single dataset: prepend the AML download/mount location
            data_paths.append(f"{args.aml_data_download_path}/{args.data_path[0]}")
        else:
            # weighted list alternates WEIGHT PATH WEIGHT PATH ...;
            # only the paths (odd indices) get the prefix
            data_paths = [val if idx % 2 == 0 else f"{args.aml_data_download_path}/{val}"
                          for idx, val in enumerate(args.data_path)]
        args.data_path = data_paths

    _print_args(args)
    return args
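
To illustrate the new --aml-data-download-path rewrite above with hypothetical values: weights (even indices) pass through unchanged, while dataset prefixes (odd indices) gain the mount path:

data_path = ["0.5", "en/en_text_document", "0.5", "fr/fr_text_document"]
mount = "/mnt/blobstore/bloom-data"   # hypothetical --aml-data-download-path value
prefixed = [v if i % 2 == 0 else f"{mount}/{v}" for i, v in enumerate(data_path)]
# -> ['0.5', '/mnt/blobstore/bloom-data/en/en_text_document',
#     '0.5', '/mnt/blobstore/bloom-data/fr/fr_text_document']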

@@ -767,6 +775,8 @@ def _add_data_args(parser):


    # option 1 for data loading (mutually exclusive with option 2)
    group.add_argument('--aml-data-download-path', type=str, default=None,
                       help='Path to mounted input dataset')
    group.add_argument('--data-path', nargs='*', default=None,
                       help='Path to the training dataset. Accepted format:'
                            '1) a single data path, 2) multiple datasets in the'
@@ -800,7 +810,6 @@ def __call__(self, parser, args, values, option_string=None):
        datasets = prefix.split(",")
        # check if each dataset is formatted like `WEIGHT START:END PATH`
        for d in datasets:
            assert len(d.split()) == 3, err_message
            start, end = d.split()[1].split(":")
            assert float(start) < float(end), err_message

@@ -810,7 +819,6 @@
        weights = [[d.split()[0] for d in p.split(",")] for p in prefixes]
        splits = [[d.split()[1] for d in p.split(",")] for p in prefixes]
        paths = [[d.split()[2] for d in p.split(",")] for p in prefixes]

        # # to keep consistency with Option 1 of data loading (through --data-path)
        # # paths will contain strings on the following form
        # # "WEIGHTS1 PATH1 WEIGHTS2 PATH2 WEIGHTS3 PATH3" for each dataset group
@@ -936,7 +944,6 @@ def __call__(self, parser, args, values, option_string=None):
group.add_argument("--noise-density", type=float, default=None, help="Span corruption noise density")
group.add_argument("--mean-noise-span-length", type=int, default=None, help="Span corruption mean noise span length")


return parser


@@ -948,7 +955,6 @@ def _add_autoresume_args(parser):
    group.add_argument('--adlr-autoresume-interval', type=int, default=1000,
                       help='Intervals over which check for autoresume'
                            'termination signal')
    return parser

