Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Multi-worker MLPerf benchmarks on GPUs using NVIDIA's v0.6 submission #2810

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGES.next.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,7 @@
- Add support for setting virtual NIC type for GCP VMs.
- Add ability to collect /proc/meminfo data with --collect_meminfo
- Add support for setting egress bandwidth tier for GCP VMs.
- Add MLPerf multi-worker benchmark based on NVIDIA's v0.6 submission.

### Enhancements:

Expand Down
1 change: 1 addition & 0 deletions perfkitbenchmarker/benchmark_sets.py
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,7 @@
'iperf',
'mesh_network',
'mlperf',
'mlperf_multiworkers',
'mnist',
'mongodb_ycsb',
'multichase',
Expand Down
191 changes: 114 additions & 77 deletions perfkitbenchmarker/linux_benchmarks/mlperf_benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,9 +56,26 @@
boot_disk_size: 105
"""

flags.DEFINE_enum('mlperf_benchmark', 'resnet',
['resnet', 'transformer', 'mask', 'gnmt', 'ssd', 'minigo'],
TRANSFORMER = 'transformer'
RESNET = 'resnet'
MASK = 'mask'
GNMT = 'gnmt'
SSD = 'ssd'
MINIGO = 'minigo'

flags.DEFINE_enum('mlperf_benchmark', RESNET,
[RESNET, TRANSFORMER, MASK, GNMT, SSD, MINIGO],
'MLPerf benchmark test to run.')

NVPROF = 'nvprof'
TFPROF = 'tfprof'
NONE = 'none'

flags.DEFINE_enum('mlperf_profiler', NONE, [NVPROF, TFPROF, NONE],
'profiler used to analysis GPU training')
flags.DEFINE_integer('mlperf_profile_steps', 20, 'number of steps to profile')
flags.DEFINE_string('mlperf_bucket', None, 'GCS bucket for mlperf results.')

flags.DEFINE_string(
'mlperf_gcs_resnet_checkpoint',
'gs://p3rf-mlperf/resnet50-checkpoint-2018-02-07/model.ckpt-112603',
Expand All @@ -73,7 +90,7 @@
flags.DEFINE_string('gnmt_data_dir',
'gs://p3rf-mlperf/mlperf_v0.6_nv_gnmt/',
'Directory where the nv v0.6 WMT dataset is stored')
flags.DEFINE_string('minigo_model_dir', None,
flags.DEFINE_string('minigo_model_dir', '',
'Directory on GCS to copy minigo source data from. Files '
'will be copied from subdirectories of src_dir '
'corresponding to the board size.')
Expand Down Expand Up @@ -116,14 +133,15 @@ def _DownloadData(data_dir, data_path, vm):
"""

vm.Install('google_cloud_sdk')
vm.RemoteCommand('if [ ! -d \"{data_path}\" ]; then '
' sudo mkdir -p {data_path} && '
' sudo chmod a+w {data_path} && '
' {gsutil_path} -m cp -r {data_dir}/* {data_path} ;'
'fi'.format(
data_dir=data_dir,
gsutil_path=google_cloud_sdk.GSUTIL_PATH,
data_path=data_path))
vm.RemoteCommand(
'if [ ! -d \"{data_path}\" ]; then '
' sudo mkdir -p {data_path} && '
' sudo chmod a+w {data_path} && '
' {gsutil_path} -m cp -r {data_dir}/* {data_path} ;'
'fi'.format(
data_dir=data_dir,
gsutil_path=google_cloud_sdk.GSUTIL_PATH,
data_path=data_path))


def Prepare(benchmark_spec, vm=None):
Expand Down Expand Up @@ -156,9 +174,15 @@ def Prepare(benchmark_spec, vm=None):
if vm == benchmark_spec.vms[0]:
storage_service = gcs.GoogleCloudStorageService()
benchmark_spec.storage_service = storage_service
bucket = 'pkb{}'.format(FLAGS.run_uri)
if FLAGS.mlperf_bucket:
bucket = FLAGS.mlperf_bucket
benchmark_spec.model_dir = 'gs://{bucket}/pkb-{uri}'.format(
bucket=bucket, uri=FLAGS.run_uri)
else:
bucket = 'pkb-{uri}'.format(uri=FLAGS.run_uri)
benchmark_spec.model_dir = 'gs://{}'.format(bucket)

benchmark_spec.bucket = bucket
benchmark_spec.model_dir = 'gs://{}'.format(bucket)
location = benchmark_spec.tpu_groups['train'].GetZone()
storage_service.PrepareService(util.GetRegionFromZone(location))
storage_service.MakeBucket(bucket)
Expand All @@ -182,9 +206,9 @@ def Prepare(benchmark_spec, vm=None):
'MLPerf configurations do not support the hardware in PKB. PKB may '
'need to be updated if this is a new TPU type.')

if 'mask' in benchmark_spec.benchmark:
if MASK in benchmark_spec.benchmark:
model = 'mask_rcnn'
elif 'gnmt' in benchmark_spec.benchmark:
elif GNMT in benchmark_spec.benchmark:
model = 'nmt'
else:
model = benchmark_spec.benchmark
Expand All @@ -197,20 +221,21 @@ def Prepare(benchmark_spec, vm=None):

vm.RemoteCommand('pip3 install --upgrade pyyaml==3.13 ')
vm.RemoteCommand('pip3 install cloud-tpu-profiler==1.12')
if ('mask' in benchmark_spec.benchmark or
'ssd' in benchmark_spec.benchmark):
if (MASK in benchmark_spec.benchmark or
SSD in benchmark_spec.benchmark):
# TODO(user): coco whl package for python 3.5
vm.RemoteCommand(
'cd /tmp && '
'wget https://storage.cloud.google.com/mlperf_artifcats/v0.6_training/coco-1.1-cp36-cp36m-linux_x86_64.whl'
)

run_script = posixpath.join(run_path, 'setup.sh')
vm_util.ReplaceText(vm, '--progress-bar off', ' ', run_script)
vm_util.ReplaceText(vm, 'pip ', 'pip3 ', run_script)
vm.RemoteCommand('chmod 755 {script} && {script}'.format(script=run_script))
setup_script = posixpath.join(run_path, 'setup.sh')
vm_util.ReplaceText(vm, '--progress-bar off', ' ', setup_script)
vm_util.ReplaceText(vm, 'pip ', 'pip3 ', setup_script)
vm.RemoteCommand(
'chmod 755 {script} && {script}'.format(script=setup_script))

if 'mask' not in benchmark_spec.benchmark:
if MASK not in benchmark_spec.benchmark:
vm.RemoteCommand(
'pip3 uninstall -y tf-estimator-nightly && '
'pip3 install tf-estimator-nightly==1.14.0.dev2019051801')
Expand Down Expand Up @@ -272,10 +297,10 @@ def Prepare(benchmark_spec, vm=None):
run_script)
vm.RemoteCommand('chmod 755 {}'.format(run_script))

if 'gnmt' in benchmark_spec.benchmark:
run_script = posixpath.join(code_path, model, 'metric.py')
if GNMT in benchmark_spec.benchmark:
metric_script = posixpath.join(code_path, model, 'metric.py')
vm_util.ReplaceText(vm, ' sacrebleu -t', ' python3 -m sacrebleu -t',
run_script)
metric_script)
else:
benchmark_spec.model_dir = '/tmp'

Expand All @@ -286,51 +311,51 @@ def Prepare(benchmark_spec, vm=None):
vm.Install('nvidia_docker')
vm.RemoteCommand('if [ ! -d "/data" ]; then sudo ln -s /scratch /data; fi')

if 'resnet' in benchmark_spec.benchmark:
if RESNET in benchmark_spec.benchmark:
vm.RemoteCommand(
'cd training_results_v0.6/NVIDIA/benchmarks/resnet/implementations/mxnet &&'
' sudo docker build --pull --network=host . -t mlperf-nvidia:image_classification',
' sudo docker build --network=host . -t mlperf-nvidia:image_classification',
should_log=True)
_DownloadData(benchmark_spec.imagenet_data_dir,
posixpath.join('/data', 'imagenet'), vm)

if 'transformer' in benchmark_spec.benchmark:
if TRANSFORMER in benchmark_spec.benchmark:
vm.RemoteCommand(
'cd training_results_v0.6/NVIDIA/benchmarks/transformer/implementations/pytorch &&'
' sudo docker build --pull --network=host . -t mlperf-nvidia:translation',
' sudo docker build --network=host . -t mlperf-nvidia:translation',
should_log=True)
_DownloadData(benchmark_spec.wmt_data_dir, posixpath.join('/data', 'wmt'),
vm)

if 'minigo' in benchmark_spec.benchmark:
if MINIGO in benchmark_spec.benchmark:
build_path = 'training_results_v0.6/NVIDIA/benchmarks/minigo/implementations/tensorflow'
run_script = posixpath.join(build_path, 'run_and_time.sh')
vm_util.ReplaceText(vm, 'get_data.py', 'get_data.py --src_dir={}'.format(
FLAGS.minigo_model_dir.replace('/', r'\/')), run_script)
vm.RemoteCommand('cd {} && sudo docker build --pull --network=host -t '
vm.RemoteCommand('cd {} && sudo docker build --network=host -t '
'mlperf-nvidia:minigo .'.format(build_path),
should_log=True)

if 'mask' in benchmark_spec.benchmark:
if MASK in benchmark_spec.benchmark:
vm.RemoteCommand(
'cd training_results_v0.6/NVIDIA/benchmarks/maskrcnn/implementations/pytorch && '
'sudo docker build --pull --network=host -t mlperf-nvidia:object_detection . ',
'sudo docker build --network=host -t mlperf-nvidia:object_detection . ',
should_log=True)
_DownloadData(benchmark_spec.coco_data_dir,
posixpath.join('/data', 'coco2017'), vm)

if 'gnmt' in benchmark_spec.benchmark:
if GNMT in benchmark_spec.benchmark:
vm.RemoteCommand(
'cd training_results_v0.6/NVIDIA/benchmarks/gnmt/implementations/pytorch && '
'sudo docker build --pull --network=host -t mlperf-nvidia:rnn_translator . ',
'sudo docker build --network=host -t mlperf-nvidia:rnn_translator . ',
should_log=True)
_DownloadData(benchmark_spec.gnmt_data_dir,
posixpath.join('/data', 'gnmt'), vm)

if 'ssd' in benchmark_spec.benchmark:
if SSD in benchmark_spec.benchmark:
vm.RemoteCommand(
'cd training_results_v0.6/NVIDIA/benchmarks/ssd/implementations/pytorch && '
'sudo docker build --pull --network=host -t mlperf-nvidia:single_stage_detector . ',
'sudo docker build --network=host -t mlperf-nvidia:single_stage_detector . ',
should_log=True)
_DownloadData(benchmark_spec.coco_data_dir,
posixpath.join('/data', 'coco2017'), vm)
Expand Down Expand Up @@ -362,7 +387,7 @@ def _CreateMetadataDict(benchmark_spec):
return metadata


def MakeSamplesFromOutput(metadata, output, use_tpu=False, model='resnet'):
def MakeSamplesFromOutput(metadata, output, use_tpu=False, model=RESNET):
"""Create samples containing metrics.

Args:
Expand All @@ -388,10 +413,10 @@ def MakeSamplesFromOutput(metadata, output, use_tpu=False, model='resnet'):
start = wall_time
metadata_copy = metadata.copy()
epoch = regex_util.ExtractExactlyOneMatch(r'"epoch_num": (\d+)', result)
if ('transformer' in model and (not use_tpu)):
if (TRANSFORMER in model and (not use_tpu)):
value = float(regex_util.ExtractExactlyOneMatch(r'"value": "(\d+\.\d+)"',
result))
elif 'mask' in model:
elif MASK in model:
mask_value, mask_metadata = regex_util.ExtractExactlyOneMatch(
r'^"value": (.*?), "metadata": (.*)$', result)
metadata_copy.update(json.loads(mask_value)['accuracy'])
Expand Down Expand Up @@ -424,7 +449,7 @@ def MakeSamplesFromOutput(metadata, output, use_tpu=False, model='resnet'):
metadata))

if not use_tpu:
if 'minigo' in model:
if MINIGO in model:
times = regex_util.ExtractAllMatches(r'RESULT,.*,(\d+),.*,.*', output)
else:
times = regex_util.ExtractAllMatches(r'RESULT,.*,.*,(\d+),.*,.*', output)
Expand Down Expand Up @@ -464,22 +489,23 @@ def Run(benchmark_spec):
model=benchmark_spec.benchmark,
tpus=benchmark_spec.tpu_groups['train'].GetAcceleratorType()))

if 'mask' in benchmark_spec.benchmark:
if MASK in benchmark_spec.benchmark:
model = 'mask_rcnn'
elif 'gnmt' in benchmark_spec.benchmark:
elif GNMT in benchmark_spec.benchmark:
model = 'nmt'
else:
model = benchmark_spec.benchmark

mlperf_benchmark_cmd = ('cd {code_path} && '
'export PYTHONPATH=$(pwd):$(pwd)/{model} && '
'cd {model} && '
'{run_path}/run_and_time.sh'.format(
code_path=code_path,
model=model,
run_path=run_path))
mlperf_benchmark_cmd = (
'cd {code_path} && '
'export PYTHONPATH=$(pwd):$(pwd)/{model} && '
'cd {model} && '
'{run_path}/run_and_time.sh'.format(
code_path=code_path,
model=model,
run_path=run_path))

if 'ssd' in benchmark_spec.benchmark:
if SSD in benchmark_spec.benchmark:
mlperf_benchmark_cmd = (
'export '
'MLP_GCS_RESNET_CHECKPOINT={checkpoint}'
Expand All @@ -493,48 +519,59 @@ def Run(benchmark_spec):

else:
benchmark_path = '$HOME/training_results_v0.6/NVIDIA/benchmarks'
common_env = 'DGXSYSTEM=DGX1 NEXP=1'
if 'resnet' in benchmark_spec.benchmark:
common_env = ('DGXSYSTEM=DGX1 NEXP=1 PULL=0 LOGDIR=/tmp/{benchmark}'
.format(benchmark=benchmark_spec.benchmark))

if RESNET in benchmark_spec.benchmark:
run_path = posixpath.join(benchmark_path, 'resnet/implementations/mxnet')
env = 'DATADIR=/data/imagenet LOGDIR=/tmp/resnet PULL=0'
elif 'transformer' in benchmark_spec.benchmark:
env = 'DATADIR=/data/imagenet '

elif TRANSFORMER in benchmark_spec.benchmark:
run_path = posixpath.join(benchmark_path,
'transformer/implementations/pytorch')
env = 'DATADIR=/data/wmt/utf8 LOGDIR=/tmp/transformer PULL=0'
elif 'minigo' in benchmark_spec.benchmark:
env = 'DATADIR=/data/wmt/utf8 '

elif MINIGO in benchmark_spec.benchmark:
run_path = posixpath.join(benchmark_path,
'minigo/implementations/tensorflow')
env = 'LOGDIR=/tmp/minigo CONT=mlperf-nvidia:minigo'
elif 'mask' in benchmark_spec.benchmark:
env = 'CONT=mlperf-nvidia:minigo '

elif MASK in benchmark_spec.benchmark:
run_path = posixpath.join(benchmark_path,
'maskrcnn/implementations/pytorch')
env = 'LOGDIR=/tmp/mask DATADIR=/data PULL=0'
elif 'gnmt' in benchmark_spec.benchmark:
env = 'DATADIR=/data '

elif GNMT in benchmark_spec.benchmark:
run_path = posixpath.join(benchmark_path, 'gnmt/implementations/pytorch')
env = 'LOGDIR=/tmp/gnmt DATADIR=/data/gnmt PULL=0'
elif 'ssd' in benchmark_spec.benchmark:
env = 'DATADIR=/data/gnmt '

elif SSD in benchmark_spec.benchmark:
run_path = posixpath.join(benchmark_path, 'ssd/implementations/pytorch')
env = 'LOGDIR=/tmp/ssd DATADIR=/data PULL=0'
env = 'DATADIR=/data '

run_script = posixpath.join(run_path, 'run.sub')
vm_util.ReplaceText(vm, 'SYSLOGGING=1', 'SYSLOGGING=0', run_script)
mlperf_benchmark_cmd = (
'cd {run_path} && chmod 755 run.sub && sudo {common_env} {env} '
'./run.sub'.format(run_path=run_path, common_env=common_env, env=env))
if nvidia_driver.CheckNvidiaGpuExists(vm):
common_env = '{common_env} {tf_env}'.format(
common_env=common_env, tf_env=tensorflow.GetEnvironmentVars(vm))

if nvidia_driver.CheckNvidiaGpuExists(vm):
mlperf_benchmark_cmd = '{env} {cmd}'.format(
env=tensorflow.GetEnvironmentVars(vm), cmd=mlperf_benchmark_cmd)
mlperf_benchmark_cmd = (
'chmod 755 {run_script} && '
'cd {run_path} && '
'sudo {common_env} {env} {run_script} '
.format(run_script=run_script, run_path=run_path,
common_env=common_env, env=env))

samples = []
metadata = _CreateMetadataDict(benchmark_spec)
stdout, _ = vm.RobustRemoteCommand(mlperf_benchmark_cmd, should_log=True)
samples.extend(
MakeSamplesFromOutput(
metadata,
stdout,
use_tpu=bool(benchmark_spec.tpus),
model=benchmark_spec.benchmark))
if NONE in FLAGS.mlperf_profiler:
samples.extend(
MakeSamplesFromOutput(
metadata,
stdout,
use_tpu=bool(benchmark_spec.tpus),
model=benchmark_spec.benchmark))
return samples


Expand All @@ -545,5 +582,5 @@ def Cleanup(benchmark_spec):
benchmark_spec: The benchmark specification. Contains all data that is
required to run the benchmark.
"""
if benchmark_spec.tpus:
if (benchmark_spec.tpus and FLAGS.mlperf_bucket is None):
benchmark_spec.storage_service.DeleteBucket(benchmark_spec.bucket)
Loading