Skip to content

Commit

Permalink
[testing-on-gke] Support per-workload gcsfuse-mount-options (#2348)
Browse files Browse the repository at this point in the history
This adds the following changes.

* gcsfuse_mount_options is no longer passed to
the run-gke-tests.sh script; it is instead
specified per workload in the
${workload_config} file.

* Helm chart names and pod names have been
shortened by using a hash of the workload
instead of the long names used previously,
which encoded workload parameters such as
blockSize/numThreads/batchSize. The scenario
name and file size are still kept in the
pod/chart name so workloads remain easy
to spot.
* Adds validation checks and unit tests for dlio_workload and fio_workload.

List of intermediate squashed commits:

* Support per-workload gcsfuse mount options

* add default gcsfuseMountOptions in workloads config

* add/update copyright headers

* remove default gcsfuse_mount_options from run-script
  • Loading branch information
gargnitingoogle authored Sep 18, 2024
1 parent 9209696 commit 0fae307
Show file tree
Hide file tree
Showing 15 changed files with 420 additions and 135 deletions.
73 changes: 66 additions & 7 deletions perfmetrics/scripts/testing_on_gke/examples/dlio/dlio_workload.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,18 @@
# Copyright 2018 The Kubernetes Authors.
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""This file defines a DlioWorkload (a DLIO Unet3d workload) and provides utility for parsing a json
test-config file for a list of them.
Expand All @@ -8,13 +23,23 @@

def validateDlioWorkload(workload: dict, name: str):
"""Validates the given json workload object."""
if 'dlioWorkload' not in workload:
print(f"{name} does not have 'dlioWorkload' key in it.")
return False

if 'bucket' not in workload:
print(f"{name} does not have 'bucket' key in it.")
return False
for requiredWorkloadAttribute, expectedType in {
'bucket': str,
'gcsfuseMountOptions': str,
'dlioWorkload': dict,
}.items():
if requiredWorkloadAttribute not in workload:
print(f"{name} does not have '{requiredWorkloadAttribute}' key in it.")
return False
if not type(workload[requiredWorkloadAttribute]) is expectedType:
print(
f"In {name}, the type of '{requiredWorkloadAttribute}' is of type"
f" '{type(workload[requiredWorkloadAttribute])}', not {expectedType}"
)
return False
if expectedType == str and ' ' in workload[requiredWorkloadAttribute]:
print(f"{name} has space in the value of '{requiredWorkloadAttribute}'")
return False

if 'fioWorkload' in workload:
print(f"{name} has 'fioWorkload' key in it, which is unexpected.")
Expand Down Expand Up @@ -73,6 +98,14 @@ class DlioWorkload:
4. bucket (str): Name of a GCS bucket to read input files from.
5. batchSizes (set of ints): a set of ints representing multiple batchsize
values to test.
6. gcsfuseMountOptions (str): gcsfuse mount options as a single
string in compact stringified format, to be used for the
test scenario "gcsfuse-generic". The individual config/cli flag values should
be separated by comma. Each cli flag should be of the form "<flag>[=<value>]",
while each config-file flag should be of form
"<config>[:<subconfig>[:<subsubconfig>[...]]]:<value>". For example, a legal
value would be:
"implicit-dirs,file_mode=777,file-cache:enable-parallel-downloads:true,metadata-cache:ttl-secs:true".
"""

def __init__(
Expand All @@ -82,12 +115,14 @@ def __init__(
recordLength: int,
bucket: str,
batchSizes: list,
gcsfuseMountOptions: str,
):
self.scenario = scenario
self.numFilesTrain = numFilesTrain
self.recordLength = recordLength
self.bucket = bucket
self.batchSizes = set(batchSizes)
self.gcsfuseMountOptions = gcsfuseMountOptions


def ParseTestConfigForDlioWorkloads(testConfigFileName: str):
Expand Down Expand Up @@ -119,6 +154,30 @@ def ParseTestConfigForDlioWorkloads(testConfigFileName: str):
dlioWorkload['recordLength'],
workload['bucket'],
dlioWorkload['batchSizes'],
workload['gcsfuseMountOptions'],
)
)
return dlioWorkloads


def DlioChartNamePodName(
    dlioWorkload: 'DlioWorkload', instanceID: str, batchSize: int
) -> tuple[str, str, str]:
  """Returns names derived from a DLIO workload run.

  Args:
    dlioWorkload: the DLIO workload object; its scenario, recordLength and
      numFilesTrain attributes are read here.
    instanceID: identifier of this test run, used both in the hash input and
      as the leading component of the output-path prefix.
    batchSize: the batch size of this particular run.

  Returns:
    A 3-tuple of (helm chart name, pod name, output-path prefix). The chart
    and pod names embed a shortened scenario name, the record length and a
    hash of the workload; the output prefix additionally embeds
    numFilesTrain and batchSize so that parse_logs can recover them.
  """
  # Map known scenarios to short tokens to keep chart/pod names compact;
  # anything unrecognized falls back to 'other'.
  shortenScenario = {
      'local-ssd': 'ssd',
      'gcsfuse-generic': 'gcsfuse',
  }
  shortForScenario = shortenScenario.get(dlioWorkload.scenario, 'other')

  # NOTE(review): hash() of a DlioWorkload instance falls back to
  # identity-based hashing (no __hash__ defined on the class in this diff),
  # and str hashing is salted per process, so this value is only stable
  # within a single process run — confirm that is acceptable for naming.
  # The '-' strip removes the sign of a negative hash so the name stays
  # a valid DNS-ish label fragment.
  hashOfWorkload = str(hash((instanceID, batchSize, dlioWorkload))).replace(
      '-', ''
  )
  return (
      f'dlio-unet3d-{shortForScenario}-{dlioWorkload.recordLength}-{hashOfWorkload}',
      f'dlio-tester-{shortForScenario}-{dlioWorkload.recordLength}-{hashOfWorkload}',
      f'{instanceID}/{dlioWorkload.numFilesTrain}-{dlioWorkload.recordLength}-{batchSize}-{hashOfWorkload}/{dlioWorkload.scenario}',
  )
Original file line number Diff line number Diff line change
@@ -1,3 +1,18 @@
# Copyright 2018 The Kubernetes Authors.
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""This file defines unit tests for functionalities in dlio_workload.py"""

import unittest
Expand All @@ -9,9 +24,20 @@ class DlioWorkloadTest(unittest.TestCase):
def test_validate_dlio_workload_empty(self):
  """An empty workload dict must fail validation."""
  empty_workload = {}
  self.assertFalse(validateDlioWorkload(empty_workload, "empty-dlio-workload"))

def test_validate_dlio_workload_invalid_no_bucket(self):
def test_validate_dlio_workload_invalid_missing_bucket(self):
  """A workload lacking the 'bucket' key must fail validation."""
  workload = {"dlioWorkload": {}, "gcsfuseMountOptions": ""}
  self.assertFalse(
      validateDlioWorkload(workload, "invalid-dlio-workload-missing-bucket")
  )

def test_validate_dlio_workload_invalid_bucket_contains_space(self):
  """A 'bucket' value containing a space must fail validation."""
  workload = {"dlioWorkload": {}, "gcsfuseMountOptions": "", "bucket": " "}
  self.assertFalse(
      validateDlioWorkload(
          workload, "invalid-dlio-workload-bucket-contains-space"
      )
  )

def test_validate_dlio_workload_invalid_no_dlioWorkloadSpecified(self):
Expand All @@ -22,7 +48,11 @@ def test_validate_dlio_workload_invalid_no_dlioWorkloadSpecified(self):
def test_validate_dlio_workload_invalid_commented_out_dlioWorkload(self):
  """A dlioWorkload key 'commented out' with a leading '_' must fail validation."""
  workload = {
      "_dlioWorkload": {},
      "bucket": "dummy-bucket",
      "gcsfuseMountOptions": "implicit-dirs,cache-max-size:-1",
  }
  self.assertFalse(
      validateDlioWorkload(workload, "commented-out-dlio-workload")
  )
Expand All @@ -34,6 +64,7 @@ def test_validate_dlio_workload_invalid_mixed_dlioWorkload_fioWorkload(self):
"dlioWorkload": {},
"fioWorkload": {},
"bucket": "dummy-bucket",
"gcsfuseMountOptions": "implicit-dirs,cache-max-size:-1",
}),
"mixed-dlio/fio-workload",
)
Expand All @@ -46,6 +77,7 @@ def test_validate_dlio_workload_invalid_missing_numFilesTrain(self):
"batchSizes": [100, 200],
},
"bucket": "dummy-bucket",
"gcsfuseMountOptions": "implicit-dirs,cache-max-size:-1",
})
self.assertFalse(
validateDlioWorkload(
Expand All @@ -62,6 +94,7 @@ def test_validate_dlio_workload_invalid_unsupported_numFilesTrain(self):
"batchSizes": [100, 200],
},
"bucket": "dummy-bucket",
"gcsfuseMountOptions": "implicit-dirs,cache-max-size:-1",
})
self.assertFalse(
validateDlioWorkload(
Expand All @@ -77,6 +110,7 @@ def test_validate_dlio_workload_invalid_missing_recordLength(self):
"batchSizes": [100, 200],
},
"bucket": "dummy-bucket",
"gcsfuseMountOptions": "implicit-dirs,cache-max-size:-1",
})
self.assertFalse(
validateDlioWorkload(
Expand All @@ -93,6 +127,7 @@ def test_validate_dlio_workload_invalid_unsupported_recordLength(self):
"batchSizes": [100, 200],
},
"bucket": "dummy-bucket",
"gcsfuseMountOptions": "implicit-dirs,cache-max-size:-1",
})
self.assertFalse(
validateDlioWorkload(
Expand All @@ -101,13 +136,69 @@ def test_validate_dlio_workload_invalid_unsupported_recordLength(self):
)
pass

def test_validate_dlio_workload_invalid_missing_gcsfuseMountOptions(self):
  """A workload lacking the 'gcsfuseMountOptions' key must fail validation."""
  workload = {
      "dlioWorkload": {
          "numFilesTrain": 1000,
          "recordLength": 100,
          "batchSizes": [100, 200],
      },
      "bucket": "dummy-bucket",
  }
  self.assertFalse(
      validateDlioWorkload(
          workload, "invalid-dlio-workload-missing-gcsfuseMountOptions"
      )
  )

def test_validate_dlio_workload_invalid_unsupported_gcsfuseMountOptions(
    self,
):
  """A non-string 'gcsfuseMountOptions' value must fail validation."""
  workload = {
      "dlioWorkload": {
          "numFilesTrain": 1000,
          "recordLength": 10000,
          "batchSizes": [100, 200],
      },
      "bucket": "dummy-bucket",
      # Deliberately an int, not the expected str.
      "gcsfuseMountOptions": 100,
  }
  self.assertFalse(
      validateDlioWorkload(
          workload, "invalid-dlio-workload-unsupported-gcsfuseMountOptions1"
      )
  )

def test_validate_dlio_workload_invalid_gcsfuseMountOptions_contains_space(
    self,
):
  """A 'gcsfuseMountOptions' value containing a space must fail validation."""
  workload = {
      "dlioWorkload": {
          "numFilesTrain": 1000,
          "recordLength": 10000,
          "batchSizes": [100, 200],
      },
      "bucket": "dummy-bucket",
      "gcsfuseMountOptions": "abc def",
  }
  self.assertFalse(
      validateDlioWorkload(
          workload,
          "invalid-dlio-workload-unsupported-gcsfuseMountOptions-contains-space",
      )
  )

def test_validate_dlio_workload_invalid_missing_batchSizes(self):
workload = dict({
"dlioWorkload": {
"numFilesTrain": 1000,
"recordLength": 10000,
},
"bucket": "dummy-bucket",
"gcsfuseMountOptions": "implicit-dirs,cache-max-size:-1",
})
self.assertFalse(
validateDlioWorkload(
Expand All @@ -124,6 +215,7 @@ def test_validate_dlio_workload_invalid_unsupported_batchSizes1(self):
"batchSizes": ["100"],
},
"bucket": "dummy-bucket",
"gcsfuseMountOptions": "implicit-dirs,cache-max-size:-1",
})
self.assertFalse(
validateDlioWorkload(
Expand All @@ -140,6 +232,7 @@ def test_validate_dlio_workload_invalid_unsupported_batchSizes2(self):
"batchSizes": [0, -1],
},
"bucket": "dummy-bucket",
"gcsfuseMountOptions": "implicit-dirs,cache-max-size:-1",
})
self.assertFalse(
validateDlioWorkload(
Expand All @@ -156,6 +249,7 @@ def test_validate_dlio_workload_valid_single_batchSize(self):
"batchSizes": [100],
},
"bucket": "dummy-bucket",
"gcsfuseMountOptions": "implicit-dirs,cache-max-size:-1",
})
self.assertTrue(validateDlioWorkload(workload, "valid-dlio-workload-2"))
pass
Expand All @@ -168,6 +262,7 @@ def test_validate_dlio_workload_valid_multiple_batchSizes(self):
"batchSizes": [100, 200],
},
"bucket": "dummy-bucket",
"gcsfuseMountOptions": "implicit-dirs,cache-max-size:-1",
})
self.assertTrue(validateDlioWorkload(workload, "valid-dlio-workload-2"))
pass
Expand Down
13 changes: 6 additions & 7 deletions perfmetrics/scripts/testing_on_gke/examples/dlio/parse_logs.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/usr/bin/env python

# Copyright 2018 The Kubernetes Authors.
# Copyright 2022 Google LLC
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -168,15 +168,14 @@ def downloadDlioOutputs(dlioWorkloads: set, instanceId: str):
continue

for i in range(summary_data["epochs"]):
test_name = summary_data["hostname"]
part_list = test_name.split("-")
key = "-".join(part_list[2:5])
key = root.split("/")[-2]
key_split = key.split("-")

if key not in output:
output[key] = {
"num_files_train": part_list[-3],
"mean_file_size": part_list[-2],
"batch_size": part_list[-1],
"num_files_train": key_split[-4],
"mean_file_size": key_split[-3],
"batch_size": key_split[-2],
"records": {
"local-ssd": [],
"gcsfuse-generic": [],
Expand Down
Loading

0 comments on commit 0fae307

Please sign in to comment.