Skip to content

Commit

Permalink
[testing-on-gke] Support per-workload gcsfuse-mount-options (#2348)
Browse files Browse the repository at this point in the history
This adds the following changes.

* gcsfuse_mount_options is no longer passed to
the run-gke-tests.sh script; it is instead
specified per workload in the
${workload_config} file.

* Helm chart names and pod names have been
shortened by using a hash of the workload
instead of the long names used previously,
which encoded workload parameters such as
blockSize/numThreads/batchSize. The scenario
name and file size are still kept in the
pod/chart name so workloads remain easy
to spot.
* Adds validation checks and unit tests for dlio_workload and fio_workload.

List of intermediate squashed commits:

* Support per-workload gcsfuse mount options

* add default gcsfuseMountOptions in workloads config

* add/update copyright headers

* remove default gcsfuse_mount_options from run-script
  • Loading branch information
gargnitingoogle authored Sep 18, 2024
1 parent 9209696 commit 0fae307
Show file tree
Hide file tree
Showing 15 changed files with 420 additions and 135 deletions.
73 changes: 66 additions & 7 deletions perfmetrics/scripts/testing_on_gke/examples/dlio/dlio_workload.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,18 @@
# Copyright 2018 The Kubernetes Authors.
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""This file defines a DlioWorkload (a DLIO Unet3d workload) and provides utility for parsing a json
test-config file for a list of them.
Expand All @@ -8,13 +23,23 @@

def validateDlioWorkload(workload: dict, name: str):
"""Validates the given json workload object."""
if 'dlioWorkload' not in workload:
print(f"{name} does not have 'dlioWorkload' key in it.")
return False

if 'bucket' not in workload:
print(f"{name} does not have 'bucket' key in it.")
return False
for requiredWorkloadAttribute, expectedType in {
'bucket': str,
'gcsfuseMountOptions': str,
'dlioWorkload': dict,
}.items():
if requiredWorkloadAttribute not in workload:
print(f"{name} does not have '{requiredWorkloadAttribute}' key in it.")
return False
if not type(workload[requiredWorkloadAttribute]) is expectedType:
print(
f"In {name}, the type of '{requiredWorkloadAttribute}' is of type"
f" '{type(workload[requiredWorkloadAttribute])}', not {expectedType}"
)
return False
if expectedType == str and ' ' in workload[requiredWorkloadAttribute]:
print(f"{name} has space in the value of '{requiredWorkloadAttribute}'")
return False

if 'fioWorkload' in workload:
print(f"{name} has 'fioWorkload' key in it, which is unexpected.")
Expand Down Expand Up @@ -73,6 +98,14 @@ class DlioWorkload:
4. bucket (str): Name of a GCS bucket to read input files from.
5. batchSizes (set of ints): a set of ints representing multiple batchsize
values to test.
6. gcsfuseMountOptions (str): gcsfuse mount options as a single
string in compact stringified format, to be used for the
test scenario "gcsfuse-generic". The individual config/cli flag values should
be separated by comma. Each cli flag should be of the form "<flag>[=<value>]",
while each config-file flag should be of form
"<config>[:<subconfig>[:<subsubconfig>[...]]]:<value>". For example, a legal
value would be:
"implicit-dirs,file_mode=777,file-cache:enable-parallel-downloads:true,metadata-cache:ttl-secs:true".
"""

def __init__(
Expand All @@ -82,12 +115,14 @@ def __init__(
recordLength: int,
bucket: str,
batchSizes: list,
gcsfuseMountOptions: str,
):
self.scenario = scenario
self.numFilesTrain = numFilesTrain
self.recordLength = recordLength
self.bucket = bucket
self.batchSizes = set(batchSizes)
self.gcsfuseMountOptions = gcsfuseMountOptions


def ParseTestConfigForDlioWorkloads(testConfigFileName: str):
Expand Down Expand Up @@ -119,6 +154,30 @@ def ParseTestConfigForDlioWorkloads(testConfigFileName: str):
dlioWorkload['recordLength'],
workload['bucket'],
dlioWorkload['batchSizes'],
workload['gcsfuseMountOptions'],
)
)
return dlioWorkloads


def DlioChartNamePodName(
    dlioWorkload: 'DlioWorkload', instanceID: str, batchSize: int
) -> tuple[str, str, str]:
  """Returns names derived from a DLIO workload run.

  Args:
    dlioWorkload: the DLIO workload object; its scenario, recordLength and
      numFilesTrain attributes are read here.
    instanceID: identifier of this test run, used both in the hash input and
      as the leading component of the output-path prefix.
    batchSize: the batch size of this particular run.

  Returns:
    A 3-tuple of (helm chart name, pod name, output-path prefix). The chart
    and pod names embed a shortened scenario name, the record length and a
    hash of the workload; the output prefix additionally embeds
    numFilesTrain and batchSize so that parse_logs can recover them.
  """
  # Map known scenarios to short tokens to keep chart/pod names compact;
  # anything unrecognized falls back to 'other'.
  shortenScenario = {
      'local-ssd': 'ssd',
      'gcsfuse-generic': 'gcsfuse',
  }
  shortForScenario = shortenScenario.get(dlioWorkload.scenario, 'other')

  # NOTE(review): hash() of a DlioWorkload instance falls back to
  # identity-based hashing (no __hash__ defined on the class in this diff),
  # and str hashing is salted per process, so this value is only stable
  # within a single process run — confirm that is acceptable for naming.
  # The '-' strip removes the sign of a negative hash so the name stays
  # a valid DNS-ish label fragment.
  hashOfWorkload = str(hash((instanceID, batchSize, dlioWorkload))).replace(
      '-', ''
  )
  return (
      f'dlio-unet3d-{shortForScenario}-{dlioWorkload.recordLength}-{hashOfWorkload}',
      f'dlio-tester-{shortForScenario}-{dlioWorkload.recordLength}-{hashOfWorkload}',
      f'{instanceID}/{dlioWorkload.numFilesTrain}-{dlioWorkload.recordLength}-{batchSize}-{hashOfWorkload}/{dlioWorkload.scenario}',
  )
Original file line number Diff line number Diff line change
@@ -1,3 +1,18 @@
# Copyright 2018 The Kubernetes Authors.
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""This file defines unit tests for functionalities in dlio_workload.py"""

import unittest
Expand All @@ -9,9 +24,20 @@ class DlioWorkloadTest(unittest.TestCase):
def test_validate_dlio_workload_empty(self):
  """An empty workload dict must fail validation."""
  empty_workload = {}
  self.assertFalse(validateDlioWorkload(empty_workload, "empty-dlio-workload"))

def test_validate_dlio_workload_invalid_no_bucket(self):
def test_validate_dlio_workload_invalid_missing_bucket(self):
  """A workload lacking the 'bucket' key must fail validation."""
  workload = {"dlioWorkload": {}, "gcsfuseMountOptions": ""}
  self.assertFalse(
      validateDlioWorkload(workload, "invalid-dlio-workload-missing-bucket")
  )

def test_validate_dlio_workload_invalid_bucket_contains_space(self):
  """A 'bucket' value containing a space must fail validation."""
  workload = {"dlioWorkload": {}, "gcsfuseMountOptions": "", "bucket": " "}
  self.assertFalse(
      validateDlioWorkload(
          workload, "invalid-dlio-workload-bucket-contains-space"
      )
  )

def test_validate_dlio_workload_invalid_no_dlioWorkloadSpecified(self):
Expand All @@ -22,7 +48,11 @@ def test_validate_dlio_workload_invalid_no_dlioWorkloadSpecified(self):
def test_validate_dlio_workload_invalid_commented_out_dlioWorkload(self):
  """A dlioWorkload key 'commented out' with a leading '_' must fail validation."""
  workload = {
      "_dlioWorkload": {},
      "bucket": "dummy-bucket",
      "gcsfuseMountOptions": "implicit-dirs,cache-max-size:-1",
  }
  self.assertFalse(
      validateDlioWorkload(workload, "commented-out-dlio-workload")
  )
Expand All @@ -34,6 +64,7 @@ def test_validate_dlio_workload_invalid_mixed_dlioWorkload_fioWorkload(self):
"dlioWorkload": {},
"fioWorkload": {},
"bucket": "dummy-bucket",
"gcsfuseMountOptions": "implicit-dirs,cache-max-size:-1",
}),
"mixed-dlio/fio-workload",
)
Expand All @@ -46,6 +77,7 @@ def test_validate_dlio_workload_invalid_missing_numFilesTrain(self):
"batchSizes": [100, 200],
},
"bucket": "dummy-bucket",
"gcsfuseMountOptions": "implicit-dirs,cache-max-size:-1",
})
self.assertFalse(
validateDlioWorkload(
Expand All @@ -62,6 +94,7 @@ def test_validate_dlio_workload_invalid_unsupported_numFilesTrain(self):
"batchSizes": [100, 200],
},
"bucket": "dummy-bucket",
"gcsfuseMountOptions": "implicit-dirs,cache-max-size:-1",
})
self.assertFalse(
validateDlioWorkload(
Expand All @@ -77,6 +110,7 @@ def test_validate_dlio_workload_invalid_missing_recordLength(self):
"batchSizes": [100, 200],
},
"bucket": "dummy-bucket",
"gcsfuseMountOptions": "implicit-dirs,cache-max-size:-1",
})
self.assertFalse(
validateDlioWorkload(
Expand All @@ -93,6 +127,7 @@ def test_validate_dlio_workload_invalid_unsupported_recordLength(self):
"batchSizes": [100, 200],
},
"bucket": "dummy-bucket",
"gcsfuseMountOptions": "implicit-dirs,cache-max-size:-1",
})
self.assertFalse(
validateDlioWorkload(
Expand All @@ -101,13 +136,69 @@ def test_validate_dlio_workload_invalid_unsupported_recordLength(self):
)
pass

def test_validate_dlio_workload_invalid_missing_gcsfuseMountOptions(self):
  """A workload lacking the 'gcsfuseMountOptions' key must fail validation."""
  workload = {
      "dlioWorkload": {
          "numFilesTrain": 1000,
          "recordLength": 100,
          "batchSizes": [100, 200],
      },
      "bucket": "dummy-bucket",
  }
  self.assertFalse(
      validateDlioWorkload(
          workload, "invalid-dlio-workload-missing-gcsfuseMountOptions"
      )
  )

def test_validate_dlio_workload_invalid_unsupported_gcsfuseMountOptions(
    self,
):
  """A non-string 'gcsfuseMountOptions' value must fail validation."""
  workload = {
      "dlioWorkload": {
          "numFilesTrain": 1000,
          "recordLength": 10000,
          "batchSizes": [100, 200],
      },
      "bucket": "dummy-bucket",
      # Deliberately an int, not the expected str.
      "gcsfuseMountOptions": 100,
  }
  self.assertFalse(
      validateDlioWorkload(
          workload, "invalid-dlio-workload-unsupported-gcsfuseMountOptions1"
      )
  )

def test_validate_dlio_workload_invalid_gcsfuseMountOptions_contains_space(
    self,
):
  """A 'gcsfuseMountOptions' value containing a space must fail validation."""
  workload = {
      "dlioWorkload": {
          "numFilesTrain": 1000,
          "recordLength": 10000,
          "batchSizes": [100, 200],
      },
      "bucket": "dummy-bucket",
      "gcsfuseMountOptions": "abc def",
  }
  self.assertFalse(
      validateDlioWorkload(
          workload,
          "invalid-dlio-workload-unsupported-gcsfuseMountOptions-contains-space",
      )
  )

def test_validate_dlio_workload_invalid_missing_batchSizes(self):
workload = dict({
"dlioWorkload": {
"numFilesTrain": 1000,
"recordLength": 10000,
},
"bucket": "dummy-bucket",
"gcsfuseMountOptions": "implicit-dirs,cache-max-size:-1",
})
self.assertFalse(
validateDlioWorkload(
Expand All @@ -124,6 +215,7 @@ def test_validate_dlio_workload_invalid_unsupported_batchSizes1(self):
"batchSizes": ["100"],
},
"bucket": "dummy-bucket",
"gcsfuseMountOptions": "implicit-dirs,cache-max-size:-1",
})
self.assertFalse(
validateDlioWorkload(
Expand All @@ -140,6 +232,7 @@ def test_validate_dlio_workload_invalid_unsupported_batchSizes2(self):
"batchSizes": [0, -1],
},
"bucket": "dummy-bucket",
"gcsfuseMountOptions": "implicit-dirs,cache-max-size:-1",
})
self.assertFalse(
validateDlioWorkload(
Expand All @@ -156,6 +249,7 @@ def test_validate_dlio_workload_valid_single_batchSize(self):
"batchSizes": [100],
},
"bucket": "dummy-bucket",
"gcsfuseMountOptions": "implicit-dirs,cache-max-size:-1",
})
self.assertTrue(validateDlioWorkload(workload, "valid-dlio-workload-2"))
pass
Expand All @@ -168,6 +262,7 @@ def test_validate_dlio_workload_valid_multiple_batchSizes(self):
"batchSizes": [100, 200],
},
"bucket": "dummy-bucket",
"gcsfuseMountOptions": "implicit-dirs,cache-max-size:-1",
})
self.assertTrue(validateDlioWorkload(workload, "valid-dlio-workload-2"))
pass
Expand Down
13 changes: 6 additions & 7 deletions perfmetrics/scripts/testing_on_gke/examples/dlio/parse_logs.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#!/usr/bin/env python

# Copyright 2018 The Kubernetes Authors.
# Copyright 2022 Google LLC
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -168,15 +168,14 @@ def downloadDlioOutputs(dlioWorkloads: set, instanceId: str):
continue

for i in range(summary_data["epochs"]):
test_name = summary_data["hostname"]
part_list = test_name.split("-")
key = "-".join(part_list[2:5])
key = root.split("/")[-2]
key_split = key.split("-")

if key not in output:
output[key] = {
"num_files_train": part_list[-3],
"mean_file_size": part_list[-2],
"batch_size": part_list[-1],
"num_files_train": key_split[-4],
"mean_file_size": key_split[-3],
"batch_size": key_split[-2],
"records": {
"local-ssd": [],
"gcsfuse-generic": [],
Expand Down
Loading

0 comments on commit 0fae307

Please sign in to comment.