Use monitoring api for cpu/memory utilization
.. only when mash is not available on the VM in question.

Add min/max split in cpu/memory values
cpu/memory from monitoring api
gargnitingoogle committed Aug 21, 2024
1 parent 35ca01e commit 5f73162
Showing 6 changed files with 364 additions and 33 deletions.
59 changes: 44 additions & 15 deletions perfmetrics/scripts/testing_on_gke/examples/dlio/parse_logs.py
@@ -22,7 +22,7 @@
 import dlio_workload
 
 sys.path.append("../")
-from utils.utils import get_memory, get_cpu, standard_timestamp, is_mash_installed
+from utils.utils import get_memory, get_cpu, unix_to_timestamp, standard_timestamp, is_mash_installed, get_memory_from_monitoring_api, get_cpu_from_monitoring_api, timestamp_to_epoch
 from utils.parse_logs_common import ensureDir, download_gcs_objects, parseLogParserArguments, SUPPORTED_SCENARIOS
 
 _LOCAL_LOGS_LOCATION = "../../bin/dlio-logs/logs"
@@ -36,6 +36,8 @@
     "train_throughput_samples_per_second": 0,
     "train_throughput_mb_per_second": 0,
     "throughput_over_local_ssd": 0,
+    "start_epoch": "",
+    "end_epoch": "",
     "start": "",
     "end": "",
     "highest_memory": 0,
@@ -161,24 +163,51 @@ def createOutputScenariosFromDownloadedFiles(args: dict) -> dict:
           * int(output[key]["mean_file_size"])
           / (1024**2)
       )
+      r["start_epoch"] = timestamp_to_epoch(
+          per_epoch_stats_data[str(i + 1)]["start"]
+      )
+      r["end_epoch"] = timestamp_to_epoch(
+          per_epoch_stats_data[str(i + 1)]["end"]
+      )
       r["start"] = standard_timestamp(
           per_epoch_stats_data[str(i + 1)]["start"]
       )
       r["end"] = standard_timestamp(per_epoch_stats_data[str(i + 1)]["end"])
-      if r["scenario"] != "local-ssd" and mash_installed:
-        r["lowest_memory"], r["highest_memory"] = get_memory(
-            r["pod_name"],
-            r["start"],
-            r["end"],
-            project_number=args.project_number,
-        )
-        r["lowest_cpu"], r["highest_cpu"] = get_cpu(
-            r["pod_name"],
-            r["start"],
-            r["end"],
-            project_number=args.project_number,
-        )
-        pass
+
+      if r["scenario"] != "local-ssd":
+        if mash_installed:
+          r["lowest_memory"], r["highest_memory"] = get_memory(
+              r["pod_name"],
+              r["start"],
+              r["end"],
+              project_number=args.project_number,
+          )
+          r["lowest_cpu"], r["highest_cpu"] = get_cpu(
+              r["pod_name"],
+              r["start"],
+              r["end"],
+              project_number=args.project_number,
+          )
+        else:
+          r["lowest_memory"], r["highest_memory"] = (
+              get_memory_from_monitoring_api(
+                  pod_name=r["pod_name"],
+                  start_epoch=r["start_epoch"],
+                  end_epoch=r["end_epoch"],
+                  project_id=args.project_id,
+                  cluster_name=args.cluster_name,
+                  namespace_name=args.namespace_name,
+              )
+          )
+          r["lowest_cpu"], r["highest_cpu"] = get_cpu_from_monitoring_api(
+              pod_name=r["pod_name"],
+              start_epoch=r["start_epoch"],
+              end_epoch=r["end_epoch"],
+              project_id=args.project_id,
+              cluster_name=args.cluster_name,
+              namespace_name=args.namespace_name,
+          )
+        pass
 
       r["gcsfuse_mount_options"] = gcsfuse_mount_options
 
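The dlio parser above derives start_epoch/end_epoch through the new timestamp_to_epoch helper from utils/utils.py, whose diff is not shown on this page. Below is a minimal sketch of such a helper, assuming DLIO's per_epoch_stats timestamps look like "2024-08-21T19:20:25.643936" and are in UTC (both format and timezone are assumptions, not confirmed by this diff):

```python
from datetime import datetime, timezone


def timestamp_to_epoch(timestamp: str) -> int:
  """Converts e.g. '2024-08-21T19:20:25.643936' to integer unix seconds."""
  # Format string and UTC assumption are illustrative; the real helper is
  # defined in utils/utils.py.
  dt = datetime.strptime(timestamp, "%Y-%m-%dT%H:%M:%S.%f")
  return int(dt.replace(tzinfo=timezone.utc).timestamp())
```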
54 changes: 40 additions & 14 deletions perfmetrics/scripts/testing_on_gke/examples/fio/parse_logs.py
@@ -22,7 +22,7 @@
 import fio_workload
 
 sys.path.append("../")
-from utils.utils import get_memory, get_cpu, unix_to_timestamp, is_mash_installed
+from utils.utils import get_memory, get_cpu, unix_to_timestamp, is_mash_installed, get_memory_from_monitoring_api, get_cpu_from_monitoring_api
 from utils.parse_logs_common import ensureDir, download_gcs_objects, parseLogParserArguments, SUPPORTED_SCENARIOS
 
 _LOCAL_LOGS_LOCATION = "../../bin/fio-logs"
@@ -35,6 +35,8 @@
     "IOPS": 0,
     "throughput_mb_per_second": 0,
     "throughput_over_local_ssd": 0,
+    "start_epoch": "",
+    "end_epoch": "",
     "start": "",
     "end": "",
     "highest_memory": 0,
@@ -203,24 +205,48 @@ def createOutputScenariosFromDownloadedFiles(args: dict) -> dict:
       r["throughput_mb_per_second"] = int(
           per_epoch_output_data["jobs"][0]["read"]["bw_bytes"] / (1024**2)
       )
+      r["start_epoch"] = per_epoch_output_data["jobs"][0]["job_start"] // 1000
+      r["end_epoch"] = per_epoch_output_data["timestamp_ms"] // 1000
       r["start"] = unix_to_timestamp(
           per_epoch_output_data["jobs"][0]["job_start"]
       )
       r["end"] = unix_to_timestamp(per_epoch_output_data["timestamp_ms"])
-      if r["scenario"] != "local-ssd" and mash_installed:
-        r["lowest_memory"], r["highest_memory"] = get_memory(
-            r["pod_name"],
-            r["start"],
-            r["end"],
-            project_number=args.project_number,
-        )
-        r["lowest_cpu"], r["highest_cpu"] = get_cpu(
-            r["pod_name"],
-            r["start"],
-            r["end"],
-            project_number=args.project_number,
-        )
+
+      if r["scenario"] != "local-ssd":
+        if mash_installed:
+          r["lowest_memory"], r["highest_memory"] = get_memory(
+              r["pod_name"],
+              r["start"],
+              r["end"],
+              project_number=args.project_number,
+          )
+          r["lowest_cpu"], r["highest_cpu"] = get_cpu(
+              r["pod_name"],
+              r["start"],
+              r["end"],
+              project_number=args.project_number,
+          )
+        else:
+          r["lowest_memory"], r["highest_memory"] = (
+              get_memory_from_monitoring_api(
+                  pod_name=r["pod_name"],
+                  start_epoch=r["start_epoch"],
+                  end_epoch=r["end_epoch"],
+                  project_id=args.project_id,
+                  cluster_name=args.cluster_name,
+                  namespace_name=args.namespace_name,
+              )
+          )
+          r["lowest_cpu"], r["highest_cpu"] = get_cpu_from_monitoring_api(
+              pod_name=r["pod_name"],
+              start_epoch=r["start_epoch"],
+              end_epoch=r["end_epoch"],
+              project_id=args.project_id,
+              cluster_name=args.cluster_name,
+              namespace_name=args.namespace_name,
+          )
+        pass
 
       r["gcsfuse_mount_options"] = gcsfuse_mount_options
       r["blockSize"] = bs
       r["filesPerThread"] = nrfiles
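The fio parser needs no timestamp helper: fio's JSON report carries job_start (per job) and timestamp_ms (report time) as epoch milliseconds, assuming a fio version recent enough to emit job_start. A worked example with illustrative values:

```python
# Illustrative numbers only; floor-dividing milliseconds by 1000 yields the
# epoch seconds that the monitoring-API helpers expect.
per_epoch_output_data = {
    "timestamp_ms": 1724265625123,  # when fio wrote the report (end of run)
    "jobs": [{"job_start": 1724265565123}],  # when the first job started
}
start_epoch = per_epoch_output_data["jobs"][0]["job_start"] // 1000
end_epoch = per_epoch_output_data["timestamp_ms"] // 1000
print(start_epoch, end_epoch)  # 1724265565 1724265625
```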
15 changes: 12 additions & 3 deletions perfmetrics/scripts/testing_on_gke/examples/run-gke-tests.sh
@@ -241,7 +241,16 @@ function installDependencies() {
     sudo apt install docker-ce -y
   fi
   # Ensure that mash is installed.
-  which mash || (sudo apt-get install -y monarch-tools)
+  if ! which mash ; then
+    if ! sudo apt-get install -y monarch-tools; then
+      # Ensure that gcloud monitoring tools are installed. This is an
+      # alternative to mash on a GCE VM.
+      # pip install --upgrade google-cloud-storage
+      # pip install --ignore-installed --upgrade google-api-python-client
+      # pip install --ignore-installed --upgrade google-cloud
+      pip install --upgrade google-cloud-monitoring
+    fi
+  fi
 }
 
 # Make sure you have access to the necessary GCP resources. The easiest way to enable it is to use <your-ldap>@google.com as active auth.
@@ -529,14 +538,14 @@ function waitTillAllPodsComplete() {
 function fetchAndParseFioOutputs() {
   echo "Fetching and parsing fio outputs ..."
   cd "${gke_testing_dir}"/examples/fio
-  python3 parse_logs.py --project-number=${project_number} --workload-config "${workload_config}" --instance-id ${instance_id} --output-file "${output_dir}"/fio/output.csv
+  python3 parse_logs.py --project-number=${project_number} --workload-config "${workload_config}" --instance-id ${instance_id} --output-file "${output_dir}"/fio/output.csv --project-id=${project_id} --cluster-name=${cluster_name} --namespace-name=${appnamespace}
   cd -
 }
 
 function fetchAndParseDlioOutputs() {
   echo "Fetching and parsing dlio outputs ..."
   cd "${gke_testing_dir}"/examples/dlio
-  python3 parse_logs.py --project-number=${project_number} --workload-config "${workload_config}" --instance-id ${instance_id} --output-file "${output_dir}"/dlio/output.csv
+  python3 parse_logs.py --project-number=${project_number} --workload-config "${workload_config}" --instance-id ${instance_id} --output-file "${output_dir}"/dlio/output.csv --project-id=${project_id} --cluster-name=${cluster_name} --namespace-name=${appnamespace}
   cd -
 }

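A quick sanity check for the new fallback path, sketched below: when mash is absent, the parsers depend on the google-cloud-monitoring package that installDependencies() now pip-installs, whose public client module is monitoring_v3.

```python
from google.cloud import monitoring_v3

# Constructing a client fails fast if the package is missing or application
# default credentials are not set up on the VM.
client = monitoring_v3.MetricServiceClient()
print("google-cloud-monitoring is usable")
```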
perfmetrics/scripts/testing_on_gke/examples/utils/parse_logs_common.py
@@ -74,8 +74,18 @@ def parseLogParserArguments() -> object:
       ),
       required=True,
   )
+  parser.add_argument(
+      "--project-id",
+      metavar="GCP Project ID/name",
+      help=(
+          "project-id (e.g. gcs-fuse-test) is needed to fetch the cpu/memory"
+          " utilization data from GCP."
+      ),
+      required=True,
+  )
   parser.add_argument(
       "--project-number",
+      metavar="GCP Project Number",
       help=(
           "project-number (e.g. 93817472919) is needed to fetch the cpu/memory"
           " utilization data from GCP."
@@ -87,6 +97,16 @@
       help="unique string ID for current test-run",
       required=True,
   )
+  parser.add_argument(
+      "--cluster-name",
+      help="Name of the GKE cluster where the current test was run",
+      required=True,
+  )
+  parser.add_argument(
+      "--namespace-name",
+      help="kubernetes namespace used for the current test-run",
+      required=True,
+  )
   parser.add_argument(
       "-o",
       "--output-file",
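One detail worth noting about the new flags: argparse maps hyphenated option names to underscore attribute names, which is why the parse_logs.py changes read args.project_id, args.cluster_name, and args.namespace_name. A self-contained illustration (the cluster and namespace values are made up; gcs-fuse-test comes from the help text above):

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--project-id", required=True)
parser.add_argument("--cluster-name", required=True)
parser.add_argument("--namespace-name", required=True)
args = parser.parse_args([
    "--project-id=gcs-fuse-test",
    "--cluster-name=my-cluster",
    "--namespace-name=default",
])
print(args.project_id, args.cluster_name, args.namespace_name)
```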
(The diffs for the remaining changed files, including utils/utils.py where the new monitoring-API helpers are defined, did not load on this page.)
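Since the utils/utils.py diff is unavailable here, the following is a minimal sketch of what the new fallback helper plausibly looks like. The metric type (kubernetes.io/container/memory/used_bytes), the resource labels, and the MiB scaling are assumptions based on standard GKE system metrics; get_cpu_from_monitoring_api would follow the same shape with a CPU metric.

```python
from typing import Tuple

from google.cloud import monitoring_v3


def get_memory_from_monitoring_api(
    pod_name: str,
    start_epoch: int,
    end_epoch: int,
    project_id: str,
    cluster_name: str,
    namespace_name: str,
) -> Tuple[int, int]:
  """Returns (lowest, highest) memory usage of the pod over the window, in MiB."""
  client = monitoring_v3.MetricServiceClient()
  results = client.list_time_series(
      request={
          "name": f"projects/{project_id}",
          # Filter shape is an assumption; it scopes the standard GKE memory
          # metric to the given cluster, namespace, and pod.
          "filter": (
              'metric.type = "kubernetes.io/container/memory/used_bytes"'
              f' AND resource.labels.cluster_name = "{cluster_name}"'
              f' AND resource.labels.namespace_name = "{namespace_name}"'
              f' AND resource.labels.pod_name = "{pod_name}"'
          ),
          "interval": monitoring_v3.TimeInterval(
              start_time={"seconds": start_epoch},
              end_time={"seconds": end_epoch},
          ),
          "view": monitoring_v3.ListTimeSeriesRequest.TimeSeriesView.FULL,
      }
  )
  # used_bytes is an INT64 gauge metric, hence int64_value.
  points = [
      point.value.int64_value
      for series in results
      for point in series.points
  ]
  if not points:
    return 0, 0
  return min(points) // 2**20, max(points) // 2**20
```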
