Use monitoring api for cpu/memory utilization
.. only when mash is not available on the VM in question.

Add min/max split in cpu/memory values
cpu/memory from monitoring api
gargnitingoogle committed Aug 21, 2024
1 parent 35ca01e commit 5f73162
Showing 6 changed files with 364 additions and 33 deletions.
59 changes: 44 additions & 15 deletions perfmetrics/scripts/testing_on_gke/examples/dlio/parse_logs.py
@@ -22,7 +22,7 @@
 import dlio_workload
 
 sys.path.append("../")
-from utils.utils import get_memory, get_cpu, standard_timestamp, is_mash_installed
+from utils.utils import get_memory, get_cpu, unix_to_timestamp, standard_timestamp, is_mash_installed, get_memory_from_monitoring_api, get_cpu_from_monitoring_api, timestamp_to_epoch
 from utils.parse_logs_common import ensureDir, download_gcs_objects, parseLogParserArguments, SUPPORTED_SCENARIOS
 
 _LOCAL_LOGS_LOCATION = "../../bin/dlio-logs/logs"
@@ -36,6 +36,8 @@
     "train_throughput_samples_per_second": 0,
     "train_throughput_mb_per_second": 0,
     "throughput_over_local_ssd": 0,
+    "start_epoch": "",
+    "end_epoch": "",
     "start": "",
     "end": "",
     "highest_memory": 0,
@@ -161,24 +163,51 @@ def createOutputScenariosFromDownloadedFiles(args: dict) -> dict:
           * int(output[key]["mean_file_size"])
           / (1024**2)
       )
+      r["start_epoch"] = timestamp_to_epoch(
+          per_epoch_stats_data[str(i + 1)]["start"]
+      )
+      r["end_epoch"] = timestamp_to_epoch(
+          per_epoch_stats_data[str(i + 1)]["end"]
+      )
       r["start"] = standard_timestamp(
           per_epoch_stats_data[str(i + 1)]["start"]
       )
       r["end"] = standard_timestamp(per_epoch_stats_data[str(i + 1)]["end"])
-      if r["scenario"] != "local-ssd" and mash_installed:
-        r["lowest_memory"], r["highest_memory"] = get_memory(
-            r["pod_name"],
-            r["start"],
-            r["end"],
-            project_number=args.project_number,
-        )
-        r["lowest_cpu"], r["highest_cpu"] = get_cpu(
-            r["pod_name"],
-            r["start"],
-            r["end"],
-            project_number=args.project_number,
-        )
-        pass
+
+      if r["scenario"] != "local-ssd":
+        if mash_installed:
+          r["lowest_memory"], r["highest_memory"] = get_memory(
+              r["pod_name"],
+              r["start"],
+              r["end"],
+              project_number=args.project_number,
+          )
+          r["lowest_cpu"], r["highest_cpu"] = get_cpu(
+              r["pod_name"],
+              r["start"],
+              r["end"],
+              project_number=args.project_number,
+          )
+        else:
+          r["lowest_memory"], r["highest_memory"] = (
+              get_memory_from_monitoring_api(
+                  pod_name=r["pod_name"],
+                  start_epoch=r["start_epoch"],
+                  end_epoch=r["end_epoch"],
+                  project_id=args.project_id,
+                  cluster_name=args.cluster_name,
+                  namespace_name=args.namespace_name,
+              )
+          )
+          r["lowest_cpu"], r["highest_cpu"] = get_cpu_from_monitoring_api(
+              pod_name=r["pod_name"],
+              start_epoch=r["start_epoch"],
+              end_epoch=r["end_epoch"],
+              project_id=args.project_id,
+              cluster_name=args.cluster_name,
+              namespace_name=args.namespace_name,
+          )
+        pass
 
       r["gcsfuse_mount_options"] = gcsfuse_mount_options
 
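The dlio parser above derives start_epoch/end_epoch through the new timestamp_to_epoch helper from utils/utils.py, whose diff is not shown on this page. Below is a minimal sketch of such a helper, assuming DLIO's per_epoch_stats timestamps look like "2024-08-21T19:20:25.643936" and are in UTC (both format and timezone are assumptions, not confirmed by this diff):

```python
from datetime import datetime, timezone


def timestamp_to_epoch(timestamp: str) -> int:
  """Converts e.g. '2024-08-21T19:20:25.643936' to integer unix seconds."""
  # Format string and UTC assumption are illustrative; the real helper is
  # defined in utils/utils.py.
  dt = datetime.strptime(timestamp, "%Y-%m-%dT%H:%M:%S.%f")
  return int(dt.replace(tzinfo=timezone.utc).timestamp())
```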
54 changes: 40 additions & 14 deletions perfmetrics/scripts/testing_on_gke/examples/fio/parse_logs.py
@@ -22,7 +22,7 @@
 import fio_workload
 
 sys.path.append("../")
-from utils.utils import get_memory, get_cpu, unix_to_timestamp, is_mash_installed
+from utils.utils import get_memory, get_cpu, unix_to_timestamp, is_mash_installed, get_memory_from_monitoring_api, get_cpu_from_monitoring_api
 from utils.parse_logs_common import ensureDir, download_gcs_objects, parseLogParserArguments, SUPPORTED_SCENARIOS
 
 _LOCAL_LOGS_LOCATION = "../../bin/fio-logs"
@@ -35,6 +35,8 @@
     "IOPS": 0,
     "throughput_mb_per_second": 0,
     "throughput_over_local_ssd": 0,
+    "start_epoch": "",
+    "end_epoch": "",
     "start": "",
     "end": "",
     "highest_memory": 0,
@@ -203,24 +205,48 @@ def createOutputScenariosFromDownloadedFiles(args: dict) -> dict:
       r["throughput_mb_per_second"] = int(
           per_epoch_output_data["jobs"][0]["read"]["bw_bytes"] / (1024**2)
       )
+      r["start_epoch"] = per_epoch_output_data["jobs"][0]["job_start"] // 1000
+      r["end_epoch"] = per_epoch_output_data["timestamp_ms"] // 1000
       r["start"] = unix_to_timestamp(
           per_epoch_output_data["jobs"][0]["job_start"]
       )
       r["end"] = unix_to_timestamp(per_epoch_output_data["timestamp_ms"])
-      if r["scenario"] != "local-ssd" and mash_installed:
-        r["lowest_memory"], r["highest_memory"] = get_memory(
-            r["pod_name"],
-            r["start"],
-            r["end"],
-            project_number=args.project_number,
-        )
-        r["lowest_cpu"], r["highest_cpu"] = get_cpu(
-            r["pod_name"],
-            r["start"],
-            r["end"],
-            project_number=args.project_number,
-        )
+
+      if r["scenario"] != "local-ssd":
+        if mash_installed:
+          r["lowest_memory"], r["highest_memory"] = get_memory(
+              r["pod_name"],
+              r["start"],
+              r["end"],
+              project_number=args.project_number,
+          )
+          r["lowest_cpu"], r["highest_cpu"] = get_cpu(
+              r["pod_name"],
+              r["start"],
+              r["end"],
+              project_number=args.project_number,
+          )
+        else:
+          r["lowest_memory"], r["highest_memory"] = (
+              get_memory_from_monitoring_api(
+                  pod_name=r["pod_name"],
+                  start_epoch=r["start_epoch"],
+                  end_epoch=r["end_epoch"],
+                  project_id=args.project_id,
+                  cluster_name=args.cluster_name,
+                  namespace_name=args.namespace_name,
+              )
+          )
+          r["lowest_cpu"], r["highest_cpu"] = get_cpu_from_monitoring_api(
+              pod_name=r["pod_name"],
+              start_epoch=r["start_epoch"],
+              end_epoch=r["end_epoch"],
+              project_id=args.project_id,
+              cluster_name=args.cluster_name,
+              namespace_name=args.namespace_name,
+          )
+        pass
 
       r["gcsfuse_mount_options"] = gcsfuse_mount_options
       r["blockSize"] = bs
       r["filesPerThread"] = nrfiles
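The fio parser needs no timestamp helper: fio's JSON report carries job_start (per job) and timestamp_ms (report time) as epoch milliseconds, assuming a fio version recent enough to emit job_start. A worked example with illustrative values:

```python
# Illustrative numbers only; floor-dividing milliseconds by 1000 yields the
# epoch seconds that the monitoring-API helpers expect.
per_epoch_output_data = {
    "timestamp_ms": 1724265625123,  # when fio wrote the report (end of run)
    "jobs": [{"job_start": 1724265565123}],  # when the first job started
}
start_epoch = per_epoch_output_data["jobs"][0]["job_start"] // 1000
end_epoch = per_epoch_output_data["timestamp_ms"] // 1000
print(start_epoch, end_epoch)  # 1724265565 1724265625
```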
15 changes: 12 additions & 3 deletions perfmetrics/scripts/testing_on_gke/examples/run-gke-tests.sh
@@ -241,7 +241,16 @@ function installDependencies() {
     sudo apt install docker-ce -y
   fi
   # Ensure that mash is installed.
-  which mash || (sudo apt-get install -y monarch-tools)
+  if ! which mash ; then
+    if ! sudo apt-get install -y monarch-tools; then
+      # Ensure that gcloud monitoring tools are installed. This is an
+      # alternative to mash on a GCE VM.
+      # pip install --upgrade google-cloud-storage
+      # pip install --ignore-installed --upgrade google-api-python-client
+      # pip install --ignore-installed --upgrade google-cloud
+      pip install --upgrade google-cloud-monitoring
+    fi
+  fi
 }
 
 # Make sure you have access to the necessary GCP resources. The easiest way to enable it is to use <your-ldap>@google.com as active auth.
@@ -529,14 +538,14 @@ function waitTillAllPodsComplete() {
 function fetchAndParseFioOutputs() {
   echo "Fetching and parsing fio outputs ..."
   cd "${gke_testing_dir}"/examples/fio
-  python3 parse_logs.py --project-number=${project_number} --workload-config "${workload_config}" --instance-id ${instance_id} --output-file "${output_dir}"/fio/output.csv
+  python3 parse_logs.py --project-number=${project_number} --workload-config "${workload_config}" --instance-id ${instance_id} --output-file "${output_dir}"/fio/output.csv --project-id=${project_id} --cluster-name=${cluster_name} --namespace-name=${appnamespace}
   cd -
 }
 
 function fetchAndParseDlioOutputs() {
   echo "Fetching and parsing dlio outputs ..."
   cd "${gke_testing_dir}"/examples/dlio
-  python3 parse_logs.py --project-number=${project_number} --workload-config "${workload_config}" --instance-id ${instance_id} --output-file "${output_dir}"/dlio/output.csv
+  python3 parse_logs.py --project-number=${project_number} --workload-config "${workload_config}" --instance-id ${instance_id} --output-file "${output_dir}"/dlio/output.csv --project-id=${project_id} --cluster-name=${cluster_name} --namespace-name=${appnamespace}
   cd -
 }

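A quick sanity check for the new fallback path, sketched below: when mash is absent, the parsers depend on the google-cloud-monitoring package that installDependencies() now pip-installs, whose public client module is monitoring_v3.

```python
from google.cloud import monitoring_v3

# Constructing a client fails fast if the package is missing or application
# default credentials are not set up on the VM.
client = monitoring_v3.MetricServiceClient()
print("google-cloud-monitoring is usable")
```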
perfmetrics/scripts/testing_on_gke/examples/utils/parse_logs_common.py
@@ -74,8 +74,18 @@ def parseLogParserArguments() -> object:
       ),
       required=True,
   )
+  parser.add_argument(
+      "--project-id",
+      metavar="GCP Project ID/name",
+      help=(
+          "project-id (e.g. gcs-fuse-test) is needed to fetch the cpu/memory"
+          " utilization data from GCP."
+      ),
+      required=True,
+  )
   parser.add_argument(
       "--project-number",
+      metavar="GCP Project Number",
       help=(
           "project-number (e.g. 93817472919) is needed to fetch the cpu/memory"
           " utilization data from GCP."
@@ -87,6 +97,16 @@
       help="unique string ID for current test-run",
       required=True,
   )
+  parser.add_argument(
+      "--cluster-name",
+      help="Name of the GKE cluster where the current test was run",
+      required=True,
+  )
+  parser.add_argument(
+      "--namespace-name",
+      help="kubernetes namespace used for the current test-run",
+      required=True,
+  )
   parser.add_argument(
       "-o",
       "--output-file",
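One detail worth noting about the new flags: argparse maps hyphenated option names to underscore attribute names, which is why the parse_logs.py changes read args.project_id, args.cluster_name, and args.namespace_name. A self-contained illustration (the cluster and namespace values are made up; gcs-fuse-test comes from the help text above):

```python
import argparse

parser = argparse.ArgumentParser()
parser.add_argument("--project-id", required=True)
parser.add_argument("--cluster-name", required=True)
parser.add_argument("--namespace-name", required=True)
args = parser.parse_args([
    "--project-id=gcs-fuse-test",
    "--cluster-name=my-cluster",
    "--namespace-name=default",
])
print(args.project_id, args.cluster_name, args.namespace_name)
```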
(The diffs for the remaining changed files, including utils/utils.py where the new monitoring-API helpers are defined, did not load on this page.)
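Since the utils/utils.py diff is unavailable here, the following is a minimal sketch of what the new fallback helper plausibly looks like. The metric type (kubernetes.io/container/memory/used_bytes), the resource labels, and the MiB scaling are assumptions based on standard GKE system metrics; get_cpu_from_monitoring_api would follow the same shape with a CPU metric.

```python
from typing import Tuple

from google.cloud import monitoring_v3


def get_memory_from_monitoring_api(
    pod_name: str,
    start_epoch: int,
    end_epoch: int,
    project_id: str,
    cluster_name: str,
    namespace_name: str,
) -> Tuple[int, int]:
  """Returns (lowest, highest) memory usage of the pod over the window, in MiB."""
  client = monitoring_v3.MetricServiceClient()
  results = client.list_time_series(
      request={
          "name": f"projects/{project_id}",
          # Filter shape is an assumption; it scopes the standard GKE memory
          # metric to the given cluster, namespace, and pod.
          "filter": (
              'metric.type = "kubernetes.io/container/memory/used_bytes"'
              f' AND resource.labels.cluster_name = "{cluster_name}"'
              f' AND resource.labels.namespace_name = "{namespace_name}"'
              f' AND resource.labels.pod_name = "{pod_name}"'
          ),
          "interval": monitoring_v3.TimeInterval(
              start_time={"seconds": start_epoch},
              end_time={"seconds": end_epoch},
          ),
          "view": monitoring_v3.ListTimeSeriesRequest.TimeSeriesView.FULL,
      }
  )
  # used_bytes is an INT64 gauge metric, hence int64_value.
  points = [
      point.value.int64_value
      for series in results
      for point in series.points
  ]
  if not points:
    return 0, 0
  return min(points) // 2**20, max(points) // 2**20
```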
