Skip to content

Commit

Permalink
Add a CPU utilization resource monitor for overload manager (#34713)
Browse files Browse the repository at this point in the history
Commit Message: Add a CPU utilization resource monitor for overload
manager. i.e. this can be configured to reject requests once CPU Utilization reaches a certain brownout point.


Signed-off-by: Can Cecen <[email protected]>
  • Loading branch information
cancecen authored Sep 11, 2024
1 parent 1ef5996 commit 4d12162
Show file tree
Hide file tree
Showing 23 changed files with 756 additions and 1 deletion.
1 change: 1 addition & 0 deletions CODEOWNERS
Validating CODEOWNERS rules …
Original file line number Diff line number Diff line change
Expand Up @@ -142,6 +142,7 @@ extensions/filters/common/original_src @klarose @mattklein123
/*/extensions/resource_monitors/common @eziskind @yanavlasov @nezdolik
/*/extensions/resource_monitors/fixed_heap @eziskind @yanavlasov @nezdolik
/*/extensions/resource_monitors/downstream_connections @nezdolik @mattklein123
/*/extensions/resource_monitors/cpu_utilization @cancecen @kbaichoo
/*/extensions/retry/priority @alyssawilk @mattklein123
/*/extensions/retry/priority/previous_priorities @alyssawilk @mattklein123
/*/extensions/retry/host @alyssawilk @mattklein123
Expand Down
1 change: 1 addition & 0 deletions api/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -311,6 +311,7 @@ proto_library(
"//envoy/extensions/rbac/matchers/upstream_ip_port/v3:pkg",
"//envoy/extensions/regex_engines/v3:pkg",
"//envoy/extensions/request_id/uuid/v3:pkg",
"//envoy/extensions/resource_monitors/cpu_utilization/v3:pkg",
"//envoy/extensions/resource_monitors/downstream_connections/v3:pkg",
"//envoy/extensions/resource_monitors/fixed_heap/v3:pkg",
"//envoy/extensions/resource_monitors/injected_resource/v3:pkg",
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# DO NOT EDIT. This file is generated by tools/proto_format/proto_sync.py.

load("@envoy_api//bazel:api_build_system.bzl", "api_proto_package")

licenses(["notice"]) # Apache 2

api_proto_package(
deps = ["@com_github_cncf_xds//udpa/annotations:pkg"],
)
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
syntax = "proto3";

package envoy.extensions.resource_monitors.cpu_utilization.v3;

import "udpa/annotations/status.proto";

option java_package = "io.envoyproxy.envoy.extensions.resource_monitors.cpu_utilization.v3";
option java_outer_classname = "CpuUtilizationProto";
option java_multiple_files = true;
option go_package = "github.com/envoyproxy/go-control-plane/envoy/extensions/resource_monitors/cpu_utilization/v3;cpu_utilizationv3";
option (udpa.annotations.file_status).package_version_status = ACTIVE;

// [#protodoc-title: CPU utilization]
// [#extension: envoy.resource_monitors.cpu_utilization]

// The CPU utilization resource monitor reports the CPU utilization of the entire host to the
// Envoy process.
// Today, this only works on Linux, where the utilization is calculated from the stats in the
// /proc/stat file.
message CpuUtilizationConfig {
}
1 change: 1 addition & 0 deletions api/versioning/BUILD
Original file line number Diff line number Diff line change
Expand Up @@ -250,6 +250,7 @@ proto_library(
"//envoy/extensions/rbac/matchers/upstream_ip_port/v3:pkg",
"//envoy/extensions/regex_engines/v3:pkg",
"//envoy/extensions/request_id/uuid/v3:pkg",
"//envoy/extensions/resource_monitors/cpu_utilization/v3:pkg",
"//envoy/extensions/resource_monitors/downstream_connections/v3:pkg",
"//envoy/extensions/resource_monitors/fixed_heap/v3:pkg",
"//envoy/extensions/resource_monitors/injected_resource/v3:pkg",
Expand Down
4 changes: 4 additions & 0 deletions changelogs/current.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -317,5 +317,9 @@ new_features:
change: |
The :ref:`xff <envoy_v3_api_msg_extensions.http.original_ip_detection.xff.v3.XffConfig>`
original IP detection method now supports using a list of trusted CIDRs when parsing ``x-forwarded-for``.
- area: resource_monitors
change: |
Added support for monitoring CPU utilization on Linux-based systems via :ref:`cpu utilization monitor
<envoy_v3_api_msg_extensions.resource_monitors.cpu_utilization.v3.CpuUtilizationConfig>` in overload manager.
deprecated:
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
static_resources:
listeners:
- address:
socket_address:
address: 0.0.0.0
port_value: 8000
filter_chains:
- filters:
- name: envoy.filters.network.http_connection_manager
typed_config:
'@type': type.googleapis.com/envoy.extensions.filters.network.http_connection_manager.v3.HttpConnectionManager
stat_prefix: ingress_http
http_filters:
- name: envoy.filters.http.router
typed_config:
'@type': type.googleapis.com/envoy.extensions.filters.http.router.v3.Router
route_config:
name: local_route
virtual_hosts:
- domains:
- '*'
name: local_service
routes:
- match: {prefix: "/"}
route: {cluster: default_service}
clusters:
- name: default_service
load_assignment:
cluster_name: default_service
endpoints:
- lb_endpoints:
- endpoint:
address:
socket_address:
address: 127.0.0.1
port_value: 10001
admin:
address:
socket_address:
address: 0.0.0.0
port_value: 9901

overload_manager:
refresh_interval: 0.25s
resource_monitors:
- name: "envoy.resource_monitors.cpu_utilization"
typed_config:
"@type": type.googleapis.com/envoy.extensions.resource_monitors.cpu_utilization.v3.CpuUtilizationConfig
actions:
- name: "envoy.overload_actions.stop_accepting_requests"
triggers:
- name: "envoy.resource_monitors.cpu_utilization"
scaled:
scaling_threshold: 0.80
saturation_threshold: 0.95
Original file line number Diff line number Diff line change
Expand Up @@ -353,6 +353,28 @@ It's expected that the first few gradations shouldn't trigger anything, unless
there's something seriously wrong e.g. in this example streams using ``>=
128MiB`` in buffers.

CPU Intensive Workload Brownout Protection
------------------------------------------

The ``envoy.overload_actions.stop_accepting_requests`` overload action can be used
to protect workloads from browning out when an unexpected spike in the number of
requests the workload receives causes the CPU to become saturated. This overload
action when used in conjunction with the ``envoy.resource_monitors.cpu_utilization``
resource monitor can reduce the pressure on the CPU by cheaply rejecting new requests.
While the real mitigation for such request spikes is horizontally scaling the workload,
this overload action can be used to ensure the fleet does not get into a cascading failure
mode.
Some platform owners may choose to install this overload action by default to protect the fleet,
since it is easier to configure a target CPU utilization percentage than to configure a request rate per
workload.

.. literalinclude:: _include/cpu_utilization_monitor_overload.yaml
:language: yaml
:lines: 43-55
:emphasize-lines: 3-13
:linenos:
:caption: :download:`cpu_utilization_monitor_overload.yaml <_include/cpu_utilization_monitor_overload.yaml>`


Statistics
----------
Expand Down Expand Up @@ -388,4 +410,3 @@ with the following statistics:

scale_percent, Gauge, "Scaled value of the action as a percent (0-99=scaling, 100=saturated)"
shed_load_count, Counter, "Total number of times load was shed"

1 change: 1 addition & 0 deletions source/extensions/extensions_build_config.bzl
Original file line number Diff line number Diff line change
Expand Up @@ -248,6 +248,7 @@ EXTENSIONS = {
"envoy.resource_monitors.fixed_heap": "//source/extensions/resource_monitors/fixed_heap:config",
"envoy.resource_monitors.injected_resource": "//source/extensions/resource_monitors/injected_resource:config",
"envoy.resource_monitors.global_downstream_max_connections": "//source/extensions/resource_monitors/downstream_connections:config",
"envoy.resource_monitors.cpu_utilization": "//source/extensions/resource_monitors/cpu_utilization:config",

#
# Stat sinks
Expand Down
7 changes: 7 additions & 0 deletions source/extensions/extensions_metadata.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -1077,6 +1077,13 @@ envoy.request_id.uuid:
status: stable
type_urls:
- envoy.extensions.request_id.uuid.v3.UuidRequestIdConfig
envoy.resource_monitors.cpu_utilization:
categories:
- envoy.resource_monitors
security_posture: data_plane_agnostic
status: alpha
type_urls:
- envoy.extensions.resource_monitors.cpu_utilization.v3.CpuUtilizationConfig
envoy.resource_monitors.global_downstream_max_connections:
categories:
- envoy.resource_monitors
Expand Down
59 changes: 59 additions & 0 deletions source/extensions/resource_monitors/cpu_utilization/BUILD
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
load(
"//bazel:envoy_build_system.bzl",
"envoy_cc_extension",
"envoy_cc_library",
"envoy_extension_package",
)

licenses(["notice"]) # Apache 2

envoy_extension_package()

envoy_cc_library(
name = "cpu_utilization_monitor",
srcs = ["cpu_utilization_monitor.cc"],
hdrs = [
"cpu_stats_reader.h",
"cpu_utilization_monitor.h",
],
tags = ["skip_on_windows"],
deps = [
"//envoy/common:exception_lib",
"//envoy/server:resource_monitor_config_interface",
"//source/common/runtime:runtime_features_lib",
"@envoy_api//envoy/extensions/resource_monitors/cpu_utilization/v3:pkg_cc_proto",
],
)

envoy_cc_library(
name = "linux_cpu_stats_reader",
srcs = ["linux_cpu_stats_reader.cc"],
hdrs = [
"cpu_stats_reader.h",
"cpu_utilization_monitor.h",
"linux_cpu_stats_reader.h",
],
tags = ["skip_on_windows"],
deps = [
"//source/common/common:logger_lib",
"@com_google_absl//absl/strings",
"@envoy_api//envoy/extensions/resource_monitors/cpu_utilization/v3:pkg_cc_proto",
],
)

envoy_cc_extension(
name = "config",
srcs = ["config.cc"],
hdrs = ["config.h"],
tags = ["skip_on_windows"],
deps = [
":cpu_utilization_monitor",
":linux_cpu_stats_reader",
"//envoy/registry",
"//envoy/server:resource_monitor_config_interface",
"//source/common/common:logger_lib",
"//source/extensions/resource_monitors/common:factory_base_lib",
"//source/server:configuration_lib",
"@envoy_api//envoy/extensions/resource_monitors/cpu_utilization/v3:pkg_cc_proto",
],
)
32 changes: 32 additions & 0 deletions source/extensions/resource_monitors/cpu_utilization/config.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
#include "source/extensions/resource_monitors/cpu_utilization/config.h"

#include "envoy/extensions/resource_monitors/cpu_utilization/v3/cpu_utilization.pb.h"
#include "envoy/extensions/resource_monitors/cpu_utilization/v3/cpu_utilization.pb.validate.h"
#include "envoy/registry/registry.h"

#include "source/common/protobuf/utility.h"
#include "source/extensions/resource_monitors/cpu_utilization/cpu_utilization_monitor.h"
#include "source/extensions/resource_monitors/cpu_utilization/linux_cpu_stats_reader.h"

namespace Envoy {
namespace Extensions {
namespace ResourceMonitors {
namespace CpuUtilizationMonitor {

/**
 * Builds a CpuUtilizationMonitor from its typed proto configuration.
 *
 * @param config the CpuUtilizationConfig proto, forwarded unchanged to the monitor.
 * @param (unnamed) the factory context; it is not used here.
 * @return a newly allocated CpuUtilizationMonitor that owns a LinuxCpuStatsReader.
 */
Server::ResourceMonitorPtr CpuUtilizationMonitorFactory::createResourceMonitorFromProtoTyped(
    const envoy::extensions::resource_monitors::cpu_utilization::v3::CpuUtilizationConfig& config,
    Server::Configuration::ResourceMonitorFactoryContext& /*unused_context*/) {
  // The reader is hard-wired to the Linux implementation for now; the choice of reader could be
  // made per operating system in the future.
  return std::make_unique<CpuUtilizationMonitor>(config, std::make_unique<LinuxCpuStatsReader>());
}

/**
* Static registration for the cpu resource monitor factory. @see RegistryFactory.
*/
REGISTER_FACTORY(CpuUtilizationMonitorFactory, Server::Configuration::ResourceMonitorFactory);

} // namespace CpuUtilizationMonitor
} // namespace ResourceMonitors
} // namespace Extensions
} // namespace Envoy
29 changes: 29 additions & 0 deletions source/extensions/resource_monitors/cpu_utilization/config.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
#pragma once

#include "envoy/extensions/resource_monitors/cpu_utilization/v3/cpu_utilization.pb.h"
#include "envoy/extensions/resource_monitors/cpu_utilization/v3/cpu_utilization.pb.validate.h"
#include "envoy/server/resource_monitor_config.h"

#include "source/extensions/resource_monitors/common/factory_base.h"

namespace Envoy {
namespace Extensions {
namespace ResourceMonitors {
namespace CpuUtilizationMonitor {

/**
 * Resource monitor factory for the CPU utilization monitor.
 *
 * The factory is typed on the CpuUtilizationConfig proto and hands the extension name
 * "envoy.resource_monitors.cpu_utilization" to Common::FactoryBase.
 */
class CpuUtilizationMonitorFactory
    : public Common::FactoryBase<
          envoy::extensions::resource_monitors::cpu_utilization::v3::CpuUtilizationConfig> {
public:
  CpuUtilizationMonitorFactory() : FactoryBase("envoy.resource_monitors.cpu_utilization") {}

private:
  /**
   * Creates the resource monitor from the already-typed configuration proto.
   *
   * @param config the CpuUtilizationConfig for the monitor being created.
   * @param context the resource monitor factory context supplied by the caller.
   * @return the created resource monitor.
   */
  Server::ResourceMonitorPtr createResourceMonitorFromProtoTyped(
      const envoy::extensions::resource_monitors::cpu_utilization::v3::CpuUtilizationConfig&
          config,
      Server::Configuration::ResourceMonitorFactoryContext& context) override;
};

} // namespace CpuUtilizationMonitor
} // namespace ResourceMonitors
} // namespace Extensions
} // namespace Envoy
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
#pragma once

#include <dirent.h>
#include <unistd.h>

#include <algorithm>
#include <filesystem>
#include <fstream>

#include "source/common/common/logger.h"

#include "absl/strings/str_split.h"

namespace Envoy {
namespace Extensions {
namespace ResourceMonitors {
namespace CpuUtilizationMonitor {

/**
 * A single sample of CPU time counters, as returned by CpuStatsReader::getCpuTimes().
 *
 * Every member has a default initializer so that a default-constructed CpuTimes is a
 * well-defined "invalid, zeroed" sample rather than holding indeterminate values. The struct
 * remains an aggregate, so brace initialization such as CpuTimes{true, work, total} still works.
 */
struct CpuTimes {
  // Whether the sample is usable. NOTE(review): presumably set to false by a reader when it
  // cannot obtain or parse the stats; confirm against the concrete reader.
  bool is_valid{false};
  // CPU time counted as work. NOTE(review): units and exact composition are defined by the
  // concrete CpuStatsReader; confirm there.
  uint64_t work_time{0};
  // Total CPU time for the sample, in the same units as work_time (assumed; confirm against
  // the concrete reader).
  uint64_t total_time{0};
};

/**
 * Interface for a source of CPU time samples.
 *
 * Concrete readers implement getCpuTimes(); the type is meant to be held and destroyed through
 * a pointer to this base, hence the virtual destructor.
 */
class CpuStatsReader {
public:
  CpuStatsReader() = default;
  virtual ~CpuStatsReader() = default;

  /**
   * @return the CpuTimes sample produced by the concrete reader.
   */
  virtual CpuTimes getCpuTimes() = 0;
};

} // namespace CpuUtilizationMonitor
} // namespace ResourceMonitors
} // namespace Extensions
} // namespace Envoy
Loading

0 comments on commit 4d12162

Please sign in to comment.