Skip to content

Commit

Permalink
Merge pull request #13248 from tomponline/stable-5.21
Browse files Browse the repository at this point in the history
Backports (stable-5.21)
  • Loading branch information
tomponline authored Apr 2, 2024
2 parents 3ffeadc + 1878cfe commit 6413ccd
Show file tree
Hide file tree
Showing 5 changed files with 110 additions and 69 deletions.
62 changes: 41 additions & 21 deletions lxd/api_metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,7 @@ func metricsGet(d *Daemon, r *http.Request) response.Response {
metricSet := metrics.NewMetricSet(nil)

var projectNames []string

var intMetrics *metrics.MetricSet
err := s.DB.Cluster.Transaction(r.Context(), func(ctx context.Context, tx *db.ClusterTx) error {
// Figure out the projects to retrieve.
if projectName != "" {
Expand All @@ -127,9 +127,8 @@ func metricsGet(d *Daemon, r *http.Request) response.Response {
}
}

// Add internal metrics.
metricSet.Merge(internalMetrics(ctx, s.StartTime, tx))

// Register internal metrics.
intMetrics = internalMetrics(ctx, s.StartTime, tx)
return nil
})
if err != nil {
Expand Down Expand Up @@ -228,23 +227,6 @@ func metricsGet(d *Daemon, r *http.Request) response.Response {
allProjectInstances[instance.Project().Name][instance.Type()]++
}

for project, instanceCountMap := range allProjectInstances {
metricSet.AddSamples(
metrics.Containers,
metrics.Sample{
Labels: map[string]string{"project": project},
Value: float64(instanceCountMap[instancetype.Container]),
},
)
metricSet.AddSamples(
metrics.VMs,
metrics.Sample{
Labels: map[string]string{"project": project},
Value: float64(instanceCountMap[instancetype.VM]),
},
)
}

// Prepare temporary metrics storage.
newMetrics := make(map[string]*metrics.MetricSet, len(projectsToFetch))
newMetricsLock := sync.Mutex{}
Expand All @@ -268,6 +250,12 @@ func metricsGet(d *Daemon, r *http.Request) response.Response {
// Ignore stopped instances.
if !errors.Is(err, instanceDrivers.ErrInstanceIsStopped) {
logger.Warn("Failed getting instance metrics", logger.Ctx{"instance": inst.Name(), "project": projectName, "err": err})
} else {
// If the instance is stopped, we still need to add the project to the cache
// so that its associated counter metrics are fetched.
if newMetrics[projectName] == nil {
newMetrics[projectName] = metrics.NewMetricSet(nil)
}
}
} else {
// Add the metrics.
Expand Down Expand Up @@ -304,8 +292,40 @@ func metricsGet(d *Daemon, r *http.Request) response.Response {
metricsCache = map[string]metricsCacheEntry{}
}

// Add counter metrics for instances to the metric cache.
// We need to create a distinct metric set for each project.
counterMetrics := make(map[string]*metrics.MetricSet, len(allProjectInstances))
for project, instanceCountMap := range allProjectInstances {
counterMetricSetPerProject := metrics.NewMetricSet(nil)
counterMetricSetPerProject.AddSamples(
metrics.Instances,
metrics.Sample{
Labels: map[string]string{"project": project, "type": instancetype.Container.String()},
Value: float64(instanceCountMap[instancetype.Container]),
},
)
counterMetricSetPerProject.AddSamples(
metrics.Instances,
metrics.Sample{
Labels: map[string]string{"project": project, "type": instancetype.VM.String()},
Value: float64(instanceCountMap[instancetype.VM]),
},
)

counterMetrics[project] = counterMetricSetPerProject
}

updatedProjects := []string{}
for project, entries := range newMetrics {
if project == "default" {
entries.Merge(intMetrics) // internal metrics are always considered new. Add them to the default project.
}

counterMetric, ok := counterMetrics[project]
if ok {
entries.Merge(counterMetric)
}

metricsCache[project] = metricsCacheEntry{
expiry: time.Now().Add(cacheDuration),
metrics: entries,
Expand Down
45 changes: 30 additions & 15 deletions lxd/instance/drivers/driver_lxc.go
Original file line number Diff line number Diff line change
Expand Up @@ -8246,14 +8246,11 @@ func (d *lxc) Info() instance.Info {

// Metrics returns the metric set for the LXC driver. It collects various metrics related to memory, CPU, disk, filesystem, and network usage.
func (d *lxc) Metrics(hostInterfaces []net.Interface) (*metrics.MetricSet, error) {
state := instance.PowerStateStopped
isRunning := d.IsRunning()

if isRunning {
state = instance.PowerStateRunning
if !d.IsRunning() {
return nil, ErrInstanceIsStopped
}

out := metrics.NewMetricSet(map[string]string{"project": d.project.Name, "name": d.name, "type": instancetype.Container.String(), "state": state})
out := metrics.NewMetricSet(map[string]string{"project": d.project.Name, "name": d.name, "type": instancetype.Container.String()})

cc, err := d.initLXC(false)
if err != nil {
Expand All @@ -8276,7 +8273,7 @@ func (d *lxc) Metrics(hostInterfaces []net.Interface) (*metrics.MetricSet, error

// Get memory stats.
memStats, err := cg.GetMemoryStats()
if err != nil && isRunning {
if err != nil {
d.logger.Warn("Failed to get memory stats", logger.Ctx{"err": err})
} else {
for k, v := range memStats {
Expand Down Expand Up @@ -8318,7 +8315,7 @@ func (d *lxc) Metrics(hostInterfaces []net.Interface) (*metrics.MetricSet, error

// Get memory usage.
memoryUsage, err := cg.GetMemoryUsage()
if err != nil && isRunning {
if err != nil {
d.logger.Warn("Failed to get memory usage", logger.Ctx{"err": err})
}

Expand All @@ -8330,25 +8327,37 @@ func (d *lxc) Metrics(hostInterfaces []net.Interface) (*metrics.MetricSet, error

// Get oom kills.
oomKills, err := cg.GetOOMKills()
if err != nil && isRunning {
if err != nil {
d.logger.Warn("Failed to get oom kills", logger.Ctx{"err": err})
}

// If we failed to get OOM kills, because of a couple of reasons (instance stopped, cgroup controller not available, etc),
// we default to 0 instead of -1 for the MemoryOOMKillsTotal metric (a total of `-1` would be misleading).
if oomKills < 0 {
oomKills = 0
}

out.AddSamples(metrics.MemoryOOMKillsTotal, metrics.Sample{Value: float64(oomKills)})

// Handle swap.
if d.state.OS.CGInfo.Supports(cgroup.MemorySwapUsage, cg) {
swapUsage, err := cg.GetMemorySwapUsage()
if err != nil && isRunning {
if err != nil {
d.logger.Warn("Failed to get swap usage", logger.Ctx{"err": err})
} else {
// If we failed to get swap memory usage, because of a couple of reasons (instance stopped, cgroup controller not available, etc),
// we default to 0 instead of -1 for the MemorySwapBytes metric (`-1` bytes would be misleading).
if swapUsage < 0 {
swapUsage = 0
}

out.AddSamples(metrics.MemorySwapBytes, metrics.Sample{Value: float64(swapUsage)})
}
}

// Get CPU stats
usage, err := cg.GetCPUAcctUsageAll()
if err != nil && isRunning {
if err != nil {
d.logger.Warn("Failed to get CPU usage", logger.Ctx{"err": err})
} else {
for cpu, stats := range usage {
Expand All @@ -8361,15 +8370,21 @@ func (d *lxc) Metrics(hostInterfaces []net.Interface) (*metrics.MetricSet, error

// Get CPUs.
CPUs, err := cg.GetEffectiveCPUs()
if err != nil && isRunning {
if err != nil {
d.logger.Warn("Failed to get CPUs", logger.Ctx{"err": err})
} else {
// If we failed to get the number of total effective CPUs, because of a couple of reasons (instance stopped, cgroup controller not available, etc),
// we default to 0 instead of -1 for the CPUs metric (a total of `-1` would be misleading).
if CPUs < 0 {
CPUs = 0
}

out.AddSamples(metrics.CPUs, metrics.Sample{Value: float64(CPUs)})
}

// Get disk stats
diskStats, err := cg.GetIOStats()
if err != nil && isRunning {
if err != nil {
d.logger.Warn("Failed to get disk stats", logger.Ctx{"err": err})
} else {
for disk, stats := range diskStats {
Expand All @@ -8384,7 +8399,7 @@ func (d *lxc) Metrics(hostInterfaces []net.Interface) (*metrics.MetricSet, error

// Get filesystem stats
fsStats, err := d.getFSStats()
if err != nil && isRunning {
if err != nil {
d.logger.Warn("Failed to get fs stats", logger.Ctx{"err": err})
} else {
out.Merge(fsStats)
Expand All @@ -8408,7 +8423,7 @@ func (d *lxc) Metrics(hostInterfaces []net.Interface) (*metrics.MetricSet, error

// Get number of processes
pids, err := d.processesState(d.InitPID())
if err != nil && isRunning {
if err != nil {
d.logger.Warn("Failed to get total number of processes", logger.Ctx{"err": err})
} else {
out.AddSamples(metrics.ProcsTotal, metrics.Sample{Value: float64(pids)})
Expand Down
3 changes: 1 addition & 2 deletions lxd/metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -94,8 +94,7 @@ func (m *MetricSet) String() string {
CPUs,
GoGoroutines,
GoHeapObjects,
Containers,
VMs,
Instances,
}

for _, metricType := range metricTypes {
Expand Down
12 changes: 4 additions & 8 deletions lxd/metrics/types.go
Original file line number Diff line number Diff line change
Expand Up @@ -144,10 +144,8 @@ const (
GoOtherSysBytes
// GoNextGCBytes represents the number of heap bytes when next garbage collection will take place.
GoNextGCBytes
// Containers represents the container count.
Containers
// VMs represents the VM count.
VMs
// Instances represents the instance count.
Instances
)

// MetricNames associates a metric type to its name.
Expand Down Expand Up @@ -216,8 +214,7 @@ var MetricNames = map[MetricType]string{
ProcsTotal: "lxd_procs_total",
UptimeSeconds: "lxd_uptime_seconds",
WarningsTotal: "lxd_warnings_total",
Containers: "lxd_containers",
VMs: "lxd_vms",
Instances: "lxd_instances",
}

// MetricHeaders represents the metric headers which contain help messages as specified by OpenMetrics.
Expand Down Expand Up @@ -286,6 +283,5 @@ var MetricHeaders = map[MetricType]string{
ProcsTotal: "# HELP lxd_procs_total The number of running processes.",
UptimeSeconds: "# HELP lxd_uptime_seconds The daemon uptime in seconds.",
WarningsTotal: "# HELP lxd_warnings_total The number of active warnings.",
Containers: "# HELP lxd_containers The number of containers.",
VMs: "# HELP lxd_vms The number of virtual machines.",
Instances: "# HELP lxd_instances The number of instances.",
}
57 changes: 34 additions & 23 deletions test/suites/metrics.sh
Original file line number Diff line number Diff line change
Expand Up @@ -9,17 +9,22 @@ test_metrics() {

# create another container in the non default project
lxc project create foo -c features.images=false -c features.profiles=false
lxc init testimage c3 --project foo
lxc launch testimage c3 --project foo

# c1 metrics should show as the container is running
lxc query "/1.0/metrics" | grep "name=\"c1\""
lxc query "/1.0/metrics?project=default" | grep "name=\"c1\""

# c2 metrics should show the container as stopped
lxc query "/1.0/metrics" | grep "name=\"c2\""
lxc query "/1.0/metrics?project=default" | grep "name=\"c2\""
lxc query "/1.0/metrics" | grep "name=\"c2\"" | grep "state=\"STOPPED\""
lxc query "/1.0/metrics?project=default" | grep "name=\"c2\"" | grep "state=\"STOPPED\""
# c2 metrics should not be shown as the container is stopped
! lxc query "/1.0/metrics" | grep "name=\"c2\"" || false
! lxc query "/1.0/metrics?project=default" | grep "name=\"c2\"" || false

# Check that we can get the count of existing instances.
lxc query /1.0/metrics | grep -xF 'lxd_instances{project="default",type="container"} 2'
lxc query /1.0/metrics | grep -xF 'lxd_instances{project="foo",type="container"} 1'
# Ensure lxd_instances reports VM count properly (0)
lxc query /1.0/metrics | grep -xF 'lxd_instances{project="default",type="virtual-machine"} 0'
lxc query /1.0/metrics | grep -xF 'lxd_instances{project="foo",type="virtual-machine"} 0'

# c3 metrics from another project also show up for non metrics unrestricted certificate
lxc query "/1.0/metrics" | grep "name=\"c3\""
Expand All @@ -39,13 +44,13 @@ test_metrics() {
curl -k -s --cert "${TEST_DIR}/metrics.crt" --key "${TEST_DIR}/metrics.key" -X GET "https://${LXD_ADDR}/1.0/metrics" | grep "name=\"c1\""
curl -k -s --cert "${TEST_DIR}/metrics.crt" --key "${TEST_DIR}/metrics.key" -X GET "https://${LXD_ADDR}/1.0/metrics?project=default" | grep "name=\"c1\""

# c2 metrics should show the container as stopped
curl -k -s --cert "${TEST_DIR}/metrics.crt" --key "${TEST_DIR}/metrics.key" -X GET "https://${LXD_ADDR}/1.0/metrics" | grep "name=\"c2\"" | grep "state=\"STOPPED\""
curl -k -s --cert "${TEST_DIR}/metrics.crt" --key "${TEST_DIR}/metrics.key" -X GET "https://${LXD_ADDR}/1.0/metrics?project=default" | grep "name=\"c2\"" | grep "state=\"STOPPED\""
# c2 metrics should not be shown as the container is stopped
! curl -k -s --cert "${TEST_DIR}/metrics.crt" --key "${TEST_DIR}/metrics.key" -X GET "https://${LXD_ADDR}/1.0/metrics" | grep "name=\"c2\"" || false
! curl -k -s --cert "${TEST_DIR}/metrics.crt" --key "${TEST_DIR}/metrics.key" -X GET "https://${LXD_ADDR}/1.0/metrics?project=default" | grep "name=\"c2\"" || false

# c3 metrics from another project should show the container as stopped for unrestricted certificate
curl -k -s --cert "${TEST_DIR}/metrics.crt" --key "${TEST_DIR}/metrics.key" -X GET "https://${LXD_ADDR}/1.0/metrics" | grep "name=\"c3\"" | grep "state=\"STOPPED\""
curl -k -s --cert "${TEST_DIR}/metrics.crt" --key "${TEST_DIR}/metrics.key" -X GET "https://${LXD_ADDR}/1.0/metrics?project=foo" | grep "name=\"c3\"" | grep "state=\"STOPPED\""
# c3 metrics from another project should be shown for unrestricted certificate
curl -k -s --cert "${TEST_DIR}/metrics.crt" --key "${TEST_DIR}/metrics.key" -X GET "https://${LXD_ADDR}/1.0/metrics" | grep "name=\"c3\""
curl -k -s --cert "${TEST_DIR}/metrics.crt" --key "${TEST_DIR}/metrics.key" -X GET "https://${LXD_ADDR}/1.0/metrics?project=foo" | grep "name=\"c3\""

# internal server metrics should be shown as the certificate is not restricted
curl -k -s --cert "${TEST_DIR}/metrics.crt" --key "${TEST_DIR}/metrics.key" -X GET "https://${LXD_ADDR}/1.0/metrics" | grep -E "^lxd_warnings_total [0-9]+$"
Expand All @@ -58,17 +63,17 @@ test_metrics() {

lxc config set core.metrics_address "${metrics_addr}"

# c1 metrics should show as the container is running
# c1 metrics should be shown as the container is running
curl -k -s --cert "${TEST_DIR}/metrics.crt" --key "${TEST_DIR}/metrics.key" -X GET "https://${metrics_addr}/1.0/metrics" | grep "name=\"c1\""
curl -k -s --cert "${TEST_DIR}/metrics.crt" --key "${TEST_DIR}/metrics.key" -X GET "https://${metrics_addr}/1.0/metrics?project=default" | grep "name=\"c1\""

# c2 metrics should show the container as stopped
curl -k -s --cert "${TEST_DIR}/metrics.crt" --key "${TEST_DIR}/metrics.key" -X GET "https://${metrics_addr}/1.0/metrics" | grep "name=\"c2\"" | grep "state=\"STOPPED\""
curl -k -s --cert "${TEST_DIR}/metrics.crt" --key "${TEST_DIR}/metrics.key" -X GET "https://${metrics_addr}/1.0/metrics?project=default" | grep "name=\"c2\"" | grep "state=\"STOPPED\""
# c2 metrics should not be shown as the container is stopped
! curl -k -s --cert "${TEST_DIR}/metrics.crt" --key "${TEST_DIR}/metrics.key" -X GET "https://${metrics_addr}/1.0/metrics" | grep "name=\"c2\"" || false
! curl -k -s --cert "${TEST_DIR}/metrics.crt" --key "${TEST_DIR}/metrics.key" -X GET "https://${metrics_addr}/1.0/metrics?project=default" | grep "name=\"c2\"" || false

# c3 metrics from another project should show the container as stopped for unrestricted metrics certificate
curl -k -s --cert "${TEST_DIR}/metrics.crt" --key "${TEST_DIR}/metrics.key" -X GET "https://${LXD_ADDR}/1.0/metrics" | grep "name=\"c3\"" | grep "state=\"STOPPED\""
curl -k -s --cert "${TEST_DIR}/metrics.crt" --key "${TEST_DIR}/metrics.key" -X GET "https://${LXD_ADDR}/1.0/metrics?project=foo" | grep "name=\"c3\"" | grep "state=\"STOPPED\""
# c3 metrics from another project should be shown for unrestricted metrics certificate
curl -k -s --cert "${TEST_DIR}/metrics.crt" --key "${TEST_DIR}/metrics.key" -X GET "https://${LXD_ADDR}/1.0/metrics" | grep "name=\"c3\""
curl -k -s --cert "${TEST_DIR}/metrics.crt" --key "${TEST_DIR}/metrics.key" -X GET "https://${LXD_ADDR}/1.0/metrics?project=foo" | grep "name=\"c3\""

# internal server metrics should be shown as the certificate is not restricted
curl -k -s --cert "${TEST_DIR}/metrics.crt" --key "${TEST_DIR}/metrics.key" -X GET "https://${metrics_addr}/1.0/metrics" | grep -E "^lxd_warnings_total [0-9]+$"
Expand All @@ -84,18 +89,24 @@ test_metrics() {
lxc config trust add "${TEST_DIR}/metrics-restricted.crt" --type=metrics --restricted --projects foo
lxc config trust show "$(openssl x509 -in "${TEST_DIR}/metrics-restricted.crt" -outform der | sha256sum | head -c12)" | grep -xF "restricted: true"

# c3 metrics should show the container as stopped for restricted metrics certificate
curl -k -s --cert "${TEST_DIR}/metrics-restricted.crt" --key "${TEST_DIR}/metrics-restricted.key" -X GET "https://${LXD_ADDR}/1.0/metrics?project=foo" | grep "name=\"c3\"" | grep "state=\"STOPPED\""
# c3 metrics should be shown
curl -k -s --cert "${TEST_DIR}/metrics-restricted.crt" --key "${TEST_DIR}/metrics-restricted.key" -X GET "https://${LXD_ADDR}/1.0/metrics?project=foo" | grep "name=\"c3\""

# c3 metrics for the stopped container cannot be viewed via the generic metrics endpoint if the certificate is restricted
# c3 metrics cannot be viewed via the generic metrics endpoint if the certificate is restricted
! curl -k -s --cert "${TEST_DIR}/metrics-restricted.crt" --key "${TEST_DIR}/metrics-restricted.key" -X GET "https://${LXD_ADDR}/1.0/metrics"

# other projects metrics aren't visible as they aren't allowed for the restricted certificate
! curl -k -s --cert "${TEST_DIR}/metrics-restricted.crt" --key "${TEST_DIR}/metrics-restricted.key" -X GET "https://${LXD_ADDR}/1.0/metrics?project=default"

# c1 and c2 metrics are not visible as they are in another project
! curl -k -s --cert "${TEST_DIR}/metrics-restricted.crt" --key "${TEST_DIR}/metrics-restricted.key" -X GET "https://${metrics_addr}/1.0/metrics?project=foo" | grep "name=\"c1\""
! curl -k -s --cert "${TEST_DIR}/metrics-restricted.crt" --key "${TEST_DIR}/metrics-restricted.key" -X GET "https://${metrics_addr}/1.0/metrics?project=foo" | grep "name=\"c2\"" | grep "state=\"STOPPED\""
! curl -k -s --cert "${TEST_DIR}/metrics-restricted.crt" --key "${TEST_DIR}/metrics-restricted.key" -X GET "https://${metrics_addr}/1.0/metrics?project=foo" | grep "name=\"c2\""

# Check that we can get the count of existing containers. There should be two in the default project: c1 (RUNNING) and c2 (STOPPED).
curl -k -s --cert "${TEST_DIR}/metrics.crt" --key "${TEST_DIR}/metrics.key" -X GET "https://${metrics_addr}/1.0/metrics" | grep -xF 'lxd_instances{project="default",type="container"} 2'
sleep 10
# Try again after the metric cache has expired. We should still see two containers.
curl -k -s --cert "${TEST_DIR}/metrics.crt" --key "${TEST_DIR}/metrics.key" -X GET "https://${metrics_addr}/1.0/metrics" | grep -xF 'lxd_instances{project="default",type="container"} 2'

# test unauthenticated connections
! curl -k -s -X GET "https://${metrics_addr}/1.0/metrics" | grep "name=\"c1\"" || false
Expand Down

0 comments on commit 6413ccd

Please sign in to comment.