Merge branch 'main' into bump-go-1.21.6
hainenber authored Feb 7, 2024
2 parents ca7db55 + 8d9fa85 commit e4e988a
Showing 23 changed files with 1,312 additions and 77 deletions.
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,8 @@ Main (unreleased)
- A new `otelcol.processor.resourcedetection` component which inserts resource attributes
to OTLP telemetry based on the host on which Grafana Agent is running. (@ptodev)

- Expose `track_timestamps_staleness` on Prometheus scraping to fix the issue where container metrics remain for 5 minutes after the container disappears. (@ptodev)

### Enhancements

- Include line numbers in profiles produced by `pyroscope.java` component. (@korniltsev)
Expand All @@ -51,6 +53,8 @@ Main (unreleased)

- `service_name` label is inferred from discovery meta labels in `pyroscope.java` (@korniltsev)

- Mutex and block pprofs are now available via the pprof endpoint. (@mattdurham)

### Bugfixes

- Fix an issue in `remote.s3` where the exported content of an object would be an empty string if `remote.s3` failed to fully retrieve
Expand Down
36 changes: 36 additions & 0 deletions cmd/internal/flowmode/cmd_run.go
Expand Up @@ -8,6 +8,8 @@ import (
"os"
"os/signal"
"path/filepath"
"runtime"
"strconv"
"strings"
"sync"
"syscall"
Expand Down Expand Up @@ -177,6 +179,9 @@ func (fr *flowRun) Run(configPath string) error {

level.Info(l).Log("boringcrypto enabled", boringcrypto.Enabled)

// Enable mutex and block profiling.
setMutexBlockProfiling(l)

// Immediately start the tracer.
go func() {
err := t.Run(ctx)
Expand Down Expand Up @@ -450,3 +455,34 @@ func splitPeers(s, sep string) []string {
}
return strings.Split(s, sep)
}

func setMutexBlockProfiling(l log.Logger) {
mutexPercent := os.Getenv("PPROF_MUTEX_PROFILING_PERCENT")
if mutexPercent != "" {
rate, err := strconv.Atoi(mutexPercent)
if err == nil && rate > 0 {
// The value is interpreted as 1/rate, so convert the percentage with 100/rate:
// e.g. 50 becomes 100/50 = 2, meaning 1 in 2 events (50%) are sampled.
runtime.SetMutexProfileFraction(100 / rate)
} else {
level.Error(l).Log("msg", "error setting PPROF_MUTEX_PROFILING_PERCENT", "err", err, "value", mutexPercent)
runtime.SetMutexProfileFraction(1000)
}
} else {
// Default to 1000 (0.1% sampling), matching Istio's default, which seemed a reasonable starting point.
runtime.SetMutexProfileFraction(1000)
}
blockRate := os.Getenv("PPROF_BLOCK_PROFILING_RATE")
if blockRate != "" {
rate, err := strconv.Atoi(blockRate)
if err == nil && rate > 0 {
runtime.SetBlockProfileRate(rate)
} else {
level.Error(l).Log("msg", "error setting PPROF_BLOCK_PROFILING_RATE", "err", err, "value", blockRate)
runtime.SetBlockProfileRate(10_000)
}
} else {
// This should have negligible impact: events lasting over 10_000ns are always tracked, and shorter durations are randomly sampled.
// Default taken from https://github.com/DataDog/go-profiler-notes/blob/main/block.md
runtime.SetBlockProfileRate(10_000)
}
}
18 changes: 11 additions & 7 deletions component/prometheus/scrape/scrape.go
Expand Up @@ -51,6 +51,8 @@ type Arguments struct {
HonorLabels bool `river:"honor_labels,attr,optional"`
// Indicator whether the scraped timestamps should be respected.
HonorTimestamps bool `river:"honor_timestamps,attr,optional"`
// Indicator whether to track the staleness of the scraped timestamps.
TrackTimestampsStaleness bool `river:"track_timestamps_staleness,attr,optional"`
// A set of query parameters with which the target is scraped.
Params url.Values `river:"params,attr,optional"`
// Whether to scrape a classic histogram that is also exposed as a native histogram.
Expand Down Expand Up @@ -94,13 +96,14 @@ type Arguments struct {
// SetToDefault implements river.Defaulter.
func (arg *Arguments) SetToDefault() {
*arg = Arguments{
MetricsPath: "/metrics",
Scheme: "http",
HonorLabels: false,
HonorTimestamps: true,
HTTPClientConfig: component_config.DefaultHTTPClientConfig,
ScrapeInterval: 1 * time.Minute, // From config.DefaultGlobalConfig
ScrapeTimeout: 10 * time.Second, // From config.DefaultGlobalConfig
MetricsPath: "/metrics",
Scheme: "http",
HonorLabels: false,
HonorTimestamps: true,
TrackTimestampsStaleness: false,
HTTPClientConfig: component_config.DefaultHTTPClientConfig,
ScrapeInterval: 1 * time.Minute, // From config.DefaultGlobalConfig
ScrapeTimeout: 10 * time.Second, // From config.DefaultGlobalConfig
}
}

Expand Down Expand Up @@ -287,6 +290,7 @@ func getPromScrapeConfigs(jobName string, c Arguments) *config.ScrapeConfig {
}
dec.HonorLabels = c.HonorLabels
dec.HonorTimestamps = c.HonorTimestamps
dec.TrackTimestampsStaleness = c.TrackTimestampsStaleness
dec.Params = c.Params
dec.ScrapeClassicHistograms = c.ScrapeClassicHistograms
dec.ScrapeInterval = model.Duration(c.ScrapeInterval)
Expand Down
1 change: 1 addition & 0 deletions component/prometheus/scrape/scrape_test.go
Expand Up @@ -29,6 +29,7 @@ func TestRiverConfig(t *testing.T) {
forward_to = []
scrape_interval = "10s"
job_name = "local"
track_timestamps_staleness = true
bearer_token = "token"
proxy_url = "http://0.0.0.0:11111"
Expand Down
1 change: 1 addition & 0 deletions converter/internal/prometheusconvert/component/scrape.go
Expand Up @@ -52,6 +52,7 @@ func toScrapeArguments(scrapeConfig *prom_config.ScrapeConfig, forwardTo []stora
JobName: scrapeConfig.JobName,
HonorLabels: scrapeConfig.HonorLabels,
HonorTimestamps: scrapeConfig.HonorTimestamps,
TrackTimestampsStaleness: scrapeConfig.TrackTimestampsStaleness,
Params: scrapeConfig.Params,
ScrapeClassicHistograms: scrapeConfig.ScrapeClassicHistograms,
ScrapeInterval: time.Duration(scrapeConfig.ScrapeInterval),
Expand Down
11 changes: 6 additions & 5 deletions converter/internal/prometheusconvert/testdata/scrape.river
Expand Up @@ -9,11 +9,12 @@ prometheus.scrape "prometheus_1" {
app = "foo",
}],
)
forward_to = [prometheus.remote_write.default.receiver]
job_name = "prometheus-1"
honor_timestamps = false
scrape_interval = "10s"
scrape_timeout = "5s"
forward_to = [prometheus.remote_write.default.receiver]
job_name = "prometheus-1"
honor_timestamps = false
track_timestamps_staleness = true
scrape_interval = "10s"
scrape_timeout = "5s"

basic_auth {
username = "user"
Expand Down
15 changes: 15 additions & 0 deletions docs/sources/flow/reference/components/prometheus.scrape.md
Expand Up @@ -51,6 +51,7 @@ Name | Type | Description | Default | Required
`enable_protobuf_negotiation` | `bool` | Whether to enable protobuf negotiation with the client. | `false` | no
`honor_labels` | `bool` | Indicator whether the scraped metrics should remain unmodified. | `false` | no
`honor_timestamps` | `bool` | Indicator whether the scraped timestamps should be respected. | `true` | no
`track_timestamps_staleness` | `bool` | Indicator whether to track the staleness of the scraped timestamps. | `false` | no
`params` | `map(list(string))` | A set of query parameters with which the target is scraped. | | no
`scrape_classic_histograms` | `bool` | Whether to scrape a classic histogram that is also exposed as a native histogram. | `false` | no
`scrape_interval` | `duration` | How frequently to scrape the targets of this scrape configuration. | `"60s"` | no
Expand All @@ -76,6 +77,20 @@ Name | Type | Description | Default | Required
- [`authorization` block][authorization].
- [`oauth2` block][oauth2].

`track_timestamps_staleness` controls whether Prometheus tracks [staleness][prom-staleness] of metrics that have an explicit timestamp in the scraped data.
* An "explicit timestamp" is an optional timestamp in the [Prometheus metrics exposition format][prom-text-exposition-format]. For example, this sample has a timestamp of `1395066363000`:
```
http_requests_total{method="post",code="200"} 1027 1395066363000
```
* If `track_timestamps_staleness` is set to `true`, a staleness marker will be inserted when a metric is no longer present or the target is down.
* A "staleness marker" is a {{< term "sample" >}}sample{{< /term >}} with a specific NaN value reserved for internal use by Prometheus.
* Setting `track_timestamps_staleness` to `true` is recommended if the database receiving the metrics has [out of order ingestion][mimir-ooo] enabled.
* If `track_timestamps_staleness` is set to `false`, samples with explicit timestamps are only marked stale after a fixed period, which defaults to 5 minutes in Prometheus.

[prom-text-exposition-format]: https://prometheus.io/docs/instrumenting/exposition_formats/#text-based-format
[prom-staleness]: https://prometheus.io/docs/prometheus/latest/querying/basics/#staleness
[mimir-ooo]: https://grafana.com/docs/mimir/latest/configure/configure-out-of-order-samples-ingestion/
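As a sketch, enabling the option in a `prometheus.scrape` block might look like this (the component label, target address, and `forward_to` destination are illustrative):

```river
prometheus.scrape "containers" {
  targets    = [{"__address__" = "cadvisor:8080"}]
  forward_to = [prometheus.remote_write.default.receiver]

  track_timestamps_staleness = true
}
```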

## Blocks

The following blocks are supported inside the definition of `prometheus.scrape`:
Expand Down
2 changes: 1 addition & 1 deletion docs/sources/static/_index.md
Expand Up @@ -10,7 +10,7 @@ weight: 200

# Static mode

Static mode is the original mode of Grafana Agent, and is the most mature.
Static mode is the original mode of Grafana Agent.
Static mode is composed of different _subsystems_:

* The _metrics subsystem_ wraps around Prometheus for collecting Prometheus
Expand Down
14 changes: 11 additions & 3 deletions docs/sources/static/api/_index.md
Expand Up @@ -23,11 +23,11 @@ API endpoints are stable unless otherwise noted.

## Config management API (Beta)

Grafana Agent exposes a config management REST API for managing instance configurations when it is running in [scraping service mode][scrape].
Grafana Agent exposes a configuration management REST API for managing instance configurations when it's running in [scraping service mode][scrape].

{{< admonition type="note" >}}
The scraping service mode is a requirement for the config management
API, however this is not a prerequisite for the Agent API or Ready/Healthy API.
The scraping service mode is a requirement for the configuration management
API, however this isn't a prerequisite for the Agent API or Ready/Healthy API.
{{< /admonition >}}

The following endpoints are exposed:
Expand All @@ -37,6 +37,14 @@ The following endpoints are exposed:
- Update config: [`PUT /agent/api/v1/config/{name}`](#update-config)
- Delete config: [`DELETE /agent/api/v1/config/{name}`](#delete-config)

{{< admonition type="note" >}}
If you are running Grafana Agent in a Docker container and you want to expose the API outside the Docker container, you must change the default HTTP listen address from `127.0.0.1:12345` to a valid network interface address.
You can change the HTTP listen address with the command-line flag: `-server.http.address=0.0.0.0:12345`.
For more information, refer to the [Server](https://grafana.com/docs/agent/latest/static/configuration/flags/#server) command-line flag documentation.

You must also publish the port in Docker. Refer to [Published ports](https://docs.docker.com/network/#published-ports) in the Docker documentation for more information.
{{< /admonition >}}
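Taken together, a Docker invocation that exposes the API might look like the following sketch (the image tag, configuration path, and published port are illustrative):

```shell
docker run \
  -v "$(pwd)/agent.yaml:/etc/agent/agent.yaml" \
  -p 12345:12345 \
  grafana/agent:latest \
  -config.file=/etc/agent/agent.yaml \
  -server.http.address=0.0.0.0:12345
```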

### API response

All Config Management API endpoints will return responses in the following
Expand Down