Use metrics namespace for more metrics (#11025)
**What this PR does / why we need it**:

Use the metrics namespace setting instead of hardcoding to `cortex`.
This is a follow-up to (and based on)
#11014.
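
The pattern is the same throughout the diff: metrics that were already Loki-native swap the `"loki"` string literal for the shared `constants.Loki` constant, while registerers that used to be wrapped with a hardcoded `cortex_` prefix now take the configurable namespace. A minimal Go sketch of both patterns — the local `Loki` constant and the function names are illustrative stand-ins, not the exact Loki source:

```go
package example

import (
	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"
)

// Loki stands in for the pkg/util/constants.Loki value used throughout the diff.
const Loki = "loki"

// newQueueDuration: metrics that were already Loki-native just replace the
// "loki" string literal with the shared constant.
func newQueueDuration(reg prometheus.Registerer, subsystem string) prometheus.Histogram {
	return promauto.With(reg).NewHistogram(prometheus.HistogramOpts{
		Namespace: Loki, // was: "loki"
		Subsystem: subsystem,
		Name:      "queue_duration_seconds",
		Help:      "Time spent by tasks in queue before getting picked up by a worker.",
		Buckets:   prometheus.DefBuckets,
	})
}

// wrapRingRegisterer: metrics that used to be prefixed with "cortex_" now take
// the prefix from the configurable metricsNamespace setting instead.
func wrapRingRegisterer(reg prometheus.Registerer, metricsNamespace string) prometheus.Registerer {
	return prometheus.WrapRegistererWithPrefix(metricsNamespace+"_", reg) // was: "cortex_"
}
```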

**Checklist**
- [x] Reviewed the
[`CONTRIBUTING.md`](https://github.com/grafana/loki/blob/main/CONTRIBUTING.md)
guide (**required**)
- [ ] Documentation added
- [X] Tests updated
- [x] `CHANGELOG.md` updated
- [ ] If the change is worth mentioning in the release notes, add
`add-to-release-notes` label
- [ ] Changes that require user attention or interaction to upgrade are
documented in `docs/sources/setup/upgrade/_index.md`
- [ ] For Helm chart changes bump the Helm chart version in
`production/helm/loki/Chart.yaml` and update
`production/helm/loki/CHANGELOG.md` and
`production/helm/loki/README.md`. [Example
PR](d10549e)
- [ ] If the change is deprecating or removing a configuration option,
update the `deprecated-config.yaml` and `deleted-config.yaml` files
respectively in the `tools/deprecated-config-checker` directory. <!--
TODO(salvacorts): Add example PR -->

---------

Signed-off-by: Michel Hollands <[email protected]>
Co-authored-by: Ashwanth <[email protected]>
MichelHollands and ashwanthgoli authored Oct 30, 2023
1 parent a118e99 commit 8628b15
Showing 107 changed files with 592 additions and 419 deletions.
1 change: 1 addition & 0 deletions CHANGELOG.md
@@ -6,6 +6,7 @@

##### Enhancements

* [11003](https://github.com/grafana/loki/pull/11003) **MichelHollands**: Add the `metrics-namespace` flag to change the namespace of metrics currently using cortex as namespace.
* [11038](https://github.com/grafana/loki/pull/11038) **kavirajk**: Remove already deprecated `store.max-look-back-period`.
* [10906](https://github.com/grafana/loki/pull/10906) **kavirajk**: Support Loki ruler to notify WAL writes to remote storage.
* [10613](https://github.com/grafana/loki/pull/10613) **ngc4579**: Helm: allow GrafanaAgent tolerations
5 changes: 3 additions & 2 deletions cmd/migrate/main.go
@@ -48,6 +48,7 @@ func main() {
batch := flag.Int("batchLen", 500, "Specify how many chunks to read/write in one batch")
shardBy := flag.Duration("shardBy", 6*time.Hour, "Break down the total interval into shards of this size, making this too small can lead to syncing a lot of duplicate chunks")
parallel := flag.Int("parallel", 8, "How many parallel threads to process each shard")
metricsNamespace := flag.String("metrics.namespace", "cortex", "Namespace of the generated metrics")
flag.Parse()

go func() {
@@ -127,7 +128,7 @@ func main() {
// Create a new registerer to avoid registering duplicate metrics
prometheus.DefaultRegisterer = prometheus.NewRegistry()
clientMetrics := storage.NewClientMetrics()
s, err := storage.NewStore(sourceConfig.StorageConfig, sourceConfig.ChunkStoreConfig, sourceConfig.SchemaConfig, limits, clientMetrics, prometheus.DefaultRegisterer, util_log.Logger)
s, err := storage.NewStore(sourceConfig.StorageConfig, sourceConfig.ChunkStoreConfig, sourceConfig.SchemaConfig, limits, clientMetrics, prometheus.DefaultRegisterer, util_log.Logger, *metricsNamespace)
if err != nil {
log.Println("Failed to create source store:", err)
os.Exit(1)
@@ -136,7 +137,7 @@
// Create a new registerer to avoid registering duplicate metrics
prometheus.DefaultRegisterer = prometheus.NewRegistry()

d, err := storage.NewStore(destConfig.StorageConfig, destConfig.ChunkStoreConfig, destConfig.SchemaConfig, limits, clientMetrics, prometheus.DefaultRegisterer, util_log.Logger)
d, err := storage.NewStore(destConfig.StorageConfig, destConfig.ChunkStoreConfig, destConfig.SchemaConfig, limits, clientMetrics, prometheus.DefaultRegisterer, util_log.Logger, *metricsNamespace)
if err != nil {
log.Println("Failed to create destination store:", err)
os.Exit(1)
4 changes: 4 additions & 0 deletions docs/sources/configure/_index.md
@@ -224,6 +224,10 @@ Pass the `-config.expand-env` flag at the command line to enable this way of set
# will report 503 Service Unavailable status via /ready endpoint.
# CLI flag: -shutdown-delay
[shutdown_delay: <duration> | default = 0s]

# Namespace of the metrics that in previous releases had cortex as namespace.
# CLI flag: -metrics-namespace
[metrics_namespace: <string> | default = "cortex"]
```
### server
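
The docs hunk above documents the new `metrics_namespace` option (CLI flag `-metrics-namespace`, default `"cortex"`). A small, self-contained Go sketch — not part of the PR itself — showing the effect the setting has on exported metric names; the `ring_members` metric name is purely illustrative:

```go
package main

import (
	"flag"
	"fmt"

	"github.com/prometheus/client_golang/prometheus"
	"github.com/prometheus/client_golang/prometheus/promauto"
)

func main() {
	// Mirrors the new option: "cortex" remains the default for compatibility.
	metricsNamespace := flag.String("metrics-namespace", "cortex",
		"Namespace of the metrics that in previous releases had cortex as namespace.")
	flag.Parse()

	reg := prometheus.NewRegistry()

	// Components that used to hardcode the "cortex_" prefix now register
	// through a registerer wrapped with the configured namespace.
	wrapped := prometheus.WrapRegistererWithPrefix(*metricsNamespace+"_", reg)
	promauto.With(wrapped).NewCounter(prometheus.CounterOpts{
		Name: "ring_members", // illustrative metric name, not taken from the diff
		Help: "Example counter registered through the prefixed registerer.",
	}).Inc()

	// Prints "cortex_ring_members" by default, or "loki_ring_members"
	// when run with -metrics-namespace=loki.
	families, _ := reg.Gather()
	for _, mf := range families {
		fmt.Println(mf.GetName())
	}
}
```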
7 changes: 4 additions & 3 deletions pkg/bloomgateway/bloomgateway.go
@@ -59,6 +59,7 @@ import (
"github.com/grafana/loki/pkg/storage/config"
"github.com/grafana/loki/pkg/storage/stores/shipper/bloomshipper"
"github.com/grafana/loki/pkg/util"
"github.com/grafana/loki/pkg/util/constants"
)

var errGatewayUnhealthy = errors.New("bloom-gateway is unhealthy in the ring")
@@ -79,14 +80,14 @@ type metrics struct {
func newMetrics(subsystem string, registerer prometheus.Registerer) *metrics {
return &metrics{
queueDuration: promauto.With(registerer).NewHistogram(prometheus.HistogramOpts{
Namespace: "loki",
Namespace: constants.Loki,
Subsystem: subsystem,
Name: "queue_duration_seconds",
Help: "Time spent by tasks in queue before getting picked up by a worker.",
Buckets: prometheus.DefBuckets,
}),
inflightRequests: promauto.With(registerer).NewSummary(prometheus.SummaryOpts{
Namespace: "loki",
Namespace: constants.Loki,
Subsystem: subsystem,
Name: "inflight_tasks",
Help: "Number of inflight tasks (either queued or processing) sampled at a regular interval. Quantile buckets keep track of inflight tasks over the last 60s.",
@@ -195,7 +196,7 @@ func New(cfg Config, schemaCfg config.SchemaConfig, storageCfg storage.Config, s
pendingTasks: makePendingTasks(pendingTasksInitialCap),
}

g.queueMetrics = queue.NewMetrics("bloom_gateway", reg)
g.queueMetrics = queue.NewMetrics(reg, constants.Loki, "bloom_gateway")
g.queue = queue.NewRequestQueue(maxTasksPerTenant, time.Minute, g.queueMetrics)
g.activeUsers = util.NewActiveUsersCleanupWithDefaultValues(g.queueMetrics.Cleanup)

3 changes: 2 additions & 1 deletion pkg/bloomgateway/client.go
@@ -24,6 +24,7 @@ import (
"github.com/grafana/loki/pkg/distributor/clientpool"
"github.com/grafana/loki/pkg/logproto"
"github.com/grafana/loki/pkg/util"
"github.com/grafana/loki/pkg/util/constants"
)

// GRPCPool represents a pool of gRPC connections to different bloom gateway instances.
@@ -94,7 +95,7 @@ type GatewayClient struct {

func NewGatewayClient(cfg ClientConfig, limits Limits, registerer prometheus.Registerer, logger log.Logger) (*GatewayClient, error) {
latency := promauto.With(registerer).NewHistogramVec(prometheus.HistogramOpts{
Namespace: "loki",
Namespace: constants.Loki,
Subsystem: "bloom_gateway",
Name: "request_duration_seconds",
Help: "Time (in seconds) spent serving requests when using the bloom gateway",
4 changes: 2 additions & 2 deletions pkg/compactor/compactor.go
@@ -202,7 +202,7 @@ type Limits interface {
DefaultLimits() *validation.Limits
}

func NewCompactor(cfg Config, objectStoreClients map[config.DayTime]client.ObjectClient, deleteStoreClient client.ObjectClient, schemaConfig config.SchemaConfig, limits Limits, r prometheus.Registerer) (*Compactor, error) {
func NewCompactor(cfg Config, objectStoreClients map[config.DayTime]client.ObjectClient, deleteStoreClient client.ObjectClient, schemaConfig config.SchemaConfig, limits Limits, r prometheus.Registerer, metricsNamespace string) (*Compactor, error) {
retentionEnabledStats.Set("false")
if cfg.RetentionEnabled {
retentionEnabledStats.Set("true")
@@ -245,7 +245,7 @@ func NewCompactor(cfg Config, objectStoreClients map[config.DayTime]client.Objec
}

ringCfg := cfg.CompactorRing.ToRingConfig(ringReplicationFactor)
compactor.ring, err = ring.NewWithStoreClientAndStrategy(ringCfg, ringNameForServer, ringKey, ringStore, ring.NewIgnoreUnhealthyInstancesReplicationStrategy(), prometheus.WrapRegistererWithPrefix("cortex_", r), util_log.Logger)
compactor.ring, err = ring.NewWithStoreClientAndStrategy(ringCfg, ringNameForServer, ringKey, ringStore, ring.NewIgnoreUnhealthyInstancesReplicationStrategy(), prometheus.WrapRegistererWithPrefix(metricsNamespace+"_", r), util_log.Logger)
if err != nil {
return nil, errors.Wrap(err, "create ring client")
}
3 changes: 2 additions & 1 deletion pkg/compactor/compactor_test.go
@@ -16,6 +16,7 @@ import (
"github.com/grafana/loki/pkg/storage/chunk/client"
"github.com/grafana/loki/pkg/storage/chunk/client/local"
"github.com/grafana/loki/pkg/storage/config"
"github.com/grafana/loki/pkg/util/constants"
loki_net "github.com/grafana/loki/pkg/util/net"
)

@@ -49,7 +50,7 @@ func setupTestCompactor(t *testing.T, objectClients map[config.DayTime]client.Ob

c, err := NewCompactor(cfg, objectClients, nil, config.SchemaConfig{
Configs: periodConfigs,
}, nil, nil)
}, nil, nil, constants.Loki)
require.NoError(t, err)

c.RegisterIndexCompactor("dummy", testIndexCompactor{})
22 changes: 12 additions & 10 deletions pkg/compactor/deletion/metrics.go
@@ -3,6 +3,8 @@ package deletion
import (
"github.com/prometheus/client_golang/prometheus"
"github.com/prometheus/client_golang/prometheus/promauto"

"github.com/grafana/loki/pkg/util/constants"
)

type DeleteRequestClientMetrics struct {
@@ -14,13 +16,13 @@ func NewDeleteRequestClientMetrics(r prometheus.Registerer) *DeleteRequestClient
m := DeleteRequestClientMetrics{}

m.deleteRequestsLookupsTotal = promauto.With(r).NewCounter(prometheus.CounterOpts{
Namespace: "loki",
Namespace: constants.Loki,
Name: "delete_request_lookups_total",
Help: "Number times the client has looked up delete requests",
})

m.deleteRequestsLookupsFailedTotal = promauto.With(r).NewCounter(prometheus.CounterOpts{
Namespace: "loki",
Namespace: constants.Loki,
Name: "delete_request_lookups_failed_total",
Help: "Number times the client has failed to look up delete requests",
})
@@ -36,7 +38,7 @@ func newDeleteRequestHandlerMetrics(r prometheus.Registerer) *deleteRequestHandl
m := deleteRequestHandlerMetrics{}

m.deleteRequestsReceivedTotal = promauto.With(r).NewCounterVec(prometheus.CounterOpts{
Namespace: "loki",
Namespace: constants.Loki,
Name: "compactor_delete_requests_received_total",
Help: "Number of delete requests received per user",
}, []string{"user"})
@@ -58,37 +60,37 @@ func newDeleteRequestsManagerMetrics(r prometheus.Registerer) *deleteRequestsMan
m := deleteRequestsManagerMetrics{}

m.deleteRequestsProcessedTotal = promauto.With(r).NewCounterVec(prometheus.CounterOpts{
Namespace: "loki",
Namespace: constants.Loki,
Name: "compactor_delete_requests_processed_total",
Help: "Number of delete requests processed per user",
}, []string{"user"})
m.deleteRequestsChunksSelectedTotal = promauto.With(r).NewCounterVec(prometheus.CounterOpts{
Namespace: "loki",
Namespace: constants.Loki,
Name: "compactor_delete_requests_chunks_selected_total",
Help: "Number of chunks selected while building delete plans per user",
}, []string{"user"})
m.deletionFailures = promauto.With(r).NewCounterVec(prometheus.CounterOpts{
Namespace: "loki",
Namespace: constants.Loki,
Name: "compactor_delete_processing_fails_total",
Help: "Number of times the delete phase of compaction has failed",
}, []string{"cause"})
m.loadPendingRequestsAttemptsTotal = promauto.With(r).NewCounterVec(prometheus.CounterOpts{
Namespace: "loki",
Namespace: constants.Loki,
Name: "compactor_load_pending_requests_attempts_total",
Help: "Number of attempts that were made to load pending requests with status",
}, []string{"status"})
m.oldestPendingDeleteRequestAgeSeconds = promauto.With(r).NewGauge(prometheus.GaugeOpts{
Namespace: "loki",
Namespace: constants.Loki,
Name: "compactor_oldest_pending_delete_request_age_seconds",
Help: "Age of oldest pending delete request in seconds since they are over their cancellation period",
})
m.pendingDeleteRequestsCount = promauto.With(r).NewGauge(prometheus.GaugeOpts{
Namespace: "loki",
Namespace: constants.Loki,
Name: "compactor_pending_delete_requests_count",
Help: "Count of delete requests which are over their cancellation period and have not finished processing yet",
})
m.deletedLinesTotal = promauto.With(r).NewCounterVec(prometheus.CounterOpts{
Namespace: "loki",
Namespace: constants.Loki,
Name: "compactor_deleted_lines",
Help: "Number of deleted lines per user",
}, []string{"user"})
4 changes: 3 additions & 1 deletion pkg/compactor/generationnumber/metrics.go
@@ -2,6 +2,8 @@ package generationnumber

import (
"github.com/prometheus/client_golang/prometheus"

"github.com/grafana/loki/pkg/util/constants"
)

// Make this package level because we want several instances of a loader to be able to report metrics
@@ -21,7 +23,7 @@ func newGenLoaderMetrics(r prometheus.Registerer) *genLoaderMetrics {
}

cacheGenLoadFailures := prometheus.NewCounterVec(prometheus.CounterOpts{
Namespace: "loki",
Namespace: constants.Loki,
Name: "delete_cache_gen_load_failures_total",
Help: "Total number of failures while loading cache generation number using gen number loader",
}, []string{"source"})
2 changes: 1 addition & 1 deletion pkg/distributor/clientpool/ingester_client_pool.go
@@ -38,6 +38,6 @@ func NewPool(name string, cfg PoolConfig, ring ring.ReadRing, factory ring_clien
HealthCheckTimeout: cfg.RemoteTimeout,
}

// TODO(chaudum): Allow cofiguration of metric name by the caller.
// TODO(chaudum): Allow configuration of metric name by the caller.
return ring_client.NewPool(name, poolCfg, ring_client.NewRingServiceDiscovery(ring), factory, clients, logger)
}
18 changes: 10 additions & 8 deletions pkg/distributor/distributor.go
@@ -43,6 +43,7 @@ import (
"github.com/grafana/loki/pkg/logql/syntax"
"github.com/grafana/loki/pkg/runtime"
"github.com/grafana/loki/pkg/util"
"github.com/grafana/loki/pkg/util/constants"
util_log "github.com/grafana/loki/pkg/util/log"
lokiring "github.com/grafana/loki/pkg/util/ring"
"github.com/grafana/loki/pkg/validation"
@@ -133,6 +134,7 @@ func New(
ingestersRing ring.ReadRing,
overrides Limits,
registerer prometheus.Registerer,
metricsNamespace string,
) (*Distributor, error) {
factory := cfg.factory
if factory == nil {
@@ -178,22 +180,22 @@
healthyInstancesCount: atomic.NewUint32(0),
rateLimitStrat: rateLimitStrat,
ingesterAppends: promauto.With(registerer).NewCounterVec(prometheus.CounterOpts{
Namespace: "loki",
Namespace: constants.Loki,
Name: "distributor_ingester_appends_total",
Help: "The total number of batch appends sent to ingesters.",
}, []string{"ingester"}),
ingesterAppendTimeouts: promauto.With(registerer).NewCounterVec(prometheus.CounterOpts{
Namespace: "loki",
Namespace: constants.Loki,
Name: "distributor_ingester_append_timeouts_total",
Help: "The total number of failed batch appends sent to ingesters due to timeouts.",
}, []string{"ingester"}),
replicationFactor: promauto.With(registerer).NewGauge(prometheus.GaugeOpts{
Namespace: "loki",
Namespace: constants.Loki,
Name: "distributor_replication_factor",
Help: "The configured replication factor.",
}),
streamShardCount: promauto.With(registerer).NewCounter(prometheus.CounterOpts{
Namespace: "loki",
Namespace: constants.Loki,
Name: "stream_sharding_count",
Help: "Total number of times the distributor has sharded streams",
}),
@@ -203,7 +205,7 @@
if overrides.IngestionRateStrategy() == validation.GlobalIngestionRateStrategy {
d.rateLimitStrat = validation.GlobalIngestionRateStrategy

distributorsRing, distributorsLifecycler, err = newRingAndLifecycler(cfg.DistributorRing, d.healthyInstancesCount, util_log.Logger, registerer)
distributorsRing, distributorsLifecycler, err = newRingAndLifecycler(cfg.DistributorRing, d.healthyInstancesCount, util_log.Logger, registerer, metricsNamespace)
if err != nil {
return nil, err
}
@@ -731,7 +733,7 @@ func calculateShards(rate int64, pushSize, desiredRate int) int {
}

// newRingAndLifecycler creates a new distributor ring and lifecycler with all required lifecycler delegates
func newRingAndLifecycler(cfg RingConfig, instanceCount *atomic.Uint32, logger log.Logger, reg prometheus.Registerer) (*ring.Ring, *ring.BasicLifecycler, error) {
func newRingAndLifecycler(cfg RingConfig, instanceCount *atomic.Uint32, logger log.Logger, reg prometheus.Registerer, metricsNamespace string) (*ring.Ring, *ring.BasicLifecycler, error) {
kvStore, err := kv.NewClient(cfg.KVStore, ring.GetCodec(), kv.RegistererWithKVName(reg, "distributor-lifecycler"), logger)
if err != nil {
return nil, nil, errors.Wrap(err, "failed to initialize distributors' KV store")
@@ -748,12 +750,12 @@ func newRingAndLifecycler(cfg RingConfig, instanceCount *atomic.Uint32, logger l
delegate = ring.NewLeaveOnStoppingDelegate(delegate, logger)
delegate = ring.NewAutoForgetDelegate(ringAutoForgetUnhealthyPeriods*cfg.HeartbeatTimeout, delegate, logger)

distributorsLifecycler, err := ring.NewBasicLifecycler(lifecyclerCfg, "distributor", ringKey, kvStore, delegate, logger, prometheus.WrapRegistererWithPrefix("cortex_", reg))
distributorsLifecycler, err := ring.NewBasicLifecycler(lifecyclerCfg, "distributor", ringKey, kvStore, delegate, logger, prometheus.WrapRegistererWithPrefix(metricsNamespace+"_", reg))
if err != nil {
return nil, nil, errors.Wrap(err, "failed to initialize distributors' lifecycler")
}

distributorsRing, err := ring.New(cfg.ToRingConfig(), "distributor", ringKey, logger, prometheus.WrapRegistererWithPrefix("cortex_", reg))
distributorsRing, err := ring.New(cfg.ToRingConfig(), "distributor", ringKey, logger, prometheus.WrapRegistererWithPrefix(metricsNamespace+"_", reg))
if err != nil {
return nil, nil, errors.Wrap(err, "failed to initialize distributors' ring client")
}
3 changes: 2 additions & 1 deletion pkg/distributor/distributor_test.go
@@ -33,6 +33,7 @@ import (
"github.com/grafana/loki/pkg/logproto"
"github.com/grafana/loki/pkg/logql/syntax"
"github.com/grafana/loki/pkg/runtime"
"github.com/grafana/loki/pkg/util/constants"
fe "github.com/grafana/loki/pkg/util/flagext"
loki_flagext "github.com/grafana/loki/pkg/util/flagext"
util_log "github.com/grafana/loki/pkg/util/log"
@@ -1159,7 +1160,7 @@ func prepare(t *testing.T, numDistributors, numIngesters int, limits *validation
overrides, err := validation.NewOverrides(*limits, nil)
require.NoError(t, err)

d, err := New(distributorConfig, clientConfig, runtime.DefaultTenantConfigs(), ingestersRing, overrides, prometheus.NewPedanticRegistry())
d, err := New(distributorConfig, clientConfig, runtime.DefaultTenantConfigs(), ingestersRing, overrides, prometheus.NewPedanticRegistry(), constants.Loki)
require.NoError(t, err)
require.NoError(t, services.StartAndAwaitRunning(context.Background(), d))
distributors[i] = d