dsmiley commented on code in PR #4210: URL: https://github.com/apache/solr/pull/4210#discussion_r3054838979
########## solr/monitoring/mixin/dashboards/dashboards.libsonnet: ########## @@ -0,0 +1,725 @@ +// Licensed to the Apache Software Foundation (ASF) under one or more +// contributor license agreements. See the NOTICE file distributed with +// this work for additional information regarding copyright ownership. +// The ASF licenses this file to You under the Apache License, Version 2.0 +// (the "License"); you may not use this file except in compliance with +// the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// dashboards.libsonnet — Solr 10.x Grafana dashboard definition. +// +// Rows: +// Cluster Overview (open by default) — distributed query/update rates, latency, cores, disk +// JVM (open by default) — heap, GC, threads, CPU +// Solr Core (collapsed) — per-core QPS, update rate, latency, commits, optimizes +// SolrCloud (collapsed) — Overseer queues, ZK ops, shard leaders +// Index Health (collapsed) — segments, index size, merge rates, MMap efficiency +// Solr Caches (collapsed) — filter/query/document cache hit rates and evictions + +local config = import '../config.libsonnet'; +local g = import 'github.com/grafana/grafonnet/gen/grafonnet-latest/main.libsonnet'; + +local d = g.dashboard; +local p = g.panel; +local q = g.query.prometheus; +local v = g.dashboard.variable; +local cfg = config._config; + +// ----------------------------------------------------------------------- +// Computed label selectors (uses configurable label names from config.libsonnet) +// ----------------------------------------------------------------------- +local envSel = '%s=~"$environment"' % cfg.environmentLabel; +local clusterSel = '%s=~"$cluster"' % cfg.clusterLabel; +local instSel = '%s=~"$instance"' % cfg.instanceLabel; +local colSel = 'collection=~"$collection",shard=~"$shard",replica_type=~"$replica_type"'; + +// ----------------------------------------------------------------------- +// Template variables (T012) +// Ordered: datasource → environment → cluster → instance → +// collection → shard → replica_type → interval +// ----------------------------------------------------------------------- +local datasourceVar = + v.datasource.new('datasource', 'prometheus') + + v.datasource.generalOptions.withLabel('Data Source'); + +local environmentVar = + v.query.new( + 'environment', + 'label_values(solr_cores_loaded, %s)' % cfg.environmentLabel + ) + + v.query.withDatasourceFromVariable(datasourceVar) + + v.query.selectionOptions.withMulti() + + v.query.selectionOptions.withIncludeAll(value=true, customAllValue='.*') + + v.query.refresh.onTime() + + v.query.generalOptions.withLabel('Environment'); + +local clusterVar = + v.query.new( + 'cluster', + 'label_values(solr_cores_loaded{%s}, %s)' % [envSel, cfg.clusterLabel] + ) + + v.query.withDatasourceFromVariable(datasourceVar) + + v.query.selectionOptions.withMulti() + + v.query.selectionOptions.withIncludeAll(value=true, customAllValue='.*') + + v.query.refresh.onTime() + + v.query.generalOptions.withLabel('Cluster'); + +local instanceVar = + v.query.new( + 'instance', + 'label_values(solr_cores_loaded{%s,%s}, %s)' % [envSel, clusterSel, cfg.instanceLabel] + ) + + v.query.withDatasourceFromVariable(datasourceVar) + + v.query.selectionOptions.withMulti() + + v.query.selectionOptions.withIncludeAll(value=true, customAllValue='.*') + + v.query.refresh.onTime() + + v.query.generalOptions.withLabel('Instance'); + +local collectionVar = + v.query.new( + 'collection', + 'label_values(solr_core_requests_total{%s}, collection)' % instSel + ) + + v.query.withDatasourceFromVariable(datasourceVar) + + v.query.selectionOptions.withMulti() + + v.query.selectionOptions.withIncludeAll(value=true, customAllValue='.*') + + v.query.refresh.onTime() + + v.query.generalOptions.withLabel('Collection'); + +local shardVar = + v.query.new( + 'shard', + 'label_values(solr_core_requests_total{%s,collection=~"$collection"}, shard)' % instSel + ) + + v.query.withDatasourceFromVariable(datasourceVar) + + v.query.selectionOptions.withMulti() + + v.query.selectionOptions.withIncludeAll(value=true, customAllValue='.*') + + v.query.refresh.onTime() + + v.query.generalOptions.withLabel('Shard'); + +local replicaTypeVar = + v.query.new( + 'replica_type', + 'label_values(solr_core_requests_total{%s,collection=~"$collection"}, replica_type)' % instSel + ) + + v.query.withDatasourceFromVariable(datasourceVar) + + v.query.selectionOptions.withMulti() + + v.query.selectionOptions.withIncludeAll(value=true, customAllValue='.*') + + v.query.refresh.onTime() + + v.query.generalOptions.withLabel('Replica Type'); + +local intervalVar = + v.interval.new('interval', ['1m', '5m', '10m', '30m', '1h']) + + v.interval.generalOptions.withCurrent(cfg.defaultRateInterval) + + v.interval.generalOptions.withLabel('Interval'); + +// ----------------------------------------------------------------------- +// Panel builder helpers +// ----------------------------------------------------------------------- +local ts(title, exprs, unit='short', desc='') = + p.timeSeries.new(title) + + p.timeSeries.queryOptions.withTargets(exprs) + + p.timeSeries.standardOptions.withUnit(unit) + + p.timeSeries.panelOptions.withDescription(desc) + + p.timeSeries.options.legend.withDisplayMode('list') + + p.timeSeries.options.tooltip.withMode('multi'); + +local statPanel(title, exprs, unit='short', desc='') = + p.stat.new(title) + + p.stat.queryOptions.withTargets(exprs) + + p.stat.standardOptions.withUnit(unit) + + p.stat.panelOptions.withDescription(desc) + + p.stat.options.withColorMode('value') + + p.stat.options.withGraphMode('none') + + p.stat.options.reduceOptions.withCalcs(['lastNotNull']); + +local gaugePanel(title, exprs, unit='percent', desc='', min=0, max=100, steps=[]) = + p.gauge.new(title) + + p.gauge.queryOptions.withTargets(exprs) + + p.gauge.standardOptions.withUnit(unit) + + p.gauge.standardOptions.withMin(min) + + p.gauge.standardOptions.withMax(max) + + p.gauge.panelOptions.withDescription(desc) + + p.gauge.options.reduceOptions.withCalcs(['lastNotNull']) + + (if std.length(steps) > 0 + then p.gauge.standardOptions.thresholds.withSteps(steps) + p.gauge.standardOptions.color.withMode('thresholds') + else {}); + +local barPanel(title, exprs, unit='short', desc='') = + p.barChart.new(title) + + p.barChart.queryOptions.withTargets(exprs) + + p.barChart.standardOptions.withUnit(unit) + + p.barChart.panelOptions.withDescription(desc); + +local prom(expr, legend='{{instance}}') = + q.new('$datasource', expr) + + q.withLegendFormat(legend) + + q.withInterval('$interval'); + +local promInstant(expr, legend='{{instance}}') = + q.new('$datasource', expr) + + q.withLegendFormat(legend) + + q.withInstant(true); + +local gp(x, y, w, h) = p.timeSeries.gridPos.withX(x) + p.timeSeries.gridPos.withY(y) + + p.timeSeries.gridPos.withW(w) + p.timeSeries.gridPos.withH(h); + +// ----------------------------------------------------------------------- +// Cluster Overview panels (T013) — open by default, y starts at 0 +// ----------------------------------------------------------------------- +local nodeOverviewPanels = [ + p.row.new('Cluster Overview') + + p.row.withCollapsed(false) + + { gridPos: { x: 0, y: 0, w: 24, h: 1 } }, + + ts( + 'Distributed QPS', + [prom( + 'sum by (collection)(rate(solr_core_requests_times_milliseconds_count{%s,%s,%s,category="QUERY",internal="false"}[$interval]))' % [envSel, clusterSel, instSel], + '{{collection}}' + )], + unit='reqps', + desc='Incoming user query requests per second per collection (internal shard fan-out excluded). Shows cluster-wide QPS distribution across collections. Use the collection dropdown to focus on a specific collection.' + ) + { gridPos: { x: 0, y: 1, w: 12, h: 8 } }, + + ts( + 'Search Latency p50 / p95 / p99', + [ + prom( + 'histogram_quantile(0.50, sum by (le, collection)(rate(solr_core_requests_times_milliseconds_bucket{%s,%s,%s,handler=~"/select.*",internal="false"}[$interval])))' % [envSel, clusterSel, instSel], + 'p50 {{collection}}' + ), + prom( + 'histogram_quantile(0.95, sum by (le, collection)(rate(solr_core_requests_times_milliseconds_bucket{%s,%s,%s,handler=~"/select.*",internal="false"}[$interval])))' % [envSel, clusterSel, instSel], + 'p95 {{collection}}' + ), + prom( + 'histogram_quantile(0.99, sum by (le, collection)(rate(solr_core_requests_times_milliseconds_bucket{%s,%s,%s,handler=~"/select.*",internal="false"}[$interval])))' % [envSel, clusterSel, instSel], + 'p99 {{collection}}' + ), + ], + unit='ms', + desc='Search request latency percentiles per collection for /select handlers (user-facing requests only). Alert fires at p99 > 1000ms for 5 minutes (SolrHighSearchLatency).' + ) + { gridPos: { x: 12, y: 1, w: 12, h: 8 } }, + + ts( + 'Total Update Rate', + [prom( + 'sum by (collection)(rate(solr_core_requests_times_milliseconds_count{%s,%s,%s,category="UPDATE"}[$interval]))' % [envSel, clusterSel, instSel], Review Comment: It's absolutely an oversight that nobody stepped up to do this for /update. I've known about it for some time. -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: [email protected] For queries about this service, please contact Infrastructure at: [email protected] --------------------------------------------------------------------- To unsubscribe, e-mail: [email protected] For additional commands, e-mail: [email protected]
