piby180 opened a new issue, #12888: URL: https://github.com/apache/pinot/issues/12888
Hey all! We are facing a serious high-availability problem, despite having replication across all components.

# Cluster and Table Setup

Some context about our cluster configuration:

1. Our cluster is deployed on AWS EKS using the official Pinot Helm chart
2. Machine type: c6a.8xlarge (32 cores, 64 GB)
3. Machine count: 6
4. Pinot components: 3 servers, 3 controllers, 3 brokers, 3 minions, 5 zookeepers
5. Resources: 3 * server (4 cores, 14 GB, 1000 GB), 3 * controller (4 cores, 14 GB, 100 GB), 3 * broker (4 cores, 14 GB, 100 GB), 5 * zookeeper (4 cores, 14 GB, 100 GB)
6. Pinot version: 1.0.0. Zookeeper version: 3.8.0-5a02a05eddb59aee6ac762f7ea82e92a68eb9c0f, built on 2022-02-25 08:49 UTC (deployed with the Pinot Helm chart)

Here is our values.yaml:

```yaml
image:
  repository: ${CI_REGISTRY_IMAGE}
  tag: "${PINOT_IMAGE_TAG}"
  pullPolicy: Always

imagePullSecrets:
  - name: ${CI_PROJECT_NAME}

cluster:
  name: "${PINOT_CLUSTER_NAME}"

serviceAccount:
  annotations:
    eks.amazonaws.com/role-arn: "${PINOT_IRSA_ROLE_ARN}"
  name: "pinot"

probes:
  initialDelaySeconds: 300
  periodSeconds: 30

pinotAuth:
  enabled: true
  controllerFactoryClass: org.apache.pinot.controller.api.access.ZkBasicAuthAccessControlFactory
  brokerFactoryClass: org.apache.pinot.broker.broker.ZkBasicAuthAccessControlFactory
  configs:
    - access.control.principals=admin
    - access.control.principals.admin.password=${PINOT_ADMIN_PASSWORD}
    - access.control.init.username=admin
    - access.control.init.password=${PINOT_ADMIN_PASSWORD}

# ------------------------------------------------------------------------------
# Pinot Controller:
# ------------------------------------------------------------------------------
controller:
  replicaCount: 3
  probes:
    livenessEnabled: true
    readinessEnabled: true
  persistence:
    size: ${PINOT_CONTROLLER_VOL_SIZE}
    storageClass: ${PINOT_STORAGE_CLASS}
  data:
    dir: "${PINOT_SEGMENT_DIR}"
  podSecurityContext:
    fsGroupChangePolicy: Always
    runAsUser: 1000
    runAsGroup: 3000
    fsGroup: 3000
  securityContext:
    runAsNonRoot: true
    runAsUser: 1000
    runAsGroup: 3000
  jvmOpts: "-XX:+ExitOnOutOfMemoryError -Xms1G -Xmx14G -Djute.maxbuffer=100000000 -XX:+UseG1GC -XX:MaxGCPauseMillis=200 -Xlog:gc*:file=/opt/pinot/gc-pinot-controller.log -javaagent:/opt/pinot/etc/jmx_prometheus_javaagent/jmx_prometheus_javaagent.jar=8008:/opt/pinot/etc/jmx_prometheus_javaagent/configs/pinot.yml"
  service:
    annotations:
      "prometheus.io/scrape": "true"
      "prometheus.io/port": "8008"
  external:
    enabled: false
  resources:
    requests:
      cpu: 4
      memory: "14Gi"
  nodeSelector:
    workload-type: ${PINOT_WORKLOAD_TYPE}
  podAnnotations:
    "prometheus.io/scrape": "true"
    "prometheus.io/port": "8008"
  extraEnv:
    - name: LOG4J_CONSOLE_LEVEL
      value: error
  # Extra configs will be appended to pinot-controller.conf file
  extra:
    configs: |-
      pinot.set.instance.id.to.hostname=true
      controller.task.scheduler.enabled=true
      controller.task.frequencyPeriod=1h
      access.control.init.username=admin
      access.control.init.password=${PINOT_ADMIN_PASSWORD}
      controller.local.temp.dir=/tmp/pinot-tmp-data/
      controller.allow.hlc.tables=false
      controller.enable.split.commit=true
      controller.realtime.segment.deepStoreUploadRetryEnabled=true
      controller.segment.fetcher.auth.token=${PINOT_AUTH_TOKEN}
      pinot.controller.storage.factory.s3.disableAcl=false
      pinot.controller.storage.factory.class.s3=org.apache.pinot.plugin.filesystem.S3PinotFS
      pinot.controller.storage.factory.s3.region=${AWS_S3_REGION}
      pinot.controller.storage.factory.s3.httpclient.maxConnections=100
      pinot.controller.storage.factory.s3.httpclient.socketTimeout=30s
      pinot.controller.storage.factory.s3.httpclient.connectionTimeout=2s
      pinot.controller.storage.factory.s3.httpclient.connectionTimeToLive=0s
      pinot.controller.storage.factory.s3.httpclient.connectionAcquisitionTimeout=10s
      pinot.controller.segment.fetcher.protocols=file,http,s3
      pinot.controller.segment.fetcher.s3.class=org.apache.pinot.common.utils.fetcher.PinotFSSegmentFetcher
      pinot.multistage.engine.enabled=true
      pinot.server.instance.currentDataTableVersion=4
      pinot.query.server.port=8421
      pinot.query.runner.port=8442
      pinot.query.scheduler.accounting.factory.name=org.apache.pinot.core.accounting.PerQueryCPUMemAccountantFactory
      pinot.query.scheduler.accounting.enable.thread.memory.sampling=true
      pinot.query.scheduler.accounting.enable.thread.cpu.sampling=true
      pinot.query.scheduler.accounting.oom.enable.killing.query=true
      pinot.query.scheduler.accounting.publishing.jvm.heap.usage=true

# ------------------------------------------------------------------------------
# Pinot Broker:
# ------------------------------------------------------------------------------
broker:
  replicaCount: 3
  jvmOpts: "-XX:+ExitOnOutOfMemoryError -Xms1G -Xmx14G -Djute.maxbuffer=100000000 -XX:+UseG1GC -XX:MaxGCPauseMillis=200 -Xlog:gc*:file=/opt/pinot/gc-pinot-controller.log -javaagent:/opt/pinot/etc/jmx_prometheus_javaagent/jmx_prometheus_javaagent.jar=8008:/opt/pinot/etc/jmx_prometheus_javaagent/configs/pinot.yml"
  podSecurityContext:
    fsGroupChangePolicy: Always
    runAsUser: 1000
    runAsGroup: 3000
    fsGroup: 3000
  securityContext:
    runAsNonRoot: true
    runAsUser: 1000
    runAsGroup: 3000
  service:
    annotations:
      "prometheus.io/scrape": "true"
      "prometheus.io/port": "8008"
  external:
    enabled: false
  ingress:
    v1:
      enabled: true
      ingressClassName: ""
      annotations:
        kubernetes.io/ingress.class: alb
        alb.ingress.kubernetes.io/target-type: ip
        alb.ingress.kubernetes.io/target-group-attributes: stickiness.enabled=true,stickiness.lb_cookie.duration_seconds=60
        alb.ingress.kubernetes.io/certificate-arn: "${PINOT_BROKER_ALB_ACM_CERTIFICATE_ARN}"
        alb.ingress.kubernetes.io/listen-ports: '[{"HTTPS":443}]'
        alb.ingress.kubernetes.io/load-balancer-attributes: access_logs.s3.enabled=false
        alb.ingress.kubernetes.io/scheme: internet-facing
        alb.ingress.kubernetes.io/security-groups: ${PINOT_BROKER_ALB_SECURITY_GROUP}
        alb.ingress.kubernetes.io/ssl-policy: ELBSecurityPolicy-TLS-1-2-Ext-2018-06
        alb.ingress.kubernetes.io/tags: ${PINOT_BROKER_ALB_TAGS}
      tls: []
      path: /
      hosts:
        - ${PINOT_BROKER_ALB_HOST}
  resources:
    requests:
      cpu: 4
      memory: "14Gi"
  nodeSelector:
    workload-type: ${PINOT_WORKLOAD_TYPE}
  podAnnotations:
    "prometheus.io/scrape": "true"
    "prometheus.io/port": "8008"
  extraEnv:
    - name: LOG4J_CONSOLE_LEVEL
      value: debug
  extra:
    configs: |-
      pinot.set.instance.id.to.hostname=true
      pinot.multistage.engine.enabled=true
      pinot.server.instance.currentDataTableVersion=4
      pinot.query.server.port=8421
      pinot.query.runner.port=8442
      pinot.broker.enable.query.cancellation=true
      pinot.query.scheduler.accounting.factory.name=org.apache.pinot.core.accounting.PerQueryCPUMemAccountantFactory
      pinot.query.scheduler.accounting.enable.thread.memory.sampling=true
      pinot.query.scheduler.accounting.enable.thread.cpu.sampling=true
      pinot.query.scheduler.accounting.oom.enable.killing.query=true
      pinot.query.scheduler.accounting.publishing.jvm.heap.usage=true

# ------------------------------------------------------------------------------
# Pinot Server:
# ------------------------------------------------------------------------------
server:
  replicaCount: 3
  probes:
    livenessEnabled: true
    readinessEnabled: true
  podSecurityContext:
    fsGroupChangePolicy: Always
    runAsUser: 1000
    runAsGroup: 3000
    fsGroup: 3000
  securityContext:
    runAsNonRoot: true
    runAsUser: 1000
    runAsGroup: 3000
  persistence:
    size: ${PINOT_SERVER_VOL_SIZE}
    storageClass: ${PINOT_STORAGE_CLASS}
  jvmOpts: "-XX:+ExitOnOutOfMemoryError -Xms1G -Xmx6G -Djute.maxbuffer=100000000 -XX:+UseG1GC -XX:MaxGCPauseMillis=200 -Xlog:gc*:file=/opt/pinot/gc-pinot-controller.log -javaagent:/opt/pinot/etc/jmx_prometheus_javaagent/jmx_prometheus_javaagent.jar=8008:/opt/pinot/etc/jmx_prometheus_javaagent/configs/pinot.yml"
  resources:
    requests:
      cpu: 4
      memory: "14Gi"
  nodeSelector:
    workload-type: ${PINOT_WORKLOAD_TYPE}
  podAnnotations:
    "prometheus.io/scrape": "true"
    "prometheus.io/port": "8008"
  extraEnv:
    - name: LOG4J_CONSOLE_LEVEL
      value: error
  extra:
    configs: |-
      pinot.set.instance.id.to.hostname=true
      pinot.server.instance.realtime.alloc.offheap=true
      pinot.server.instance.enable.split.commit=true
      realtime.segment.serverUploadToDeepStore=true
      pinot.server.instance.segment.store.uri=${PINOT_SEGMENT_DIR}
      pinot.server.storage.factory.s3.disableAcl=false
      pinot.server.storage.factory.class.s3=org.apache.pinot.plugin.filesystem.S3PinotFS
      pinot.server.storage.factory.s3.region=${AWS_S3_REGION}
      pinot.server.segment.fetcher.protocols=file,http,s3
      pinot.server.storage.factory.s3.httpclient.maxConnections=1000
      pinot.server.storage.factory.s3.httpclient.socketTimeout=30s
      pinot.server.storage.factory.s3.httpclient.connectionTimeout=2s
      pinot.server.storage.factory.s3.httpclient.connectionTimeToLive=0s
      pinot.server.storage.factory.s3.httpclient.connectionAcquisitionTimeout=10s
      pinot.server.segment.fetcher.s3.class=org.apache.pinot.common.utils.fetcher.PinotFSSegmentFetcher
      pinot.server.segment.fetcher.auth.token=${PINOT_AUTH_TOKEN}
      pinot.server.segment.uploader.auth.token=${PINOT_AUTH_TOKEN}
      pinot.server.instance.auth.token=${PINOT_AUTH_TOKEN}
      pinot.multistage.engine.enabled=true
      pinot.server.instance.currentDataTableVersion=4
      pinot.query.server.port=8421
      pinot.query.runner.port=8442
      pinot.server.enable.query.cancellation=true
      pinot.query.scheduler.accounting.factory.name=org.apache.pinot.core.accounting.PerQueryCPUMemAccountantFactory
      pinot.query.scheduler.accounting.enable.thread.memory.sampling=true
      pinot.query.scheduler.accounting.enable.thread.cpu.sampling=true
      pinot.query.scheduler.accounting.oom.enable.killing.query=true
      pinot.query.scheduler.accounting.publishing.jvm.heap.usage=true

# ------------------------------------------------------------------------------
# Pinot Minion:
# ------------------------------------------------------------------------------
minionStateless:
  enabled: false

minion:
  enabled: true
  replicaCount: 3
  dataDir: "${PINOT_MINION_DATA_DIR}"
  jvmOpts: "-XX:+ExitOnOutOfMemoryError -Xms1G -Xmx8G -Djute.maxbuffer=100000000 -XX:+UseG1GC -XX:MaxGCPauseMillis=200 -Xlog:gc*:file=/opt/pinot/gc-pinot-controller.log"
  podSecurityContext:
    fsGroupChangePolicy: Always
    runAsUser: 1000
    runAsGroup: 3000
    fsGroup: 3000
  securityContext:
    runAsNonRoot: true
    runAsUser: 1000
    runAsGroup: 3000
  persistence:
    enabled: true
    accessMode: ReadWriteOnce
    size: ${PINOT_MINION_VOL_SIZE}
    storageClass: ${PINOT_STORAGE_CLASS}
  resources:
    requests:
      cpu: 4
      memory: "14Gi"
  nodeSelector:
    workload-type: ${PINOT_WORKLOAD_TYPE}
  podAnnotations:
    "prometheus.io/scrape": "true"
    "prometheus.io/port": "8008"
  extraEnv:
    - name: LOG4J_CONSOLE_LEVEL
      value: error
  extra:
    configs: |-
      pinot.set.instance.id.to.hostname=true
      pinot.minion.storage.factory.class.s3=org.apache.pinot.plugin.filesystem.S3PinotFS
      pinot.minion.storage.factory.s3.region=${AWS_S3_REGION}
      pinot.minion.segment.fetcher.protocols=file,http,s3
      pinot.minion.segment.fetcher.s3.class=org.apache.pinot.common.utils.fetcher.PinotFSSegmentFetcher
      pinot.minion.storage.factory.s3.httpclient.maxConnections=1000
      pinot.minion.storage.factory.s3.httpclient.socketTimeout=30s
      pinot.minion.storage.factory.s3.httpclient.connectionTimeout=2s
      pinot.minion.storage.factory.s3.httpclient.connectionTimeToLive=0s
      pinot.minion.storage.factory.s3.httpclient.connectionAcquisitionTimeout=10s
      segment.fetcher.auth.token=${PINOT_AUTH_TOKEN}
      task.auth.token=${PINOT_AUTH_TOKEN}
      pinot.multistage.engine.enabled=true
      pinot.server.instance.currentDataTableVersion=4
      pinot.query.server.port=8421
      pinot.query.runner.port=8442
      pinot.query.scheduler.accounting.factory.name=org.apache.pinot.core.accounting.PerQueryCPUMemAccountantFactory
      pinot.query.scheduler.accounting.enable.thread.memory.sampling=true
      pinot.query.scheduler.accounting.enable.thread.cpu.sampling=true
      pinot.query.scheduler.accounting.oom.enable.killing.query=true
      pinot.query.scheduler.accounting.publishing.jvm.heap.usage=true

zookeeper:
  enabled: true
  urlOverride: "my-zookeeper:2181/my-pinot"
  port: 2181
  podSecurityContext:
    runAsNonRoot: true
    runAsUser: 1000
    fsGroup: 1001
  containerSecurityContext:
    runAsNonRoot: true
    runAsUser: 1000
  env:
    # https://github.com/mrbobbytables/zookeeper/blob/master/README.md
    ZOO_HEAP_SIZE: "10G"
    ZOOKEEPER_LOG_STDOUT_THRESHOLD: "ERROR"
    JAVA_OPTS: "-XX:+ExitOnOutOfMemoryError -Xms4G -Xmx10G -Djute.maxbuffer=100000000 -XX:+UseG1GC -XX:MaxGCPauseMillis=200 -Xlog:gc*:file=/opt/bitnami/zookeeper/logs/gc-pinot-zookeeper.log"
  resources:
    requests:
      cpu: 4
      memory: "14Gi"
  replicaCount: 3
  persistence:
    enabled: true
    size: ${PINOT_ZOOKEEPER_VOL_SIZE}
    storageClass: ${PINOT_STORAGE_CLASS}
  image:
    PullPolicy: "IfNotPresent"
  nodeSelector:
    workload-type: ${PINOT_WORKLOAD_TYPE}

# References
# https://docs.pinot.apache.org/operators/operating-pinot/oom-protection-using-automatic-query-killing
# https://docs.pinot.apache.org/operators/tutorials/deployment-pinot-on-kubernetes
# https://startree.ai/blog/capacity-planning-in-apache-pinot-part-1
# https://startree.ai/blog/capacity-planning-in-apache-pinot-part-2
```
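For completeness, the cluster is installed as a plain Helm release with this values file, roughly as sketched below (repo URL as given in the Pinot Kubernetes docs; the release name and namespace are ours):

```bash
# Add the official Pinot Helm repo (alias "pinot" is our choice) and refresh it.
helm repo add pinot https://raw.githubusercontent.com/apache/pinot/master/helm
helm repo update

# Install/upgrade the cluster into the namespace the components above run in.
helm upgrade --install pinot pinot/pinot \
  --namespace pinot-dev \
  --create-namespace \
  -f values.yaml
```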
We query Pinot in three ways:

1. Via Superset, with the URL `pinot://<username>:<password>@<pinot-broker-host>:<pinot-broker-port>/query/sql?controller=http://<pinot-controller-host>:<pinot-controller-port>/verify_ssl=true`
2. Via the Pinot broker ingress load balancer (deployed with the Pinot Helm chart), for example as shown in the sketch below
3. Via the Pinot admin UI, accessible by port-forwarding the pinot-controller service object
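For reference, option 2 boils down to a plain HTTP request against the broker's standard `/query/sql` endpoint; a minimal sketch (the hostname and credentials below are placeholders, in practice the host is our ALB):

```bash
# Placeholder broker host behind the ALB ingress; auth is the ZkBasicAuth admin user.
BROKER_URL="https://pinot-broker.example.com"

# Standard Pinot broker query API: POST /query/sql with a JSON body containing the SQL.
curl -sS -u "admin:${PINOT_ADMIN_PASSWORD}" \
  -H "Content-Type: application/json" \
  -X POST "${BROKER_URL}/query/sql" \
  -d '{"sql": "SELECT COUNT(*) FROM pinot_metadata_feeds"}'
```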
Our standard offline tables have the following config:

```json
{
  "tableName": "pinot_metadata_feeds",
  "tableType": "OFFLINE",
  "segmentsConfig": {
    "retentionTimeUnit": "DAYS",
    "retentionTimeValue": "365",
    "schemaName": "pinot_metadata_feeds",
    "replication": "3",
    "replicasPerPartition": "3",
    "segmentPushType": "APPEND",
    "segmentAssignmentStrategy": "BalanceNumSegmentAssignmentStrategy"
  },
  "ingestionConfig": {},
  "task": {
    "taskTypeConfigsMap": {
      "MergeRollupTask": {
        "1day.mergeType": "concat",
        "1day.bucketTimePeriod": "1d",
        "1day.bufferTimePeriod": "1d"
      }
    }
  },
  "tenants": {},
  "tableIndexConfig": {
    "loadMode": "MMAP",
    "nullHandlingEnabled": "true"
  },
  "metadata": {
    "customConfigs": {}
  }
}
```

# Problem

When a broker or server pod gets restarted during a cluster update, or when our ops team makes changes to the Kubernetes cluster, some of our queries fail.

With multistage disabled: queries seem to be routed to servers in round-robin fashion. If we retry the same query 5 times, it fails 1-2 times when it reaches the server or broker pod that is restarting, and succeeds the other 3-4 times when it reaches the healthy broker/server pods.

With multistage enabled: queries almost always fail while one of the broker or server pods is restarting; the query appears to fan out to all servers. Disabling multistage is not an option for us, since we use joins in some queries.
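To illustrate the failure pattern, this is roughly the retry loop we use when testing (broker URL and credentials are placeholders; the broker reports query failures via the `exceptions` array in the response body, as in the responses pasted further down, though some failures can also surface as non-200 status codes):

```bash
# Not a fix, just a sketch of how we measure the failure rate while a pod restarts.
BROKER_URL="https://pinot-broker.example.com"   # placeholder

for attempt in 1 2 3 4 5; do
  body=$(curl -s -u "admin:${PINOT_ADMIN_PASSWORD}" \
    -H "Content-Type: application/json" \
    -X POST "${BROKER_URL}/query/sql" \
    -d '{"sql": "SELECT COUNT(*) FROM pinot_metadata_feeds"}')
  # An empty "exceptions" array means the query succeeded.
  if echo "${body}" | grep -q '"exceptions": *\[\]'; then
    echo "attempt ${attempt}: ok"
  else
    echo "attempt ${attempt}: failed -> ${body}"
  fi
done
```

With the single-stage engine, one or two retries usually land the query on a healthy server; with the multi-stage engine, almost every attempt fails for as long as one server pod is down.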
The error log we get when, for example, server-2 is restarting:

```
Error dispatching query to server=pinot-server-2.pinot-server-headless.pinot-dev.svc.cluster.local@{8421,8442} stage=1
org.apache.pinot.query.service.dispatch.QueryDispatcher.submit(QueryDispatcher.java:144)
org.apache.pinot.query.service.dispatch.QueryDispatcher.submitAndReduce(QueryDispatcher.java:93)
org.apache.pinot.broker.requesthandler.MultiStageBrokerRequestHandler.handleRequest(MultiStageBrokerRequestHandler.java:179)
org.apache.pinot.broker.requesthandler.BaseBrokerRequestHandler.handleRequest(BaseBrokerRequestHandler.java:263)
UNAVAILABLE: Unable to resolve host pinot-server-2.pinot-server-headless.pinot-dev.svc.cluster.local
io.grpc.Status.asRuntimeException(Status.java:539)
io.grpc.stub.ClientCalls$StreamObserverToCallListenerAdapter.onClose(ClientCalls.java:487)
io.grpc.internal.ClientCallImpl.closeObserver(ClientCallImpl.java:576)
io.grpc.internal.ClientCallImpl.access$300(ClientCallImpl.java:70)
java.net.UnknownHostException: pinot-server-2.pinot-server-headless.pinot-dev.svc.cluster.local: Name or service not known
io.grpc.internal.DnsNameResolver.resolveAddresses(DnsNameResolver.java:223)
io.grpc.internal.DnsNameResolver.doResolve(DnsNameResolver.java:282)
io.grpc.internal.DnsNameResolver$Resolve.run(DnsNameResolver.java:318)
java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
pinot-server-2.pinot-server-headless.pinot-dev.svc.cluster.local: Name or service not known
java.base/java.net.Inet6AddressImpl.lookupAllHostAddr(Native Method)
java.base/java.net.InetAddress$PlatformNameService.lookupAllHostAddr(InetAddress.java:930)
java.base/java.net.InetAddress.getAddressesFromNameService(InetAddress.java:1543)
java.base/java.net.InetAddress$NameServiceAddresses.get(InetAddress.java:848)
```

Full query response in the Pinot UI for a query that succeeded with multistage enabled (here you can see the query being routed to multiple servers):

`SELECT COUNT(*) FROM pinot_metadata_feeds;`

```json
{
  "resultTable": {
    "dataSchema": {
      "columnNames": ["EXPR$0"],
      "columnDataTypes": ["LONG"]
    },
    "rows": [[1206170]]
  },
  "requestId": "92319303000002827",
  "stageStats": {
    "1": {
      "numBlocks": 6,
      "numRows": 3,
      "stageExecutionTimeMs": 266,
      "stageExecutionUnit": 3,
      "stageExecWallTimeMs": 89,
      "operatorStats": {
        "aggregateoperator_...@pinot-server-2.pinot-server-headless.pinot-dev.svc.cluster.local:8442": {
          "numBlocks": "2",
          "numRows": "1",
          "operatorExecutionTimeMs": "88",
          "operatorExecStartTimeMs": "1712833326722",
          "operatorId": "aggregateoperator_...@pinot-server-2.pinot-server-headless.pinot-dev.svc.cluster.local:8442",
          "operatorExecEndTimeMs": "1712833326811"
        },
        "mailboxsendoperator_...@pinot-server-2.pinot-server-headless.pinot-dev.svc.cluster.local:8442": {
          "numBlocks": "1",
          "numRows": "1",
          "operatorExecutionTimeMs": "90",
          "operatorExecStartTimeMs": "1712833326722",
          "operatorId": "mailboxsendoperator_...@pinot-server-2.pinot-server-headless.pinot-dev.svc.cluster.local:8442",
          "operatorExecEndTimeMs": "1712833326811"
        },
        "mailboxreceiveoperator_...@pinot-server-2.pinot-server-headless.pinot-dev.svc.cluster.local:8442": {
          "numBlocks": "3",
          "numRows": "1",
          "operatorExecutionTimeMs": "88",
          "operatorExecStartTimeMs": "1712833326722",
          "operatorId": "mailboxreceiveoperator_...@pinot-server-2.pinot-server-headless.pinot-dev.svc.cluster.local:8442",
          "operatorExecEndTimeMs": "1712833326811"
        }
      }
    },
    "2": {
      "numBlocks": 3,
      "numRows": 2,
      "stageExecutionTimeMs": 142,
      "stageExecutionUnit": 2,
      "stageExecWallTimeMs": 72,
      "numSegmentsQueried": 14144,
      "numSegmentsProcessed": 14144,
      "numSegmentsMatched": 14144,
      "numDocsScanned": 1206170,
      "totalDocs": 1206170,
      "traceInfo": {
        "pinot_metadata_feeds": "[{\"0\":[{\"SegmentPrunerService Time\":8},{\"CombinePlanNode Time\":5},{\"AggregationCombineOperator Time\":35},{\"StreamingInstanceResponseOperator Time\":36}]},{\"0_0\":[]},{\"0_1\":[]},{\"0_2\":[{\"FastFilteredCountOperator Time\":0},{\"FastFilteredCountOperator Time\":0},{\"FastFilteredCountOperator Time\":0},{\"FastFilteredCountOperator Time\":0},...{\"FastFilteredCountOperator Time\":0},{\"FastFilteredCountOperator Time\":0},{\"FastFilteredCountOperator Time\":0},{\"FastFilteredCountOperator Time\":0}]}]",
        "operatorExecutionTimeMs": "71",
        "operatorExecStartTimeMs": "1712833326730",
        "numSegmentsPrunedByServer": "0",
        "numSegmentsProcessed": "14144",
        "operatorId": "leafstagetransferableblockoperator_...@pinot-server-0.pinot-server-headless.pinot-dev.svc.cluster.local:8442",
        "numEntriesScannedInFilter": "0",
        "table": "pinot_metadata_feeds"
      }
    },
    "tableNames": [
      "pinot_metadata_feeds"
    ]
  },
  "exceptions": [],
  "numServersQueried": 0,
  "numServersResponded": 0,
  "numSegmentsQueried": 14144,
  "numSegmentsProcessed": 14144,
  "numSegmentsMatched": 14144,
  "numConsumingSegmentsQueried": 0,
  "numConsumingSegmentsProcessed": 0,
  "numConsumingSegmentsMatched": 0,
  "numDocsScanned": 1206170,
  "numEntriesScannedInFilter": 0,
  "numEntriesScannedPostFilter": 0,
  "numGroupsLimitReached": false,
  "totalDocs": 1206170,
  "timeUsedMs": 172,
  "offlineThreadCpuTimeNs": 0,
  "realtimeThreadCpuTimeNs": 0,
  "offlineSystemActivitiesCpuTimeNs": 0,
  "realtimeSystemActivitiesCpuTimeNs": 0,
  "offlineResponseSerializationCpuTimeNs": 0,
  "realtimeResponseSerializationCpuTimeNs": 0,
  "offlineTotalCpuTimeNs": 0,
  "realtimeTotalCpuTimeNs": 0,
  "segmentStatistics": [],
  "traceInfo": {},
  "minConsumingFreshnessTimeMs": 0,
  "numSegmentsPrunedByBroker": 0,
  "numSegmentsPrunedByServer": 0,
  "numSegmentsPrunedInvalid": 0,
  "numSegmentsPrunedByLimit": 0,
  "numSegmentsPrunedByValue": 0,
  "explainPlanNumEmptyFilterSegments": 0,
  "explainPlanNumMatchAllFilterSegments": 0,
  "brokerId": "Broker_pinot-broker-0.pinot-broker-headless.pinot-dev.svc.cluster.local_8099",
  "brokerReduceTimeMs": 88,
  "numRowsResultSet": 1
}
```

Full query response in the Pinot UI for a query that succeeded with multistage disabled:

`SELECT COUNT(*) FROM pinot_metadata_feeds;`

```json
{
  "resultTable": {
    "dataSchema": {
      "columnNames": ["count(*)"],
      "columnDataTypes": ["LONG"]
    },
    "rows": [[1206170]]
  },
  "requestId": "132994053000000022",
  "brokerId": "Broker_pinot-broker-2.pinot-broker-headless.pinot-dev.svc.cluster.local_8099",
  "exceptions": [],
  "numServersQueried": 1,
  "numServersResponded": 1,
  "numSegmentsQueried": 14144,
  "numSegmentsProcessed": 14144,
  "numSegmentsMatched": 14144,
  "numConsumingSegmentsQueried": 0,
  "numConsumingSegmentsProcessed": 0,
  "numConsumingSegmentsMatched": 0,
  "numDocsScanned": 1206170,
  "numEntriesScannedInFilter": 0,
  "numEntriesScannedPostFilter": 0,
  "numGroupsLimitReached": false,
  "totalDocs": 1206170,
  "timeUsedMs": 189,
  "offlineThreadCpuTimeNs": 0,
  "realtimeThreadCpuTimeNs": 0,
  "offlineSystemActivitiesCpuTimeNs": 0,
  "realtimeSystemActivitiesCpuTimeNs": 0,
  "offlineResponseSerializationCpuTimeNs": 0,
  "realtimeResponseSerializationCpuTimeNs": 0,
  "offlineTotalCpuTimeNs": 0,
  "realtimeTotalCpuTimeNs": 0,
  "brokerReduceTimeMs": 0,
  "segmentStatistics": [],
  "traceInfo": {
    "pinot-server-2_O": "[{\"0\":[{\"SegmentPrunerService Time\":6},{\"CombinePlanNode Time\":3},{\"AggregationCombineOperator Time\":49},{\"InstanceResponseOperator Time\":49}]},{\"0_0\":[]},{\"0_1\":[]},{\"0_2\":[{\"FastFilteredCountOperator Time\":0},{\"FastFilteredCountOperator Time\":0},{\"FastFilteredCountOperator Time\":0},{\"FastFilteredCountOperator Time\":0},...{\"FastFilteredCountOperator Time\":0},{\"FastFilteredCountOperator Time\":0},{\"FastFilteredCountOperator Time\":0}]}]"
  },
  "minConsumingFreshnessTimeMs": 0,
  "numSegmentsPrunedByBroker": 0,
  "numSegmentsPrunedByServer": 0,
  "numSegmentsPrunedInvalid": 0,
  "numSegmentsPrunedByLimit": 0,
  "numSegmentsPrunedByValue": 0,
  "explainPlanNumEmptyFilterSegments": 0,
  "explainPlanNumMatchAllFilterSegments": 0,
  "numRowsResultSet": 1
}
```

# Expectation

Since we have 3 replicas of every segment and 3 replicas of every component, Pinot should only route queries to healthy broker/server pods, and a query must not fail when 1 out of 3 server/broker pods is unavailable.
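The failure is easy to reproduce on our side; roughly (names match our namespace and pods, and the query loop is the one sketched above):

```bash
# Simulate what a rolling update or node maintenance does to a single server pod.
kubectl -n pinot-dev delete pod pinot-server-2

# While the pod is coming back (and its headless-service DNS entry is unresolvable),
# re-run the COUNT(*) query against the broker a few times. With the multi-stage
# engine enabled, the attempts fail with the UnknownHostException shown above; with
# it disabled, only the attempts routed to the restarting instance fail.
```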
# Solutions tried so far

1. https://docs.pinot.apache.org/operators/operating-pinot/tuning/query-routing-using-adaptive-server-selection — added the following to broker-conf:

```
pinot.broker.adaptive.server.selector.enable.stats.collection = true
pinot.broker.adaptive.server.selector.type=HYBRID
```

2. Enabled replicaGroup using default tags:

```json
{
  "tableName": "pinot_metadata_feeds",
  "tableType": "OFFLINE",
  "quota": {
    "maxQueriesPerSecond": 300,
    "storage": "140G"
  },
  "routing": {
    "segmentPrunerTypes": ["partition"],
    "instanceSelectorType": "replicaGroup"
  },
  "segmentsConfig": {
    "retentionTimeUnit": "DAYS",
    "retentionTimeValue": "3650",
    "schemaName": "pinot_metadata_feeds",
    "replication": "3",
    "replicasPerPartition": "1",
    "segmentPushType": "APPEND",
    "segmentAssignmentStrategy": "BalanceNumSegmentAssignmentStrategy"
  },
  "ingestionConfig": {},
  "task": {
    "taskTypeConfigsMap": {
      "MergeRollupTask": {
        "1day.mergeType": "concat",
        "1day.bucketTimePeriod": "1d",
        "1day.bufferTimePeriod": "1d"
      }
    }
  },
  "tenants": {},
  "tableIndexConfig": {
    "loadMode": "MMAP",
    "nullHandlingEnabled": "true"
  },
  "instanceAssignmentConfigMap": {
    "OFFLINE": {
      "tagPoolConfig": {
        "tag": "DefaultTenant_OFFLINE"
      },
      "replicaGroupPartitionConfig": {
        "replicaGroupBased": true,
        "numInstances": 3,
        "numReplicaGroups": 3,
        "numInstancesPerReplicaGroup": 1
      }
    }
  },
  "metadata": {
    "customConfigs": {}
  }
}
```
3. Upgrade from 1.0.0 to 1.1.0: will test this evening.

As of now, Pinot is not highly available for us despite following all the best practices around replication. This worries us, since we could face downtime any day if Kubernetes restarts a single pod, which is not uncommon. This issue is quite important for us, as our base assumption was that Pinot is highly available. Any help is much appreciated. Thanks!

FYI: this issue was first discussed on Slack [here](https://apache-pinot.slack.com/archives/C011C9JHN7R/p1712665057902059).