This is an automated email from the ASF dual-hosted git repository.

dongjoon pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/spark-kubernetes-operator.git


The following commit(s) were added to refs/heads/main by this push:
     new 4ccfcd6  [SPARK-55804] Expose informer cache sync timeout as operator 
config
4ccfcd6 is described below

commit 4ccfcd600cfed0e237cb36971172304cc3e98d40
Author: Zhou JIANG <[email protected]>
AuthorDate: Wed Mar 4 19:25:00 2026 -0800

    [SPARK-55804] Expose informer cache sync timeout as operator config
    
    ### What changes were proposed in this pull request?
    
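    Add `spark.kubernetes.operator.informer.cacheSyncTimeoutSeconds` (default: 
30s) as a new `SparkOperatorConf` config option and wire it into the JOSDK 
operator configuration via `withCacheSyncTimeout` in `SparkOperator`. Also move 
the existing `terminateOnInformerFailureEnabled` key under the same 
`spark.kubernetes.operator.informer.` prefix. Update 
`docs/config_properties.md` accordingly.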
    
    ### Why are the changes needed?
    
    The JOSDK informer cache sync timeout was previously hardcoded to the 
framework default with no way for operators to tune it. In environments with 
large clusters or slow API servers, the default may be too short.
    
    ### Does this PR introduce _any_ user-facing change?
    
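    Yes. A new configuration property is introduced, and the existing 
`spark.kubernetes.operator.terminateOnInformerFailureEnabled` key is renamed to 
`spark.kubernetes.operator.informer.terminateOnInformerFailureEnabled`.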
    
    ### How was this patch tested?
    
    Pass the CIs.
    
    ### Was this patch authored or co-authored using generative AI tooling?
    
    No
    
    Closes #532 from jiangzho/cachesync.
    
    Authored-by: Zhou JIANG <[email protected]>
    Signed-off-by: Dongjoon Hyun <[email protected]>
---
 docs/config_properties.md                                   |  3 ++-
 .../java/org/apache/spark/k8s/operator/SparkOperator.java   |  2 ++
 .../apache/spark/k8s/operator/config/SparkOperatorConf.java | 13 ++++++++++++-
 3 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/docs/config_properties.md b/docs/config_properties.md
index 6cbd373..adac1a4 100644
--- a/docs/config_properties.md
+++ b/docs/config_properties.md
@@ -12,6 +12,8 @@
  | spark.kubernetes.operator.health.probePort | Integer | 19091 | false | The 
port used for health/readiness check probe status. | 
  | spark.kubernetes.operator.health.sentinelExecutorPoolSize | Integer | 3 | 
false | Size of executor service in Sentinel Managers to check the health of 
sentinel resources. | 
  | spark.kubernetes.operator.health.sentinelResourceReconciliationDelaySeconds 
| Integer | 60 | true | Allowed max time(seconds) between spec update and 
reconciliation for sentinel resources. | 
+ | spark.kubernetes.operator.informer.cacheSyncTimeoutSeconds | Integer | 30 | 
false | Timeout threshold for operator to sync informer cache in seconds. | 
+ | spark.kubernetes.operator.informer.terminateOnInformerFailureEnabled | 
Boolean | false | false | Enable to indicate informer errors should stop 
operator startup. If disabled, operator startup will ignore recoverable errors, 
caused for example by RBAC issues and will retry periodically. | 
  | spark.kubernetes.operator.leaderElection.enabled | Boolean | false | false 
| Enable leader election for the operator to allow running standby instances. 
When this is disabled, only one operator instance is expected to be up and 
running at any time (replica = 1) to avoid race condition. | 
  | spark.kubernetes.operator.leaderElection.leaseDurationSeconds | Integer | 
180 | false | Leader election lease duration in seconds, non-negative. | 
  | spark.kubernetes.operator.leaderElection.leaseName | String | 
spark-operator-lease | false | Leader election lease name, must be unique for 
leases in the same namespace. | 
@@ -39,7 +41,6 @@
  | spark.kubernetes.operator.reconciler.retry.maxIntervalSeconds | Integer | 
-1 | false | Max interval(in seconds) of retries on unhandled controller 
errors. Set to non-positive for unlimited. | 
  | spark.kubernetes.operator.reconciler.terminationTimeoutSeconds | Integer | 
30 | false | Grace period for operator shutdown before reconciliation threads 
are killed. | 
  | spark.kubernetes.operator.reconciler.trimStateTransitionHistoryEnabled | 
Boolean | true | true | When enabled, operator would trim state transition 
history when a new attempt starts, keeping previous attempt summary only. | 
- | spark.kubernetes.operator.terminateOnInformerFailureEnabled | Boolean | 
false | false | Enable to indicate informer errors should stop operator 
startup. If disabled, operator startup will ignore recoverable errors, caused 
for example by RBAC issues and will retry periodically. | 
  | spark.kubernetes.operator.watchedNamespaces | String | default | true | 
Comma-separated list of namespaces that the operator would be watching for 
Spark resources. If set to '*', operator would watch all namespaces. | 
  | spark.logConf | Boolean | false | true | When enabled, operator will print 
configurations | 
 
diff --git 
a/spark-operator/src/main/java/org/apache/spark/k8s/operator/SparkOperator.java 
b/spark-operator/src/main/java/org/apache/spark/k8s/operator/SparkOperator.java
index b01685f..dee821e 100644
--- 
a/spark-operator/src/main/java/org/apache/spark/k8s/operator/SparkOperator.java
+++ 
b/spark-operator/src/main/java/org/apache/spark/k8s/operator/SparkOperator.java
@@ -202,6 +202,8 @@ public class SparkOperator {
         SparkOperatorConf.TERMINATE_ON_INFORMER_FAILURE_ENABLED.getValue());
     overrider.withReconciliationTerminationTimeout(
         
Duration.ofSeconds(SparkOperatorConf.RECONCILER_TERMINATION_TIMEOUT_SECONDS.getValue()));
+    overrider.withCacheSyncTimeout(
+        
Duration.ofSeconds(SparkOperatorConf.CACHE_SYNC_TIMEOUT_SECONDS.getValue()));
     int parallelism = SparkOperatorConf.RECONCILER_PARALLELISM.getValue();
     if (parallelism > 0) {
       log.info("Configuring operator with {} reconciliation threads.", 
parallelism);
diff --git 
a/spark-operator/src/main/java/org/apache/spark/k8s/operator/config/SparkOperatorConf.java
 
b/spark-operator/src/main/java/org/apache/spark/k8s/operator/config/SparkOperatorConf.java
index c72f355..9a3b9df 100644
--- 
a/spark-operator/src/main/java/org/apache/spark/k8s/operator/config/SparkOperatorConf.java
+++ 
b/spark-operator/src/main/java/org/apache/spark/k8s/operator/config/SparkOperatorConf.java
@@ -98,7 +98,7 @@ public final class SparkOperatorConf {
    */
   public static final ConfigOption<Boolean> 
TERMINATE_ON_INFORMER_FAILURE_ENABLED =
       ConfigOption.<Boolean>builder()
-          .key("spark.kubernetes.operator.terminateOnInformerFailureEnabled")
+          
.key("spark.kubernetes.operator.informer.terminateOnInformerFailureEnabled")
           .enableDynamicOverride(false)
           .description(
               "Enable to indicate informer errors should stop operator 
startup. If "
@@ -109,6 +109,17 @@ public final class SparkOperatorConf {
           .defaultValue(false)
           .build();
 
+  /** Timeout threshold for operator to sync informer cache in seconds. */
+  public static final ConfigOption<Integer> CACHE_SYNC_TIMEOUT_SECONDS =
+      ConfigOption.<Integer>builder()
+          .key("spark.kubernetes.operator.informer.cacheSyncTimeoutSeconds")
+          .enableDynamicOverride(false)
+          .description(
+              "Timeout threshold for operator to sync informer cache in 
seconds.")
+          .typeParameterClass(Integer.class)
+          .defaultValue(30)
+          .build();
+
   /** Grace period for operator shutdown before reconciliation threads are 
killed. */
   public static final ConfigOption<Integer> 
RECONCILER_TERMINATION_TIMEOUT_SECONDS =
       ConfigOption.<Integer>builder()


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to