(arrow-adbc) branch main updated: feat(csharp/src/Drivers/Databricks): Used connection param of batchSize for cloudFetch (#3518)

curth Tue, 07 Oct 2025 11:48:48 -0700

This is an automated email from the ASF dual-hosted git repository.

curth pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-adbc.git



The following commit(s) were added to refs/heads/main by this push:
     new 605bf7556 feat(csharp/src/Drivers/Databricks): Used connection param 
of batchSize for cloudFetch (#3518)
605bf7556 is described below

commit 605bf75569fb770514d01754b2b6a785035ce12d
Author: msrathore-db <[email protected]>
AuthorDate: Wed Oct 8 00:18:34 2025 +0530

    feat(csharp/src/Drivers/Databricks): Used connection param of batchSize for 
cloudFetch (#3518)
    
    Use the connection's batchSize parameter for CloudFetch instead of a
    hard-coded constant. Change the parameter's default value from 50k to 2 million.
    
    [PECO-2732](https://databricks.atlassian.net/browse/PECO-2732)
---
 csharp/src/Drivers/Apache/Hive2/HiveServer2Statement.cs  |  2 +-
 csharp/src/Drivers/Databricks/DatabricksStatement.cs     | 16 ++++++++++++++++
 .../Reader/CloudFetch/CloudFetchDownloadManager.cs       |  3 +--
 csharp/src/Drivers/Databricks/readme.md                  |  1 +
 4 files changed, 19 insertions(+), 3 deletions(-)

diff --git a/csharp/src/Drivers/Apache/Hive2/HiveServer2Statement.cs 
b/csharp/src/Drivers/Apache/Hive2/HiveServer2Statement.cs
index 10b8ec2d6..bbaee0c0b 100644
--- a/csharp/src/Drivers/Apache/Hive2/HiveServer2Statement.cs
+++ b/csharp/src/Drivers/Apache/Hive2/HiveServer2Statement.cs
@@ -350,7 +350,7 @@ namespace Apache.Arrow.Adbc.Drivers.Apache.Hive2
 
         protected internal int PollTimeMilliseconds { get; private set; } = 
HiveServer2Connection.PollTimeMillisecondsDefault;
 
-        public long BatchSize { get; private set; } = 
HiveServer2Connection.BatchSizeDefault;
+        public virtual long BatchSize { get; protected set; } = 
HiveServer2Connection.BatchSizeDefault;
 
         public int QueryTimeoutSeconds
         {
diff --git a/csharp/src/Drivers/Databricks/DatabricksStatement.cs 
b/csharp/src/Drivers/Databricks/DatabricksStatement.cs
index 3610da6fa..006c78ad2 100644
--- a/csharp/src/Drivers/Databricks/DatabricksStatement.cs
+++ b/csharp/src/Drivers/Databricks/DatabricksStatement.cs
@@ -36,6 +36,10 @@ namespace Apache.Arrow.Adbc.Drivers.Databricks
     /// </summary>
     internal class DatabricksStatement : SparkStatement, IHiveServer2Statement
     {
+        // Databricks CloudFetch supports much larger batch sizes than 
standard Arrow batches (1024MB vs 10MB limit).
+        // Using 2M rows significantly reduces round trips for medium/large 
result sets compared to the base 50K default,
+        // improving query performance by reducing the number of FetchResults 
calls needed.
+        private const long DatabricksBatchSizeDefault = 2000000;
         private bool useCloudFetch;
         private bool canDecompressLz4;
         private long maxBytesPerFile;
@@ -44,6 +48,8 @@ namespace Apache.Arrow.Adbc.Drivers.Databricks
         private bool enablePKFK;
         private bool runAsyncInThrift;
 
+        public override long BatchSize { get; protected set; } = 
DatabricksBatchSizeDefault;
+
         public DatabricksStatement(DatabricksConnection connection)
             : base(connection)
         {
@@ -178,6 +184,16 @@ namespace Apache.Arrow.Adbc.Drivers.Databricks
                         throw new ArgumentException($"Invalid value for {key}: 
{value}. Valid formats: number with optional unit suffix (B, KB, MB, GB). 
Examples: '400MB', '1024KB', '1073741824'.");
                     }
                     break;
+                case ApacheParameters.BatchSize:
+                    if (long.TryParse(value, out long batchSize) && batchSize 
> 0)
+                    {
+                        this.BatchSize = batchSize;
+                    }
+                    else
+                    {
+                        throw new ArgumentOutOfRangeException(key, value, 
$"The value '{value}' for option '{key}' is invalid. Must be a numeric value 
greater than zero.");
+                    }
+                    break;
                 default:
                     base.SetOption(key, value);
                     break;
diff --git 
a/csharp/src/Drivers/Databricks/Reader/CloudFetch/CloudFetchDownloadManager.cs 
b/csharp/src/Drivers/Databricks/Reader/CloudFetch/CloudFetchDownloadManager.cs
index a7a98648f..97b9fa968 100644
--- 
a/csharp/src/Drivers/Databricks/Reader/CloudFetch/CloudFetchDownloadManager.cs
+++ 
b/csharp/src/Drivers/Databricks/Reader/CloudFetch/CloudFetchDownloadManager.cs
@@ -35,7 +35,6 @@ namespace 
Apache.Arrow.Adbc.Drivers.Databricks.Reader.CloudFetch
         private const int DefaultPrefetchCount = 2;
         private const int DefaultMemoryBufferSizeMB = 200;
         private const bool DefaultPrefetchEnabled = true;
-        private const int DefaultFetchBatchSize = 2000000;
         private const int DefaultTimeoutMinutes = 5;
         private const int DefaultMaxUrlRefreshAttempts = 3;
         private const int DefaultUrlExpirationBufferSeconds = 60;
@@ -203,7 +202,7 @@ namespace 
Apache.Arrow.Adbc.Drivers.Databricks.Reader.CloudFetch
                 initialResults,
                 _memoryManager,
                 _downloadQueue,
-                DefaultFetchBatchSize,
+                _statement.BatchSize,
                 urlExpirationBufferSeconds);
 
             // Initialize the downloader
diff --git a/csharp/src/Drivers/Databricks/readme.md 
b/csharp/src/Drivers/Databricks/readme.md
index 9159a7521..caf02987b 100644
--- a/csharp/src/Drivers/Databricks/readme.md
+++ b/csharp/src/Drivers/Databricks/readme.md
@@ -128,6 +128,7 @@ CloudFetch is Databricks' high-performance result retrieval 
system that download
 | `adbc.databricks.use_desc_table_extended` | Whether to use DESC TABLE 
EXTENDED to get extended column metadata when supported by DBR | `true` |
 | `adbc.databricks.enable_run_async_thrift` | Whether to enable RunAsync flag 
in Thrift operations | `true` |
 | `adbc.databricks.driver_config_take_precedence` | Whether driver 
configuration overrides passed-in properties during configuration merging | 
`false` |
+| `adbc.apache.statement.batch_size` | Sets the maximum number of rows to 
retrieve in a single batch request | `2000000` |
 
 ### Tracing Properties

(arrow-adbc) branch main updated: feat(csharp/src/Drivers/Databricks): Used connection param of batchSize for cloudFetch (#3518)

Reply via email to