This is an automated email from the ASF dual-hosted git repository.
curth pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow-adbc.git
The following commit(s) were added to refs/heads/main by this push:
new 605bf7556 feat(csharp/src/Drivers/Databricks): Used connection param
of batchSize for cloudFetch (#3518)
605bf7556 is described below
commit 605bf75569fb770514d01754b2b6a785035ce12d
Author: msrathore-db <[email protected]>
AuthorDate: Wed Oct 8 00:18:34 2025 +0530
feat(csharp/src/Drivers/Databricks): Used connection param of batchSize for
cloudFetch (#3518)
Use the batch size connection parameter for CloudFetch instead of a hard-coded
value. Change the parameter's default value from 50k to 2 million rows.
[PECO-2732](https://databricks.atlassian.net/browse/PECO-2732)
---
csharp/src/Drivers/Apache/Hive2/HiveServer2Statement.cs | 2 +-
csharp/src/Drivers/Databricks/DatabricksStatement.cs | 16 ++++++++++++++++
.../Reader/CloudFetch/CloudFetchDownloadManager.cs | 3 +--
csharp/src/Drivers/Databricks/readme.md | 1 +
4 files changed, 19 insertions(+), 3 deletions(-)
diff --git a/csharp/src/Drivers/Apache/Hive2/HiveServer2Statement.cs
b/csharp/src/Drivers/Apache/Hive2/HiveServer2Statement.cs
index 10b8ec2d6..bbaee0c0b 100644
--- a/csharp/src/Drivers/Apache/Hive2/HiveServer2Statement.cs
+++ b/csharp/src/Drivers/Apache/Hive2/HiveServer2Statement.cs
@@ -350,7 +350,7 @@ namespace Apache.Arrow.Adbc.Drivers.Apache.Hive2
protected internal int PollTimeMilliseconds { get; private set; } =
HiveServer2Connection.PollTimeMillisecondsDefault;
- public long BatchSize { get; private set; } =
HiveServer2Connection.BatchSizeDefault;
+ public virtual long BatchSize { get; protected set; } =
HiveServer2Connection.BatchSizeDefault;
public int QueryTimeoutSeconds
{
diff --git a/csharp/src/Drivers/Databricks/DatabricksStatement.cs
b/csharp/src/Drivers/Databricks/DatabricksStatement.cs
index 3610da6fa..006c78ad2 100644
--- a/csharp/src/Drivers/Databricks/DatabricksStatement.cs
+++ b/csharp/src/Drivers/Databricks/DatabricksStatement.cs
@@ -36,6 +36,10 @@ namespace Apache.Arrow.Adbc.Drivers.Databricks
/// </summary>
internal class DatabricksStatement : SparkStatement, IHiveServer2Statement
{
+ // Databricks CloudFetch supports much larger batch sizes than
standard Arrow batches (1024MB vs 10MB limit).
+ // Using 2M rows significantly reduces round trips for medium/large
result sets compared to the base 50K default,
+ // improving query performance by reducing the number of FetchResults
calls needed.
+ private const long DatabricksBatchSizeDefault = 2000000;
private bool useCloudFetch;
private bool canDecompressLz4;
private long maxBytesPerFile;
@@ -44,6 +48,8 @@ namespace Apache.Arrow.Adbc.Drivers.Databricks
private bool enablePKFK;
private bool runAsyncInThrift;
+ public override long BatchSize { get; protected set; } =
DatabricksBatchSizeDefault;
+
public DatabricksStatement(DatabricksConnection connection)
: base(connection)
{
@@ -178,6 +184,16 @@ namespace Apache.Arrow.Adbc.Drivers.Databricks
throw new ArgumentException($"Invalid value for {key}:
{value}. Valid formats: number with optional unit suffix (B, KB, MB, GB).
Examples: '400MB', '1024KB', '1073741824'.");
}
break;
+ case ApacheParameters.BatchSize:
+ if (long.TryParse(value, out long batchSize) && batchSize
> 0)
+ {
+ this.BatchSize = batchSize;
+ }
+ else
+ {
+ throw new ArgumentOutOfRangeException(key, value,
$"The value '{value}' for option '{key}' is invalid. Must be a numeric value
greater than zero.");
+ }
+ break;
default:
base.SetOption(key, value);
break;
diff --git
a/csharp/src/Drivers/Databricks/Reader/CloudFetch/CloudFetchDownloadManager.cs
b/csharp/src/Drivers/Databricks/Reader/CloudFetch/CloudFetchDownloadManager.cs
index a7a98648f..97b9fa968 100644
---
a/csharp/src/Drivers/Databricks/Reader/CloudFetch/CloudFetchDownloadManager.cs
+++
b/csharp/src/Drivers/Databricks/Reader/CloudFetch/CloudFetchDownloadManager.cs
@@ -35,7 +35,6 @@ namespace
Apache.Arrow.Adbc.Drivers.Databricks.Reader.CloudFetch
private const int DefaultPrefetchCount = 2;
private const int DefaultMemoryBufferSizeMB = 200;
private const bool DefaultPrefetchEnabled = true;
- private const int DefaultFetchBatchSize = 2000000;
private const int DefaultTimeoutMinutes = 5;
private const int DefaultMaxUrlRefreshAttempts = 3;
private const int DefaultUrlExpirationBufferSeconds = 60;
@@ -203,7 +202,7 @@ namespace
Apache.Arrow.Adbc.Drivers.Databricks.Reader.CloudFetch
initialResults,
_memoryManager,
_downloadQueue,
- DefaultFetchBatchSize,
+ _statement.BatchSize,
urlExpirationBufferSeconds);
// Initialize the downloader
diff --git a/csharp/src/Drivers/Databricks/readme.md
b/csharp/src/Drivers/Databricks/readme.md
index 9159a7521..caf02987b 100644
--- a/csharp/src/Drivers/Databricks/readme.md
+++ b/csharp/src/Drivers/Databricks/readme.md
@@ -128,6 +128,7 @@ CloudFetch is Databricks' high-performance result retrieval
system that download
| `adbc.databricks.use_desc_table_extended` | Whether to use DESC TABLE
EXTENDED to get extended column metadata when supported by DBR | `true` |
| `adbc.databricks.enable_run_async_thrift` | Whether to enable RunAsync flag
in Thrift operations | `true` |
| `adbc.databricks.driver_config_take_precedence` | Whether driver
configuration overrides passed-in properties during configuration merging |
`false` |
+| `adbc.apache.statement.batch_size` | Sets the maximum number of rows to
retrieve in a single batch request | `2000000` |
### Tracing Properties