This is an automated email from the ASF dual-hosted git repository.
kou pushed a commit to branch main
in repository https://gitbox.apache.org/repos/asf/arrow.git
The following commit(s) were added to refs/heads/main by this push:
new adef2efa26 GH-49043: [C++][FS][Azure] Avoid bugs caused by empty first
page(s) followed by non-empty subsequent page(s) (#49049)
adef2efa26 is described below
commit adef2efa26b84d00ddddbb110fa3e877a42b9a8c
Author: Thomas Newton <[email protected]>
AuthorDate: Fri Jan 30 08:37:49 2026 +0000
GH-49043: [C++][FS][Azure] Avoid bugs caused by empty first page(s)
followed by non-empty subsequent page(s) (#49049)
### Rationale for this change
Prevent bugs similar to https://github.com/apache/arrow/issues/49043
### What changes are included in this PR?
- Implement `SkipStartingEmptyPages` for various types of PagedResponses
used in the `AzureFileSystem`.
- Apply `SkipStartingEmptyPages` on the response from every list operation
that returns a paged response.
### Are these changes tested?
Ran the tests in the codebase including the ones that need to connect to
real blob storage. This makes me fairly confident that I haven't introduced a
regression.
The only reproduce I've found involves reading a production Azure blob
storage account. With this I've tested that this PR solves
https://github.com/apache/arrow/issues/49043, but I haven't been able to
reproduce it in any checked in tests. I tried copying a chunk of data around
our prod reproduce into azurite, but still can't reproduce.
### Are there any user-facing changes?
Some low probability bugs will be gone. No interface changes.
* GitHub Issue: #49043
Authored-by: Thomas Newton <[email protected]>
Signed-off-by: Sutou Kouhei <[email protected]>
---
cpp/src/arrow/filesystem/azurefs.cc | 36 ++++++++++++++++++++++++++++++++++++
1 file changed, 36 insertions(+)
diff --git a/cpp/src/arrow/filesystem/azurefs.cc
b/cpp/src/arrow/filesystem/azurefs.cc
index 6580476d38..7b1776a2af 100644
--- a/cpp/src/arrow/filesystem/azurefs.cc
+++ b/cpp/src/arrow/filesystem/azurefs.cc
@@ -967,6 +967,32 @@ Status StageBlock(Blobs::BlockBlobClient*
block_blob_client, const std::string&
return Status::OK();
}
+// Usually if the first page is empty it means there are no results. This was
assumed in
+// several places in AzureFilesystem. The Azure docs do not guarantee this and
we have
+// evidence (GH-49043) that there can be subsequent non-empty pages.
+// Applying `SkipStartingEmptyPages` on a paged response corrects this
assumption.
+void SkipStartingEmptyPages(Blobs::ListBlobContainersPagedResponse&
paged_response) {
+ while (paged_response.HasPage() && paged_response.BlobContainers.empty()) {
+ paged_response.MoveToNextPage();
+ }
+}
+void SkipStartingEmptyPages(Blobs::ListBlobsPagedResponse& paged_response) {
+ while (paged_response.HasPage() && paged_response.Blobs.size() == 0) {
+ paged_response.MoveToNextPage();
+ }
+}
+void SkipStartingEmptyPages(Blobs::ListBlobsByHierarchyPagedResponse&
paged_response) {
+ while (paged_response.HasPage() && paged_response.Blobs.empty() &&
+ paged_response.BlobPrefixes.empty()) {
+ paged_response.MoveToNextPage();
+ }
+}
+void SkipStartingEmptyPages(DataLake::ListPathsPagedResponse& paged_response) {
+ while (paged_response.HasPage() && paged_response.Paths.empty()) {
+ paged_response.MoveToNextPage();
+ }
+}
+
/// Writes will be buffered up to this size (in bytes) before actually
uploading them.
static constexpr int64_t kBlockUploadSizeBytes = 10 * 1024 * 1024;
/// The maximum size of a block in Azure Blob (as per docs).
@@ -1805,12 +1831,14 @@ class AzureFileSystem::Impl {
try {
FileInfo info{location.all};
auto list_response = container_client.ListBlobsByHierarchy(kDelimiter,
options);
+ SkipStartingEmptyPages(list_response);
// Since PageSizeHint=1, we expect at most one entry in either Blobs or
// BlobPrefixes. A BlobPrefix always ends with kDelimiter ("/"), so we
can
// distinguish between a directory and a file by checking if we received
a
// prefix or a blob.
// This strategy allows us to implement GetFileInfo with just 1 blob
storage
// operation in almost every case.
+
if (!list_response.BlobPrefixes.empty()) {
// Ensure the returned BlobPrefixes[0] string doesn't contain more
characters than
// the requested Prefix. For instance, if we request with
Prefix="dir/abra" and
@@ -1847,6 +1875,7 @@ class AzureFileSystem::Impl {
// whether the path is a directory.
options.Prefix = internal::EnsureTrailingSlash(location.path);
auto list_with_trailing_slash_response =
container_client.ListBlobs(options);
+ SkipStartingEmptyPages(list_with_trailing_slash_response);
if (!list_with_trailing_slash_response.Blobs.empty()) {
info.set_type(FileType::Directory);
return info;
@@ -1909,6 +1938,7 @@ class AzureFileSystem::Impl {
try {
auto container_list_response =
blob_service_client_->ListBlobContainers(options, context);
+ SkipStartingEmptyPages(container_list_response);
for (; container_list_response.HasPage();
container_list_response.MoveToNextPage(context)) {
for (const auto& container : container_list_response.BlobContainers) {
@@ -1950,6 +1980,7 @@ class AzureFileSystem::Impl {
auto base_path_depth = internal::GetAbstractPathDepth(base_location.path);
try {
auto list_response = directory_client.ListPaths(select.recursive,
options, context);
+ SkipStartingEmptyPages(list_response);
for (; list_response.HasPage(); list_response.MoveToNextPage(context)) {
if (list_response.Paths.empty()) {
continue;
@@ -2040,6 +2071,7 @@ class AzureFileSystem::Impl {
try {
auto list_response =
container_client.ListBlobsByHierarchy(/*delimiter=*/"/", options,
context);
+ SkipStartingEmptyPages(list_response);
for (; list_response.HasPage(); list_response.MoveToNextPage(context)) {
if (list_response.Blobs.empty() && list_response.BlobPrefixes.empty())
{
continue;
@@ -2442,6 +2474,7 @@ class AzureFileSystem::Impl {
bool found_dir_marker_blob = false;
try {
auto list_response = container_client.ListBlobs(options);
+ SkipStartingEmptyPages(list_response);
if (list_response.Blobs.empty()) {
if (require_dir_to_exist) {
return PathNotFound(location);
@@ -2575,6 +2608,7 @@ class AzureFileSystem::Impl {
auto directory_client = adlfs_client.GetDirectoryClient(location.path);
try {
auto list_response = directory_client.ListPaths(false);
+ SkipStartingEmptyPages(list_response);
for (; list_response.HasPage(); list_response.MoveToNextPage()) {
for (const auto& path : list_response.Paths) {
if (path.IsDirectory) {
@@ -2899,6 +2933,7 @@ class AzureFileSystem::Impl {
list_blobs_options.PageSizeHint = 1;
try {
auto dest_list_response =
dest_container_client.ListBlobs(list_blobs_options);
+ SkipStartingEmptyPages(dest_list_response);
dest_is_empty = dest_list_response.Blobs.empty();
if (!dest_is_empty) {
return NotEmpty(dest);
@@ -2952,6 +2987,7 @@ class AzureFileSystem::Impl {
list_blobs_options.PageSizeHint = 1;
try {
auto src_list_response =
src_container_client.ListBlobs(list_blobs_options);
+ SkipStartingEmptyPages(src_list_response);
if (!src_list_response.Blobs.empty()) {
// Reminder: dest is used here because we're semantically replacing
dest
// with src. By deleting src if it's empty just like dest.