adamreeve opened a new issue, #41771:
URL: https://github.com/apache/arrow/issues/41771

   ### Describe the bug, including details regarding any error messages, 
version, and platform.
   
   The following unit test, which I added to 
`cpp/src/arrow/dataset/dataset_test.cc`, reproduces the problem. It logs the 
files that are open in the dataset directory (this only works on Linux, as it 
reads `/proc/self/fd`) and needs some extra headers:
   ```C++
   #include <unistd.h>
   #include <filesystem>
   #include "arrow/dataset/file_ipc.h"
   #include "arrow/ipc/api.h" 
   ```
   
   Test methods:
   ```C++
   // Prints a header line naming `directory` and `context`, then one line for
   // each file descriptor of the current process whose symlink target under
   // /proc/self/fd begins with `directory`. Linux-only: relies on procfs.
   void ListOpenFilesInDir(const std::string& directory,
                           const std::string& context) {
     std::cout << "Open files in directory " << directory << " " << context
               << ":" << std::endl;

     for (const std::filesystem::directory_entry& fd_link :
          std::filesystem::directory_iterator("/proc/self/fd")) {
       // Each entry is a symlink to the file the descriptor refers to.
       char link_target[PATH_MAX];
       const ssize_t target_len =
           ::readlink(fd_link.path().c_str(), link_target, PATH_MAX - 1);
       if (target_len == -1) {
         // Link could not be read; skip this descriptor.
         continue;
       }

       // readlink does not NUL-terminate, so build the string from the length.
       const std::string resolved(link_target,
                                  static_cast<std::size_t>(target_len));
       const bool in_directory =
           resolved.compare(0, directory.size(), directory) == 0;
       if (in_directory) {
         std::cout << resolved << std::endl;
       }
     }
   }
   
   // Reproduction for the report: writes a small IPC file into a temporary
   // directory, scans it back through a dataset scanner, and calls
   // ListOpenFilesInDir at four points (after reading, after Close(), after the
   // reader's scope ends, after the scanner's scope ends) to show which files
   // under the temporary directory the process still has open at each stage.
   TEST(TestDatasetScan, ScanToRecordBatchReader) {
     ASSERT_OK_AND_ASSIGN(auto tempdir, 
arrow::internal::TemporaryDir::Make("dataset-scan-test-"));
     // This path string is the prefix that ListOpenFilesInDir filters on.
     std::string tempdir_path = tempdir->path().ToString();
   
     // Two int64 columns ("x", "y") with two rows of data.
     auto schema = arrow::schema({field("x", int64()), field("y", int64())});
     auto table = TableFromJSON(schema, {R"([
         [1, 2],
         [3, 4]
       ])"});
   
     auto format = std::make_shared<arrow::dataset::IpcFileFormat>();
     auto file_system = std::make_shared<fs::LocalFileSystem>();
     ASSERT_OK_AND_ASSIGN(auto file_path, tempdir->path().Join("data.arrow"));
     std::string file_path_str = file_path.ToString();
   
     // Write the table to data.arrow. The enclosing braces end the lifetime of
     // out_stream and file_writer before any scanning starts.
     // NOTE(review): this is the only EXPECT_ (rather than ASSERT_) macro in the
     // test -- presumably unintentional; confirm.
     {
       EXPECT_OK_AND_ASSIGN(auto out_stream, 
file_system->OpenOutputStream(file_path_str));
       ASSERT_OK_AND_ASSIGN(
           auto file_writer,
           MakeFileWriter(out_stream, schema, 
arrow::ipc::IpcWriteOptions::Defaults()));
       ASSERT_OK(file_writer->WriteTable(*table));
       ASSERT_OK(file_writer->Close());
     }
   
     // Build a dataset over the single file that was just written.
     std::vector<std::string> paths {file_path_str};
     FileSystemFactoryOptions options;
     ASSERT_OK_AND_ASSIGN(auto factory, 
arrow::dataset::FileSystemDatasetFactory::Make(file_system, paths, format, 
options));
     ASSERT_OK_AND_ASSIGN(auto dataset, factory->Finish());
   
     // The nesting below is deliberate: the outer braces bound the lifetime of
     // scanner_builder and scanner, the inner braces bound the lifetime of
     // record_batch_reader and read_table, so each ListOpenFilesInDir call
     // observes a distinct stage.
     {
       ASSERT_OK_AND_ASSIGN(auto scanner_builder, dataset->NewScan());
       ASSERT_OK_AND_ASSIGN(auto scanner, scanner_builder->Finish());
       {
         ASSERT_OK_AND_ASSIGN(auto record_batch_reader, 
scanner->ToRecordBatchReader());
         // Stage 1: the reader has been fully consumed into a table.
         ASSERT_OK_AND_ASSIGN(auto read_table, record_batch_reader->ToTable());
         ListOpenFilesInDir(tempdir_path, "after read");
         // Stage 2: Close() has been called explicitly on the reader.
         ASSERT_OK(record_batch_reader->Close());
         ListOpenFilesInDir(tempdir_path, "after close");
       }
       // Stage 3: record_batch_reader has gone out of scope; scanner is alive.
       ListOpenFilesInDir(tempdir_path, "after reader destruct");
     }
     // Stage 4: scanner and scanner_builder have gone out of scope as well.
     ListOpenFilesInDir(tempdir_path, "after scanner destruct");
   }
   ```
   
   When I run this (on Fedora 39, using GCC 13), I get output like:
   ```
   Open files in directory /tmp/dataset-scan-test-268jyz3s/ after read:
   /tmp/dataset-scan-test-268jyz3s/data.arrow
   Open files in directory /tmp/dataset-scan-test-268jyz3s/ after close:
   /tmp/dataset-scan-test-268jyz3s/data.arrow
   Open files in directory /tmp/dataset-scan-test-268jyz3s/ after reader 
destruct:
   Open files in directory /tmp/dataset-scan-test-268jyz3s/ after scanner 
destruct:
   ```
   
   This shows that neither consuming the `RecordBatchReader` by reading it into 
a table nor calling the `Close` method results in the IPC file being closed; 
it is only closed after the reader is destroyed. The `Close` implementation 
doesn't do anything other than consume all the data: 
https://github.com/apache/arrow/blob/37e5240e2430564b1c2dfa5d1e6a7a6b58576f83/cpp/src/arrow/dataset/scanner.cc#L113-L120
   
   For context, this causes errors when trying to remove the dataset directory 
on Windows when using the GLib bindings via Ruby, where there is no way to 
force destruction of the reader and we have to rely on GC (#41750).
   
   ### Component(s)
   
   C++


-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: issues-unsubscr...@arrow.apache.org.apache.org

For queries about this service, please contact Infrastructure at:
us...@infra.apache.org

Reply via email to