This is an automated email from the ASF dual-hosted git repository. kakachen pushed a commit to branch orc in repository https://gitbox.apache.org/repos/asf/doris-thirdparty.git
The following commit(s) were added to refs/heads/orc by this push: new 401881eb394 [fix](cherry-pick) ORC-1525: Fix bad read in RleDecoderV2::readByte. 401881eb394 is described below commit 401881eb39434654691db69ba0cef7ebc9e46820 Author: Qi Chen <che...@selectdb.com> AuthorDate: Tue Jun 17 09:19:25 2025 +0800 [fix](cherry-pick) ORC-1525: Fix bad read in RleDecoderV2::readByte. --- c++/src/ByteRLE.cc | 8 ++++++ c++/test/TestWriter.cc | 78 ++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 86 insertions(+) diff --git a/c++/src/ByteRLE.cc b/c++/src/ByteRLE.cc index 890dae2a38a..b81d282e353 100644 --- a/c++/src/ByteRLE.cc +++ b/c++/src/ByteRLE.cc @@ -241,6 +241,8 @@ namespace orc { virtual void recordPosition(PositionRecorder* recorder) const override; + virtual void suppress() override; + private: int bitsRemained; char current; @@ -291,6 +293,12 @@ namespace orc { recorder->add(static_cast<uint64_t>(8 - bitsRemained)); } + void BooleanRleEncoderImpl::suppress() { + ByteRleEncoderImpl::suppress(); + bitsRemained = 8; + current = static_cast<char>(0); + } + std::unique_ptr<ByteRleEncoder> createBooleanRleEncoder( std::unique_ptr<BufferedOutputStream> output) { BooleanRleEncoderImpl* encoder = new BooleanRleEncoderImpl(std::move(output)); diff --git a/c++/test/TestWriter.cc b/c++/test/TestWriter.cc index c8c3ca1396d..803f14de780 100644 --- a/c++/test/TestWriter.cc +++ b/c++/test/TestWriter.cc @@ -1994,6 +1994,84 @@ namespace orc { EXPECT_FALSE(rowReader->next(*batch)); } + // first stripe has no null value and second stripe has null value. + // make sure stripes do not have dirty data in the present streams. 
+ TEST_P(WriterTest, testSuppressPresentStreamInPreStripe) { + MemoryOutputStream memStream(DEFAULT_MEM_STREAM_SIZE); + MemoryPool* pool = getDefaultPool(); + + // [1-998000): notNull, value is equal to index + // [998000-999000): null + // [999000-1000000]: notNull, value is equal to index + size_t rowCount = 1000000; + size_t nullBeginCount = 998000; + size_t nullEndCount = 999000; + size_t batchSize = 5; + { + auto type = std::unique_ptr<Type>(Type::buildTypeFromString("struct<col1:int>")); + WriterOptions options; + options.setStripeSize(16 * 1024) + .setCompressionBlockSize(1024) + .setCompression(CompressionKind_NONE) + .setMemoryPool(pool) + .setRowIndexStride(1000); + + auto writer = createWriter(*type, &memStream, options); + + uint64_t batchCount = rowCount / batchSize; + size_t rowsWrite = 0; + for (uint64_t batchIdx = 0; batchIdx < batchCount; batchIdx++) { + auto batch = writer->createRowBatch(batchSize); + auto& structBatch = dynamic_cast<StructVectorBatch&>(*batch); + auto& longBatch = dynamic_cast<LongVectorBatch&>(*structBatch.fields[0]); + structBatch.numElements = batchSize; + longBatch.numElements = batchSize; + longBatch.hasNulls = false; + for (uint64_t row = 0; row < batchSize; ++row) { + size_t rowIndex = rowsWrite + row + 1; + if (rowIndex < nullBeginCount || rowIndex >= nullEndCount) { + longBatch.data[row] = static_cast<int64_t>(rowIndex); + } else { + longBatch.notNull[row] = 0; + longBatch.hasNulls = true; + } + } + + writer->add(*batch); + rowsWrite += batch->numElements; + } + writer->close(); + } + // read file & check the column values are correct + { + std::unique_ptr<MemoryInputStream> inStream(new MemoryInputStream( + memStream.getData(), memStream.getLength())); + ReaderOptions readerOptions; + readerOptions.setMemoryPool(*pool); + std::unique_ptr<Reader> reader = createReader(std::move(inStream), readerOptions); + EXPECT_EQ(reader->getNumberOfStripes(), 2); + EXPECT_EQ(rowCount, reader->getNumberOfRows()); + 
std::unique_ptr<RowReader> rowReader = createRowReader(reader.get()); + size_t rowsRead = 0; + while (rowsRead < rowCount) { + auto batch = rowReader->createRowBatch(1000); + EXPECT_TRUE(rowReader->next(*batch)); + auto& structBatch = dynamic_cast<StructVectorBatch&>(*batch); + auto& longBatch = dynamic_cast<LongVectorBatch&>(*structBatch.fields[0]); + for (size_t i = 0; i < batch->numElements; ++i) { + size_t rowIndex = rowsRead + i + 1; + if (rowIndex < nullBeginCount || rowIndex >= nullEndCount) { + EXPECT_TRUE(longBatch.notNull[i]); + EXPECT_EQ(longBatch.data[i], static_cast<int64_t>(rowIndex)); + } else { + EXPECT_FALSE(longBatch.notNull[i]); + } + } + rowsRead += batch->numElements; + } + } + } + INSTANTIATE_TEST_SUITE_P(OrcTest, WriterTest, Values(FileVersion::v_0_11(), FileVersion::v_0_12(), FileVersion::UNSTABLE_PRE_2_0())); --------------------------------------------------------------------- To unsubscribe, e-mail: commits-unsubscr...@doris.apache.org For additional commands, e-mail: commits-h...@doris.apache.org