bryanck commented on code in PR #8303:
URL: https://github.com/apache/iceberg/pull/8303#discussion_r1293688494
########## azure/src/test/java/org/apache/iceberg/azure/adlsv2/ADLSv2InputStreamTest.java: ##########
@@ -0,0 +1,239 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.iceberg.azure.adlsv2;
+
+import static org.assertj.core.api.Assertions.assertThat;
+import static org.assertj.core.api.Assertions.assertThatThrownBy;
+import static org.mockito.ArgumentMatchers.any;
+import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.spy;
+import static org.mockito.Mockito.when;
+
+import com.azure.storage.file.datalake.DataLakeFileClient;
+import com.azure.storage.file.datalake.models.DataLakeFileOpenInputStreamResult;
+import com.azure.storage.file.datalake.models.FileRange;
+import com.azure.storage.file.datalake.models.PathProperties;
+import com.azure.storage.file.datalake.options.DataLakeFileInputStreamOptions;
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.Random;
+import org.apache.iceberg.azure.AzureProperties;
+import org.apache.iceberg.io.IOUtil;
+import org.apache.iceberg.io.RangeReadable;
+import org.apache.iceberg.io.SeekableInputStream;
+import org.apache.iceberg.metrics.MetricsContext;
+import org.assertj.core.api.Assertions;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+
+public class ADLSv2InputStreamTest {
+
+  private final Random random = new Random(1);
+  private final AzureProperties azureProperties = new AzureProperties();
+  private DataLakeFileClient fileClient;
+
+  @BeforeEach
+  public void before() {
+    fileClient = mock(DataLakeFileClient.class);
+  }
+
+  private void setupData(byte[] data) {
+    PathProperties pathProps = mock(PathProperties.class);
+    when(pathProps.getFileSize()).thenReturn((long) data.length);
+
+    DataLakeFileOpenInputStreamResult openResult = mock(DataLakeFileOpenInputStreamResult.class);
+    when(openResult.getProperties()).thenReturn(pathProps);
+
+    when(fileClient.openInputStream(any()))
+        .thenAnswer(
+            i -> {
+              DataLakeFileInputStreamOptions options =
+                  (DataLakeFileInputStreamOptions) i.getArguments()[0];
+              FileRange range = options.getRange();
+
+              byte[] streamData;
+              if (range == null) {
+                streamData = data;
+              } else {
+                int start = (int) range.getOffset();
+                int maxLen = data.length - start;
+                int len =
+                    range.getCount() == null
+                        ? data.length - start
+                        : Math.min(maxLen, range.getCount().intValue());
+                streamData = new byte[len];
+                System.arraycopy(data, start, streamData, 0, len);
+              }
+
+              // disable available() so large seek will trigger new stream
+              ByteArrayInputStream in = spy(new ByteArrayInputStream(streamData));
+              when(in.available()).thenReturn(0);
+
+              when(openResult.getInputStream()).thenReturn(in);
+              return openResult;
+            });
+  }
+
+  @Test
+  public void testRead() throws Exception {
+    int dataSize = 1024 * 1024 * 10;
+    byte[] data = randomData(dataSize);
+
+    setupData(data);
+
+    try (SeekableInputStream in =
+        new ADLSv2InputStream(fileClient, null, azureProperties, MetricsContext.nullMetrics())) {
+      int readSize = 1024;
+
+      readAndCheck(in, in.getPos(), readSize, data, false);
+      readAndCheck(in, in.getPos(), readSize, data, true);
+
+      // Seek forward in current stream
+      int seekSize = 1024;
+      readAndCheck(in, in.getPos() + seekSize, readSize, data, false);
+      readAndCheck(in, in.getPos() + seekSize, readSize, data, true);
+
+      // Buffered read
+      readAndCheck(in, in.getPos(), readSize, data, true);
+      readAndCheck(in, in.getPos(), readSize, data, false);
+
+      // Seek with new stream
+      long seekNewStreamPosition = 2 * 1024 * 1024;
+      readAndCheck(in, in.getPos() + seekNewStreamPosition, readSize, data, true);
+      readAndCheck(in, in.getPos() + seekNewStreamPosition, readSize, data, false);
+
+      // Backseek and read
+      readAndCheck(in, 0, readSize, data, true);
+      readAndCheck(in, 0, readSize, data, false);
+    }
+  }
+
+  @Test
+  public void testReadSingle() throws Exception {
+    int i0 = 1;
+    int i1 = 255;
+    byte[] data = {(byte) i0, (byte) i1};
+
+    setupData(data);
+
+    try (SeekableInputStream in =
+        new ADLSv2InputStream(fileClient, null, azureProperties, MetricsContext.nullMetrics())) {
+      assertThat(in.read()).isEqualTo(i0);
+      assertThat(in.read()).isEqualTo(i1);
+    }
+  }
+
+  private void readAndCheck(
+      SeekableInputStream in, long rangeStart, int size, byte[] original, boolean buffered)
+      throws IOException {
+    in.seek(rangeStart);
+    assertThat(rangeStart).isEqualTo(in.getPos());
+
+    long rangeEnd = rangeStart + size;
+    byte[] actual = new byte[size];
+
+    if (buffered) {
+      IOUtil.readFully(in, actual, 0, actual.length);
+    } else {
+      int read = 0;
+      while (read < size) {
+        actual[read++] = (byte) in.read();
+      }
+    }
+
+    assertThat(in.getPos()).isEqualTo(rangeEnd);
+    assertThat(actual).isEqualTo(Arrays.copyOfRange(original, (int) rangeStart, (int) rangeEnd));
+  }
+
+  @Test
+  public void testRangeRead() throws Exception {
+    int dataSize = 1024 * 1024 * 10;
+    byte[] expected = randomData(dataSize);
+    byte[] actual = new byte[dataSize];
+
+    long position;
+    int offset;
+    int length;
+
+    setupData(expected);
+
+    try (RangeReadable in =
+        new ADLSv2InputStream(fileClient, null, azureProperties, MetricsContext.nullMetrics())) {
+      // first 1k
+      position = 0;
+      offset = 0;
+      length = 1024;
+      readAndCheckRanges(in, expected, position, actual, offset, length);
+
+      // last 1k
+      position = dataSize - 1024;
+      offset = dataSize - 1024;
+      readAndCheckRanges(in, expected, position, actual, offset, length);
+
+      // middle 2k
+      position = dataSize / 2 - 1024;
+      offset = dataSize / 2 - 1024;
+      length = 1024 * 2;
+      readAndCheckRanges(in, expected, position, actual, offset, length);
+    }
+  }
+
+  private void readAndCheckRanges(
+      RangeReadable in, byte[] original, long position, byte[] buffer, int offset, int length)
+      throws IOException {
+    in.readFully(position, buffer, offset, length);
+
+    Assertions.assertThat(Arrays.copyOfRange(buffer, offset, offset + length))
+        .isEqualTo(Arrays.copyOfRange(original, offset, offset + length));
+  }
+
+  @Test
+  public void testClose() throws Exception {
+    setupData(randomData(2));
+    SeekableInputStream closed =
+        new ADLSv2InputStream(fileClient, null, azureProperties, MetricsContext.nullMetrics());
+    closed.close();
+    assertThatThrownBy(() -> closed.seek(0)).isInstanceOf(IllegalStateException.class);
Review Comment:
   Thanks for pointing that out, I added the message assertion, and also added a test for negative seek.
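   For context, a minimal sketch of what those two additions might look like in this test class (the exact exception message and the exception type thrown for a negative seek are assumptions here, not taken from the final PR):

   @Test
   public void testSeekAfterClose() throws Exception {
     setupData(randomData(2));
     SeekableInputStream closed =
         new ADLSv2InputStream(fileClient, null, azureProperties, MetricsContext.nullMetrics());
     closed.close();
     // seek on a closed stream should fail with a descriptive message
     assertThatThrownBy(() -> closed.seek(0))
         .isInstanceOf(IllegalStateException.class)
         .hasMessageContaining("closed"); // assumed message fragment
   }

   @Test
   public void testNegativeSeek() throws Exception {
     setupData(randomData(2));
     try (SeekableInputStream in =
         new ADLSv2InputStream(fileClient, null, azureProperties, MetricsContext.nullMetrics())) {
       // seeking to a negative position should be rejected
       assertThatThrownBy(() -> in.seek(-1))
           .isInstanceOf(IllegalArgumentException.class); // assumed exception type
     }
   }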
########## azure/src/main/java/org/apache/iceberg/azure/AzureProperties.java: ##########
@@ -0,0 +1,59 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.iceberg.azure;
+
+import com.azure.identity.DefaultAzureCredential;
+import com.azure.identity.DefaultAzureCredentialBuilder;
+import com.azure.storage.file.datalake.DataLakePathClientBuilder;
+import java.io.Serializable;
+import java.util.Map;
+import java.util.Optional;
+
+public class AzureProperties implements Serializable {
+  public static final String ADLSV2_READ_BLOCK_SIZE = "adlsv2.read.block-size-bytes";
+  public static final String ADLSV2_WRITE_BLOCK_SIZE = "adlsv2.write.block-size-bytes";
+
+  private static final DefaultAzureCredential DEFAULT_CREDENTIAL =
+      new DefaultAzureCredentialBuilder().build();
+
+  private Integer adlsv2ReadBlockSize;
+  private Long adlsv2WriteBlockSize;
+
+  public AzureProperties() {}
+
+  public AzureProperties(Map<String, String> properties) {
+    if (properties.containsKey(ADLSV2_READ_BLOCK_SIZE)) {
+      adlsv2ReadBlockSize = Integer.parseInt(properties.get(ADLSV2_READ_BLOCK_SIZE));
+    }
+    if (properties.containsKey(ADLSV2_WRITE_BLOCK_SIZE)) {
+      adlsv2WriteBlockSize = Long.parseLong(properties.get(ADLSV2_WRITE_BLOCK_SIZE));
+    }
+  }
+
+  public Optional<Integer> adlsv2ReadBlockSize() {
+    return Optional.ofNullable(adlsv2ReadBlockSize);
+  }
+
+  public Optional<Long> adlsv2WriteBlockSize() {
+    return Optional.ofNullable(adlsv2WriteBlockSize);
+  }
+
+  public <T extends DataLakePathClientBuilder> void applyCredentialConfigurations(T builder) {

Review Comment:
   This mimicked the S3 config class but singular makes more sense, so I changed it.
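   For reference, a sketch of the singular form this refers to (a guess at the shape, assuming the method simply registers the shared DefaultAzureCredential on the builder; the merged name and body may differ):

   // Hypothetical singular rename of the method above.
   public <T extends DataLakePathClientBuilder> void applyCredentialConfiguration(T builder) {
     // DefaultAzureCredential resolves credentials from the environment
     // (env vars, managed identity, Azure CLI login, etc.)
     builder.credential(DEFAULT_CREDENTIAL);
   }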
