nastra commented on code in PR #8303:
URL: https://github.com/apache/iceberg/pull/8303#discussion_r1293464118


##########
azure/src/test/java/org/apache/iceberg/azure/adlsv2/ADLSv2LocationTest.java:
##########
@@ -0,0 +1,96 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.iceberg.azure.adlsv2;
+
+import java.util.stream.Stream;
+import org.apache.iceberg.exceptions.ValidationException;
+import org.assertj.core.api.Assertions;
+import org.junit.jupiter.api.Test;
+
+public class ADLSv2LocationTest {
+  @Test
+  public void testLocationParsing() {
+    Stream.of("abfs", "abfss")

Review Comment:
   Maybe this test should be parameterized, to make it explicit that we test
with different parameters:
   ```
   @ParameterizedTest
   @ValueSource(strings = {"abfs", "abfss"})
   public void testLocationParsing(String scheme)
   ...
   ```



##########
azure/src/test/java/org/apache/iceberg/azure/adlsv2/ADLSv2OutputStreamTest.java:
##########
@@ -0,0 +1,101 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.iceberg.azure.adlsv2;
+
+import static org.assertj.core.api.Assertions.assertThat;
+import static org.mockito.ArgumentMatchers.any;
+import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.when;
+
+import com.azure.storage.file.datalake.DataLakeFileClient;
+import java.io.ByteArrayOutputStream;
+import java.io.IOException;
+import java.io.UncheckedIOException;
+import java.util.Random;
+import java.util.stream.Stream;
+import org.apache.iceberg.azure.AzureProperties;
+import org.apache.iceberg.metrics.MetricsContext;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+
+public class ADLSv2OutputStreamTest {
+
+  private final Random random = new Random(1);
+  private final AzureProperties azureProperties = new AzureProperties();
+  private DataLakeFileClient fileClient;
+
+  @BeforeEach
+  public void before() {
+    fileClient = mock(DataLakeFileClient.class);
+  }
+
+  @Test
+  public void testWrite() {

Review Comment:
   Maybe this should be a parameterized test using `@ParameterizedTest
@ValueSource(booleans = {true, false})`?



##########
azure/src/test/java/org/apache/iceberg/azure/adlsv2/ADLSv2InputStreamTest.java:
##########
@@ -0,0 +1,239 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.iceberg.azure.adlsv2;
+
+import static org.assertj.core.api.Assertions.assertThat;
+import static org.assertj.core.api.Assertions.assertThatThrownBy;
+import static org.mockito.ArgumentMatchers.any;
+import static org.mockito.Mockito.mock;
+import static org.mockito.Mockito.spy;
+import static org.mockito.Mockito.when;
+
+import com.azure.storage.file.datalake.DataLakeFileClient;
+import 
com.azure.storage.file.datalake.models.DataLakeFileOpenInputStreamResult;
+import com.azure.storage.file.datalake.models.FileRange;
+import com.azure.storage.file.datalake.models.PathProperties;
+import com.azure.storage.file.datalake.options.DataLakeFileInputStreamOptions;
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.Random;
+import org.apache.iceberg.azure.AzureProperties;
+import org.apache.iceberg.io.IOUtil;
+import org.apache.iceberg.io.RangeReadable;
+import org.apache.iceberg.io.SeekableInputStream;
+import org.apache.iceberg.metrics.MetricsContext;
+import org.assertj.core.api.Assertions;
+import org.junit.jupiter.api.BeforeEach;
+import org.junit.jupiter.api.Test;
+
+public class ADLSv2InputStreamTest {
+
+  private final Random random = new Random(1);
+  private final AzureProperties azureProperties = new AzureProperties();
+  private DataLakeFileClient fileClient;
+
+  @BeforeEach
+  public void before() {
+    fileClient = mock(DataLakeFileClient.class);
+  }
+
+  private void setupData(byte[] data) {
+    PathProperties pathProps = mock(PathProperties.class);
+    when(pathProps.getFileSize()).thenReturn((long) data.length);
+
+    DataLakeFileOpenInputStreamResult openResult = 
mock(DataLakeFileOpenInputStreamResult.class);
+    when(openResult.getProperties()).thenReturn(pathProps);
+
+    when(fileClient.openInputStream(any()))
+        .thenAnswer(
+            i -> {
+              DataLakeFileInputStreamOptions options =
+                  (DataLakeFileInputStreamOptions) i.getArguments()[0];
+              FileRange range = options.getRange();
+
+              byte[] streamData;
+              if (range == null) {
+                streamData = data;
+              } else {
+                int start = (int) range.getOffset();
+                int maxLen = data.length - start;
+                int len =
+                    range.getCount() == null
+                        ? data.length - start
+                        : Math.min(maxLen, range.getCount().intValue());
+                streamData = new byte[len];
+                System.arraycopy(data, start, streamData, 0, len);
+              }
+
+              // disable available() so large seek will trigger new stream
+              ByteArrayInputStream in = spy(new 
ByteArrayInputStream(streamData));
+              when(in.available()).thenReturn(0);
+
+              when(openResult.getInputStream()).thenReturn(in);
+              return openResult;
+            });
+  }
+
+  @Test
+  public void testRead() throws Exception {
+    int dataSize = 1024 * 1024 * 10;
+    byte[] data = randomData(dataSize);
+
+    setupData(data);
+
+    try (SeekableInputStream in =
+        new ADLSv2InputStream(fileClient, null, azureProperties, 
MetricsContext.nullMetrics())) {
+      int readSize = 1024;
+
+      readAndCheck(in, in.getPos(), readSize, data, false);
+      readAndCheck(in, in.getPos(), readSize, data, true);
+
+      // Seek forward in current stream
+      int seekSize = 1024;
+      readAndCheck(in, in.getPos() + seekSize, readSize, data, false);
+      readAndCheck(in, in.getPos() + seekSize, readSize, data, true);
+
+      // Buffered read
+      readAndCheck(in, in.getPos(), readSize, data, true);
+      readAndCheck(in, in.getPos(), readSize, data, false);
+
+      // Seek with new stream
+      long seekNewStreamPosition = 2 * 1024 * 1024;
+      readAndCheck(in, in.getPos() + seekNewStreamPosition, readSize, data, 
true);
+      readAndCheck(in, in.getPos() + seekNewStreamPosition, readSize, data, 
false);
+
+      // Backseek and read
+      readAndCheck(in, 0, readSize, data, true);
+      readAndCheck(in, 0, readSize, data, false);
+    }
+  }
+
+  @Test
+  public void testReadSingle() throws Exception {
+    int i0 = 1;
+    int i1 = 255;
+    byte[] data = {(byte) i0, (byte) i1};
+
+    setupData(data);
+
+    try (SeekableInputStream in =
+        new ADLSv2InputStream(fileClient, null, azureProperties, 
MetricsContext.nullMetrics())) {
+      assertThat(in.read()).isEqualTo(i0);
+      assertThat(in.read()).isEqualTo(i1);
+    }
+  }
+
+  private void readAndCheck(
+      SeekableInputStream in, long rangeStart, int size, byte[] original, 
boolean buffered)
+      throws IOException {
+    in.seek(rangeStart);
+    assertThat(rangeStart).isEqualTo(in.getPos());
+
+    long rangeEnd = rangeStart + size;
+    byte[] actual = new byte[size];
+
+    if (buffered) {
+      IOUtil.readFully(in, actual, 0, actual.length);
+    } else {
+      int read = 0;
+      while (read < size) {
+        actual[read++] = (byte) in.read();
+      }
+    }
+
+    assertThat(in.getPos()).isEqualTo(rangeEnd);
+    assertThat(actual).isEqualTo(Arrays.copyOfRange(original, (int) 
rangeStart, (int) rangeEnd));
+  }
+
+  @Test
+  public void testRangeRead() throws Exception {
+    int dataSize = 1024 * 1024 * 10;
+    byte[] expected = randomData(dataSize);
+    byte[] actual = new byte[dataSize];
+
+    long position;
+    int offset;
+    int length;
+
+    setupData(expected);
+
+    try (RangeReadable in =
+        new ADLSv2InputStream(fileClient, null, azureProperties, 
MetricsContext.nullMetrics())) {
+      // first 1k
+      position = 0;
+      offset = 0;
+      length = 1024;
+      readAndCheckRanges(in, expected, position, actual, offset, length);
+
+      // last 1k
+      position = dataSize - 1024;
+      offset = dataSize - 1024;
+      readAndCheckRanges(in, expected, position, actual, offset, length);
+
+      // middle 2k
+      position = dataSize / 2 - 1024;
+      offset = dataSize / 2 - 1024;
+      length = 1024 * 2;
+      readAndCheckRanges(in, expected, position, actual, offset, length);
+    }
+  }
+
+  private void readAndCheckRanges(
+      RangeReadable in, byte[] original, long position, byte[] buffer, int 
offset, int length)
+      throws IOException {
+    in.readFully(position, buffer, offset, length);
+
+    Assertions.assertThat(Arrays.copyOfRange(buffer, offset, offset + length))
+        .isEqualTo(Arrays.copyOfRange(original, offset, offset + length));
+  }
+
+  @Test
+  public void testClose() throws Exception {
+    setupData(randomData(2));
+    SeekableInputStream closed =
+        new ADLSv2InputStream(fileClient, null, azureProperties, 
MetricsContext.nullMetrics());
+    closed.close();
+    assertThatThrownBy(() -> 
closed.seek(0)).isInstanceOf(IllegalStateException.class);

Review Comment:
   Do you know if there's an actual error message? It would be good to add a
`.hasMessage(...)` check. If the message is null, then we could add
`.hasMessage(null)`.



##########
azure/src/main/java/org/apache/iceberg/azure/adlsv2/ADLSv2FileIO.java:
##########
@@ -0,0 +1,144 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.iceberg.azure.adlsv2;
+
+import com.azure.core.http.HttpClient;
+import com.azure.storage.file.datalake.DataLakeFileClient;
+import com.azure.storage.file.datalake.DataLakePathClientBuilder;
+import java.util.Map;
+import org.apache.iceberg.azure.AzureProperties;
+import org.apache.iceberg.common.DynConstructors;
+import org.apache.iceberg.io.FileIO;
+import org.apache.iceberg.io.InputFile;
+import org.apache.iceberg.io.OutputFile;
+import org.apache.iceberg.metrics.MetricsContext;
+import 
org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting;
+import org.apache.iceberg.util.SerializableMap;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+/**
+ * FileIO implementation backed by Azure Data Lake Storage v2 (ADLSv2)
+ *
+ * <p>Locations follow the conventions used by Hadoop's Azure support, i.e.
+ *
+ * <pre>{@code abfs[s]://<container>@<storage 
account>.dfs.core.windows.net/<blob_path>}</pre>
+ *
+ * <p>See <a 
href="https://hadoop.apache.org/docs/stable/hadoop-azure/abfs.html">Hadoop Azure
+ * Support</a>
+ */
+public class ADLSv2FileIO implements FileIO {
+  private static final Logger LOG = 
LoggerFactory.getLogger(ADLSv2FileIO.class);
+  private static final String DEFAULT_METRICS_IMPL =
+      "org.apache.iceberg.hadoop.HadoopMetricsContext";
+
+  private static final HttpClient HTTP = HttpClient.createDefault();
+
+  private AzureProperties azureProperties;
+  private MetricsContext metrics = MetricsContext.nullMetrics();
+  private SerializableMap<String, String> properties = null;
+
+  /**
+   * No-arg constructor to load the FileIO dynamically.
+   *
+   * <p>All fields are initialized by calling {@link 
ADLSv2FileIO#initialize(Map)} later.
+   */
+  public ADLSv2FileIO() {}
+
+  /**
+   * Constructor with Azure properties.
+   *
+   * <p>Calling {@link ADLSv2FileIO#initialize(Map)} will overwrite 
information set in this
+   * constructor.
+   *
+   * @param azureProperties Azure properties
+   */
+  public ADLSv2FileIO(AzureProperties azureProperties) {

Review Comment:
   do we need this one? It doesn't seem to be used



##########
azure/src/main/java/org/apache/iceberg/azure/AzureProperties.java:
##########
@@ -0,0 +1,59 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.iceberg.azure;
+
+import com.azure.identity.DefaultAzureCredential;
+import com.azure.identity.DefaultAzureCredentialBuilder;
+import com.azure.storage.file.datalake.DataLakePathClientBuilder;
+import java.io.Serializable;
+import java.util.Map;
+import java.util.Optional;
+
+public class AzureProperties implements Serializable {
+  public static final String ADLSV2_READ_BLOCK_SIZE = 
"adlsv2.read.block-size-bytes";
+  public static final String ADLSV2_WRITE_BLOCK_SIZE = 
"adlsv2.write.block-size-bytes";
+  private static final DefaultAzureCredential DEFAULT_CREDENTIAL =
+      new DefaultAzureCredentialBuilder().build();
+
+  private Integer adlsv2ReadBlockSize;
+  private Long adlsv2WriteBlockSize;
+
+  public AzureProperties() {}
+
+  public AzureProperties(Map<String, String> properties) {
+    if (properties.containsKey(ADLSV2_READ_BLOCK_SIZE)) {
+      adlsv2ReadBlockSize = 
Integer.parseInt(properties.get(ADLSV2_READ_BLOCK_SIZE));
+    }
+    if (properties.containsKey(ADLSV2_WRITE_BLOCK_SIZE)) {
+      adlsv2WriteBlockSize = 
Long.parseLong(properties.get(ADLSV2_WRITE_BLOCK_SIZE));
+    }
+  }
+
+  public Optional<Integer> adlsv2ReadBlockSize() {
+    return Optional.ofNullable(adlsv2ReadBlockSize);
+  }
+
+  public Optional<Long> adlsv2WriteBlockSize() {
+    return Optional.ofNullable(adlsv2WriteBlockSize);
+  }
+
+  public <T extends DataLakePathClientBuilder> void 
applyCredentialConfigurations(T builder) {

Review Comment:
   nit: should the name be in singular form rather than plural? 



##########
azure/src/test/java/org/apache/iceberg/azure/adlsv2/ADLSv2LocationTest.java:
##########
@@ -0,0 +1,96 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.iceberg.azure.adlsv2;
+
+import java.util.stream.Stream;
+import org.apache.iceberg.exceptions.ValidationException;
+import org.assertj.core.api.Assertions;
+import org.junit.jupiter.api.Test;
+
+public class ADLSv2LocationTest {
+  @Test
+  public void testLocationParsing() {
+    Stream.of("abfs", "abfss")
+        .forEach(
+            scheme -> {
+              String p1 = scheme + 
"://[email protected]/path/to/file";
+              ADLSv2Location location = new ADLSv2Location(p1);
+
+              Assertions.assertThat(location.storageAccountUrl())
+                  .isEqualTo("https://account.dfs.core.windows.net");
+              
Assertions.assertThat(location.container()).isEqualTo("container");
+              Assertions.assertThat(location.path()).isEqualTo("path/to/file");
+            });
+  }
+
+  @Test
+  public void testEncodedString() {
+    String p1 = 
"abfs://[email protected]/path%20to%20file";
+    ADLSv2Location location = new ADLSv2Location(p1);
+
+    Assertions.assertThat(location.storageAccountUrl())
+        .isEqualTo("https://account.dfs.core.windows.net");
+    Assertions.assertThat(location.container()).isEqualTo("container");
+    Assertions.assertThat(location.path()).isEqualTo("path%20to%20file");

Review Comment:
   nit: it's probably ok to add a static import for assertThat / 
assertThatThrownBy on newly introduced test classes



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

Reply via email to