kevinjqliu commented on code in PR #2111: URL: https://github.com/apache/iceberg-python/pull/2111#discussion_r2157259714
########## pyiceberg/io/pyarrow.py: ########## @@ -394,6 +402,9 @@ def _initialize_fs(self, scheme: str, netloc: Optional[str] = None) -> FileSyste elif scheme in {"gs", "gcs"}: return self._initialize_gcs_fs() + elif scheme in {"abfs", "abfss", "wasb", "wasbs"}: Review Comment: I cant find any pyarrow docs that indicates `wasb` and `wasbs` is supported. ########## pyiceberg/io/__init__.py: ########## @@ -82,6 +82,10 @@ ADLS_CLIENT_ID = "adls.client-id" ADLS_CLIENT_SECRET = "adls.client-secret" ADLS_ACCOUNT_HOST = "adls.account-host" +ADLS_BLOB_STORAGE_AUTHORITY = "adls.blob-storage-authority" Review Comment: doc reference: https://arrow.apache.org/docs/python/filesystems.html#azure-storage-file-system and https://arrow.apache.org/docs/cpp/api/filesystem.html#azure-filesystem 👍 ########## pyiceberg/io/pyarrow.py: ########## @@ -197,6 +204,7 @@ MAP_VALUE_NAME = "value" DOC = "doc" UTC_ALIASES = {"UTC", "+00:00", "Etc/UTC", "Z"} +MIN_PYARROW_VERSION_SUPPORTING_AZURE_FS = "20.0.0" Review Comment: nit: inline this at the function level ########## tests/io/test_pyarrow.py: ########## @@ -1670,9 +1678,8 @@ def test_new_output_file_gcs(pyarrow_fileio_gcs: PyArrowFileIO) -> None: @pytest.mark.gcs -@pytest.mark.skip(reason="Open issue on Arrow: https://github.com/apache/arrow/issues/36993") Review Comment: i see that https://github.com/apache/arrow/issues/36993 is still open. is the issue resolved and we can run this test? ########## pyiceberg/io/pyarrow.py: ########## @@ -475,6 +486,42 @@ def _initialize_s3_fs(self, netloc: Optional[str]) -> FileSystem: return S3FileSystem(**client_kwargs) + def _initialize_azure_fs(self) -> FileSystem: + from packaging import version + + if version.parse(pyarrow.__version__) < version.parse(MIN_PYARROW_VERSION_SUPPORTING_AZURE_FS): + raise ImportError( + f"pyarrow version >= {MIN_PYARROW_VERSION_SUPPORTING_AZURE_FS} required for AzureFileSystem support, " + f"but found version {pyarrow.__version__}." + ) + + from pyarrow.fs import AzureFileSystem + + client_kwargs: Dict[str, str] = {} + + if account_name := self.properties.get(ADLS_ACCOUNT_NAME): + client_kwargs["account_name"] = account_name + + if account_key := self.properties.get(ADLS_ACCOUNT_KEY): + client_kwargs["account_key"] = account_key + + if blob_storage_authority := self.properties.get(ADLS_BLOB_STORAGE_AUTHORITY): + client_kwargs["blob_storage_authority"] = blob_storage_authority + + if dfs_storage_authority := self.properties.get(ADLS_DFS_STORAGE_AUTHORITY): + client_kwargs["dfs_storage_authority"] = dfs_storage_authority + + if blob_storage_scheme := self.properties.get(ADLS_BLOB_STORAGE_SCHEME): + client_kwargs["blob_storage_scheme"] = blob_storage_scheme + + if dfs_storage_scheme := self.properties.get(ADLS_DFS_STORAGE_SCHEME): + client_kwargs["dfs_storage_scheme"] = dfs_storage_scheme + + if sas_token := self.properties.get(ADLS_SAS_TOKEN): + client_kwargs["sas_token"] = sas_token + + return AzureFileSystem(**client_kwargs) Review Comment: We are not using some of the ADLS config variables: ``` ADLS_CONNECTION_STRING = "adls.connection-string" ADLS_TENANT_ID = "adls.tenant-id" ADLS_CLIENT_ID = "adls.client-id" ADLS_CLIENT_SECRET = "adls.client-secret" ADLS_ACCOUNT_HOST = "adls.account-host" ``` I wonder if these are fsspec specific or we can also include them here ########## pyiceberg/io/__init__.py: ########## @@ -82,6 +82,10 @@ ADLS_CLIENT_ID = "adls.client-id" ADLS_CLIENT_SECRET = "adls.client-secret" ADLS_ACCOUNT_HOST = "adls.account-host" +ADLS_BLOB_STORAGE_AUTHORITY = "adls.blob-storage-authority" Review Comment: we should add these to the docs https://py.iceberg.apache.org/configuration/#azure-data-lake ########## tests/conftest.py: ########## @@ -348,6 +354,11 @@ def table_schema_with_all_types() -> Schema: ) +@pytest.fixture(params=["abfss", "wasbs"]) Review Comment: ```suggestion @pytest.fixture(params=["abfs", "abfss", "wasb", "wasbs"]) ``` should we include the other 2 variants? -- This is an automated message from the Apache Git Service. To respond to the message, please log on to GitHub and use the URL above to go to the specific comment. To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For queries about this service, please contact Infrastructure at: us...@infra.apache.org --------------------------------------------------------------------- To unsubscribe, e-mail: issues-unsubscr...@iceberg.apache.org For additional commands, e-mail: issues-h...@iceberg.apache.org