[GitHub] [iceberg] Fokko commented on a diff in pull request #7921: Initial code for pyiceberg SqlCatalog

via GitHub Wed, 19 Jul 2023 07:27:58 -0700


Fokko commented on code in PR #7921:
URL: https://github.com/apache/iceberg/pull/7921#discussion_r1268107763



##########
python/pyiceberg/catalog/__init__.py:
##########
@@ -110,11 +111,22 @@ def load_dynamodb(name: str, conf: Properties) -> Catalog:
         raise NotInstalledError("AWS DynamoDB support not installed: pip 
install 'pyiceberg[dynamodb]'") from exc
 
 
+def load_sql(name: str, conf: Properties) -> Catalog:
+    try:
+        from pyiceberg.catalog.sql import SqlCatalog
+
+        return SqlCatalog(name, **conf)
+    except ImportError as exc:
+        # TODO: is that really the package to install? (pyiceberg[sqlalchemy])
+        raise NotInstalledError("SQLAlchemy support not installed: pip install 
'pyiceberg[sqlalchemy]'") from exc

Review Comment:
   ```suggestion
           raise NotInstalledError("SQLAlchemy support not installed: pip 
install 'pyiceberg[sql]'") from exc
   ```



##########
python/pyiceberg/catalog/sql.py:
##########
@@ -0,0 +1,448 @@
+from typing import (
+    List,
+    Optional,
+    Set,
+    Union,
+)
+
+from sqlalchemy import (
+    String,
+    create_engine,
+    delete,
+    insert,
+    select,
+    union,
+    update,
+)
+from sqlalchemy.exc import IntegrityError
+from sqlalchemy.orm import (
+    DeclarativeBase,
+    Mapped,
+    MappedAsDataclass,
+    Session,
+    mapped_column,
+)
+
+from pyiceberg.catalog import (
+    METADATA_LOCATION,
+    Catalog,
+    Identifier,
+    Properties,
+    PropertiesUpdateSummary,
+)
+from pyiceberg.exceptions import (
+    NamespaceAlreadyExistsError,
+    NamespaceNotEmptyError,
+    NoSuchNamespaceError,
+    NoSuchPropertyException,
+    NoSuchTableError,
+    TableAlreadyExistsError,
+)
+from pyiceberg.io import load_file_io
+from pyiceberg.partitioning import UNPARTITIONED_PARTITION_SPEC, PartitionSpec
+from pyiceberg.schema import Schema
+from pyiceberg.serializers import FromInputFile
+from pyiceberg.table import Table
+from pyiceberg.table.metadata import new_table_metadata
+from pyiceberg.table.sorting import UNSORTED_SORT_ORDER, SortOrder
+from pyiceberg.typedef import EMPTY_DICT
+
+
+class SqlCatalogBaseTable(MappedAsDataclass, DeclarativeBase):
+    pass
+
+
+class IcebergTables(SqlCatalogBaseTable):
+    __tablename__ = "iceberg_tables"
+
+    catalog_name: Mapped[str] = mapped_column(String(255), nullable=False, 
primary_key=True)
+    table_namespace: Mapped[str] = mapped_column(String(255), nullable=False, 
primary_key=True)
+    table_name: Mapped[str] = mapped_column(String(255), nullable=False, 
primary_key=True)
+    metadata_location: Mapped[str] = mapped_column(String(1000), nullable=True)
+    previous_metadata_location: Mapped[str] = mapped_column(String(1000), 
nullable=True)

Review Comment:
   ```suggestion
       previous_metadata_location: Mapped[Optional[str]] = 
mapped_column(String(1000), nullable=True)
   ```



##########
python/pyiceberg/catalog/sql.py:
##########
@@ -0,0 +1,441 @@
+from typing import (
+    List,
+    Optional,
+    Set,
+    Union,
+)
+
+from sqlalchemy import (
+    String,
+    case,
+    create_engine,
+    delete,
+    select,
+    union,
+    update,
+)
+from sqlalchemy.exc import IntegrityError
+from sqlalchemy.orm import (
+    DeclarativeBase,
+    Mapped,
+    MappedAsDataclass,
+    Session,
+    mapped_column,
+)
+
+from pyiceberg.catalog import (
+    METADATA_LOCATION,
+    Catalog,
+    Identifier,
+    Properties,
+    PropertiesUpdateSummary,
+)
+from pyiceberg.exceptions import (
+    NamespaceAlreadyExistsError,
+    NamespaceNotEmptyError,
+    NoSuchNamespaceError,
+    NoSuchPropertyException,
+    NoSuchTableError,
+    TableAlreadyExistsError,
+)
+from pyiceberg.io import load_file_io
+from pyiceberg.partitioning import UNPARTITIONED_PARTITION_SPEC, PartitionSpec
+from pyiceberg.schema import Schema
+from pyiceberg.serializers import FromInputFile
+from pyiceberg.table import Table
+from pyiceberg.table.metadata import new_table_metadata
+from pyiceberg.table.sorting import UNSORTED_SORT_ORDER, SortOrder
+from pyiceberg.typedef import EMPTY_DICT
+
+
+class SQLCatalogBase(MappedAsDataclass, DeclarativeBase):
+    pass
+
+
+class IcebergTables(SQLCatalogBase):
+    __tablename__ = "iceberg_tables"
+
+    catalog_name: Mapped[str] = mapped_column(String(255), nullable=False, 
primary_key=True)
+    table_namespace: Mapped[str] = mapped_column(String(255), nullable=False, 
primary_key=True)
+    table_name: Mapped[str] = mapped_column(String(255), nullable=False, 
primary_key=True)
+    metadata_location: Mapped[str] = mapped_column(String(1000), nullable=True)
+    previous_metadata_location: Mapped[str] = mapped_column(String(1000), 
nullable=True)
+
+
+class IcebergNamespaceProperties(SQLCatalogBase):
+    __tablename__ = "iceberg_namespace_properties"
+    # Catalog minimum Namespace Properties
+    NAMESPACE_MINIMAL_PROPERTIES = {"exists": "true"}
+
+    catalog_name: Mapped[str] = mapped_column(String(255), nullable=False, 
primary_key=True)
+    namespace: Mapped[str] = mapped_column(String(255), nullable=False, 
primary_key=True)
+    property_key: Mapped[str] = mapped_column(String(255), nullable=False, 
primary_key=True)
+    property_value: Mapped[str] = mapped_column(String(1000), nullable=True)

Review Comment:
   I think it makes more sense to start strict, we can always relax the 
constraint. 



##########
python/pyiceberg/catalog/sql.py:
##########
@@ -0,0 +1,448 @@
+from typing import (
+    List,
+    Optional,
+    Set,
+    Union,
+)
+
+from sqlalchemy import (
+    String,
+    create_engine,
+    delete,
+    insert,
+    select,
+    union,
+    update,
+)
+from sqlalchemy.exc import IntegrityError
+from sqlalchemy.orm import (
+    DeclarativeBase,
+    Mapped,
+    MappedAsDataclass,
+    Session,
+    mapped_column,
+)
+
+from pyiceberg.catalog import (
+    METADATA_LOCATION,
+    Catalog,
+    Identifier,
+    Properties,
+    PropertiesUpdateSummary,
+)
+from pyiceberg.exceptions import (
+    NamespaceAlreadyExistsError,
+    NamespaceNotEmptyError,
+    NoSuchNamespaceError,
+    NoSuchPropertyException,
+    NoSuchTableError,
+    TableAlreadyExistsError,
+)
+from pyiceberg.io import load_file_io
+from pyiceberg.partitioning import UNPARTITIONED_PARTITION_SPEC, PartitionSpec
+from pyiceberg.schema import Schema
+from pyiceberg.serializers import FromInputFile
+from pyiceberg.table import Table
+from pyiceberg.table.metadata import new_table_metadata
+from pyiceberg.table.sorting import UNSORTED_SORT_ORDER, SortOrder
+from pyiceberg.typedef import EMPTY_DICT
+
+
+class SqlCatalogBaseTable(MappedAsDataclass, DeclarativeBase):
+    pass
+
+
+class IcebergTables(SqlCatalogBaseTable):
+    __tablename__ = "iceberg_tables"
+
+    catalog_name: Mapped[str] = mapped_column(String(255), nullable=False, 
primary_key=True)
+    table_namespace: Mapped[str] = mapped_column(String(255), nullable=False, 
primary_key=True)
+    table_name: Mapped[str] = mapped_column(String(255), nullable=False, 
primary_key=True)
+    metadata_location: Mapped[str] = mapped_column(String(1000), nullable=True)

Review Comment:
   ```suggestion
       metadata_location: Mapped[Optional[str]] = mapped_column(String(1000), 
nullable=True)
   ```



##########
python/pyproject.toml:
##########
@@ -61,6 +61,7 @@ thrift = { version = ">=0.13.0,<1.0.0", optional = true }
 boto3 = { version = ">=1.17.106", optional = true }
 s3fs = { version = ">=2021.08.0,<2024.1.0", optional = true } # Upper bound 
set arbitrarily, to be reassessed in early 2024.
 adlfs = { version = ">=2021.07.0,<2024.1.0", optional = true } # Upper bound 
set arbitrarily, to be reassessed in early 2024.
+psycopg2-binary = { version = ">=2.9.6", optional = true }

Review Comment:
   What do you think of:
   
   ```
   pip3 install pyiceberg[sql-postgres]
   ```
   
   This will install sqlalchemy, together with the Postgres driver.



##########
python/pyiceberg/catalog/sql.py:
##########
@@ -0,0 +1,448 @@
+from typing import (
+    List,
+    Optional,
+    Set,
+    Union,
+)
+
+from sqlalchemy import (
+    String,
+    create_engine,
+    delete,
+    insert,
+    select,
+    union,
+    update,
+)
+from sqlalchemy.exc import IntegrityError
+from sqlalchemy.orm import (
+    DeclarativeBase,
+    Mapped,
+    MappedAsDataclass,
+    Session,
+    mapped_column,
+)
+
+from pyiceberg.catalog import (
+    METADATA_LOCATION,
+    Catalog,
+    Identifier,
+    Properties,
+    PropertiesUpdateSummary,
+)
+from pyiceberg.exceptions import (
+    NamespaceAlreadyExistsError,
+    NamespaceNotEmptyError,
+    NoSuchNamespaceError,
+    NoSuchPropertyException,
+    NoSuchTableError,
+    TableAlreadyExistsError,
+)
+from pyiceberg.io import load_file_io
+from pyiceberg.partitioning import UNPARTITIONED_PARTITION_SPEC, PartitionSpec
+from pyiceberg.schema import Schema
+from pyiceberg.serializers import FromInputFile
+from pyiceberg.table import Table
+from pyiceberg.table.metadata import new_table_metadata
+from pyiceberg.table.sorting import UNSORTED_SORT_ORDER, SortOrder
+from pyiceberg.typedef import EMPTY_DICT
+
+
+class SqlCatalogBaseTable(MappedAsDataclass, DeclarativeBase):
+    pass
+
+
+class IcebergTables(SqlCatalogBaseTable):
+    __tablename__ = "iceberg_tables"
+
+    catalog_name: Mapped[str] = mapped_column(String(255), nullable=False, 
primary_key=True)
+    table_namespace: Mapped[str] = mapped_column(String(255), nullable=False, 
primary_key=True)
+    table_name: Mapped[str] = mapped_column(String(255), nullable=False, 
primary_key=True)
+    metadata_location: Mapped[str] = mapped_column(String(1000), nullable=True)
+    previous_metadata_location: Mapped[str] = mapped_column(String(1000), 
nullable=True)
+
+
+class IcebergNamespaceProperties(SqlCatalogBaseTable):
+    __tablename__ = "iceberg_namespace_properties"
+    # Catalog minimum Namespace Properties
+    NAMESPACE_MINIMAL_PROPERTIES = {"exists": "true"}
+
+    catalog_name: Mapped[str] = mapped_column(String(255), nullable=False, 
primary_key=True)
+    namespace: Mapped[str] = mapped_column(String(255), nullable=False, 
primary_key=True)
+    property_key: Mapped[str] = mapped_column(String(255), nullable=False, 
primary_key=True)
+    property_value: Mapped[str] = mapped_column(String(1000), nullable=True)
+
+
+class SqlCatalog(Catalog):
+    def __init__(self, name: str, **properties: str):
+        super().__init__(name, **properties)
+
+        if not (uri_prop := self.properties.get("uri")):
+            raise NoSuchPropertyException("SQL connection URI is required")
+        self.engine = create_engine(uri_prop, echo=True)
+
+    def initialize_tables(self) -> None:
+        SqlCatalogBaseTable.metadata.create_all(self.engine)
+
+    def destroy_tables(self) -> None:
+        SqlCatalogBaseTable.metadata.drop_all(self.engine)
+
+    def _convert_orm_to_iceberg(self, orm_table: IcebergTables) -> Table:
+        # Check for expected properties.
+        if not (metadata_location := orm_table.metadata_location):
+            raise NoSuchTableError(f"Table property {METADATA_LOCATION} is 
missing")
+        if not (table_namespace := orm_table.table_namespace):
+            raise NoSuchTableError(f"Table property 
{IcebergTables.table_namespace} is missing")
+        if not (table_name := orm_table.table_name):
+            raise NoSuchTableError(f"Table property {IcebergTables.table_name} 
is missing")
+
+        io = load_file_io(properties=self.properties, 
location=metadata_location)
+        file = io.new_input(metadata_location)
+        metadata = FromInputFile.table_metadata(file)
+        return Table(
+            identifier=(self.name, table_namespace, table_name),
+            metadata=metadata,
+            metadata_location=metadata_location,
+            io=self._load_file_io(metadata.properties, metadata_location),
+        )
+
+    def create_table(
+        self,
+        identifier: Union[str, Identifier],
+        schema: Schema,
+        location: Optional[str] = None,
+        partition_spec: PartitionSpec = UNPARTITIONED_PARTITION_SPEC,
+        sort_order: SortOrder = UNSORTED_SORT_ORDER,
+        properties: Properties = EMPTY_DICT,
+    ) -> Table:
+        """
+        Create an Iceberg table.
+
+        Args:
+            identifier: Table identifier.
+            schema: Table's schema.
+            location: Location for the table. Optional Argument.
+            partition_spec: PartitionSpec for the table.
+            sort_order: SortOrder for the table.
+            properties: Table properties that can be a string based dictionary.
+
+        Returns:
+            Table: the created table instance.
+
+        Raises:
+            AlreadyExistsError: If a table with the name already exists.
+            ValueError: If the identifier is invalid, or no path is given to 
store metadata.
+
+        """
+        database_name, table_name = 
self.identifier_to_database_and_table(identifier)
+        if not self._namespace_exists(database_name):
+            raise NoSuchNamespaceError(f"Namespace does not exist: 
{database_name}")
+
+        location = self._resolve_table_location(location, database_name, 
table_name)
+        metadata_location = self._get_metadata_location(location=location)
+        metadata = new_table_metadata(
+            location=location, schema=schema, partition_spec=partition_spec, 
sort_order=sort_order, properties=properties
+        )
+        io = load_file_io(properties=self.properties, 
location=metadata_location)
+        self._write_metadata(metadata, io, metadata_location)
+
+        with Session(self.engine) as session:
+            try:
+                session.add(
+                    IcebergTables(
+                        catalog_name=self.name,
+                        table_namespace=database_name,
+                        table_name=table_name,
+                        metadata_location=metadata_location,
+                        previous_metadata_location=None,
+                    )
+                )
+                session.commit()
+            except IntegrityError as e:
+                raise TableAlreadyExistsError(f"Table 
{database_name}.{table_name} already exists") from e
+
+        return self.load_table(identifier=identifier)
+
+    def load_table(self, identifier: Union[str, Identifier]) -> Table:
+        """Loads the table's metadata and returns the table instance.
+
+        You can also use this method to check for table existence using 'try 
catalog.table() except NoSuchTableError'.
+        Note: This method doesn't scan data stored in the table.
+
+        Args:
+            identifier (str | Identifier): Table identifier.
+
+        Returns:
+            Table: the table instance with its metadata.
+
+        Raises:
+            NoSuchTableError: If a table with the name does not exist.
+        """
+        database_name, table_name = 
self.identifier_to_database_and_table(identifier, NoSuchTableError)
+        with Session(self.engine) as session:
+            stmt = select(IcebergTables).where(
+                IcebergTables.catalog_name == self.name,
+                IcebergTables.table_namespace == database_name,
+                IcebergTables.table_name == table_name,
+            )
+            result = session.scalar(stmt)
+        if result:
+            return self._convert_orm_to_iceberg(result)
+        raise NoSuchTableError(f"Table does not exist: 
{database_name}.{table_name}")
+
+    def drop_table(self, identifier: Union[str, Identifier]) -> None:
+        """Drop a table.
+
+        Args:
+            identifier (str | Identifier): Table identifier.
+
+        Raises:
+            NoSuchTableError: If a table with the name does not exist.
+        """
+        database_name, table_name = 
self.identifier_to_database_and_table(identifier, NoSuchTableError)
+        with Session(self.engine) as session:
+            res = session.execute(
+                delete(IcebergTables).where(
+                    IcebergTables.catalog_name == self.name,
+                    IcebergTables.table_namespace == database_name,
+                    IcebergTables.table_name == table_name,
+                )
+            )
+            session.commit()
+        if res.rowcount < 1:
+            raise NoSuchTableError(f"Table does not exist: 
{database_name}.{table_name}")
+
+    def rename_table(self, from_identifier: Union[str, Identifier], 
to_identifier: Union[str, Identifier]) -> Table:
+        """Rename a fully classified table name.
+
+        Args:
+            from_identifier (str | Identifier): Existing table identifier.
+            to_identifier (str | Identifier): New table identifier.
+
+        Returns:
+            Table: the updated table instance with its metadata.
+
+        Raises:
+            NoSuchTableError: If a table with the name does not exist.
+            TableAlreadyExistsError: If a table with the new name already 
exist.
+            NoSuchNamespaceError: If the target namespace does not exist.
+        """
+        from_database_name, from_table_name = 
self.identifier_to_database_and_table(from_identifier, NoSuchTableError)
+        to_database_name, to_table_name = 
self.identifier_to_database_and_table(to_identifier)
+        if not self._namespace_exists(to_database_name):
+            raise NoSuchNamespaceError(f"Namespace does not exist: 
{to_database_name}")
+        with Session(self.engine) as session:
+            try:
+                stmt = (
+                    update(IcebergTables)
+                    .where(
+                        IcebergTables.catalog_name == self.name,
+                        IcebergTables.table_namespace == from_database_name,
+                        IcebergTables.table_name == from_table_name,
+                    )
+                    .values(table_namespace=to_database_name, 
table_name=to_table_name)
+                )
+                result = session.execute(stmt)
+                if result.rowcount < 1:
+                    raise NoSuchTableError(f"Table does not exist: 
{from_table_name}")
+                session.commit()
+            except IntegrityError as e:
+                raise TableAlreadyExistsError(f"Table 
{to_database_name}.{to_table_name} already exists") from e
+        return self.load_table(to_identifier)
+
+    def _namespace_exists(self, identifier: Union[str, Identifier]) -> bool:
+        namespace = self.identifier_to_database(identifier)
+        with Session(self.engine) as session:
+            stmt = (
+                select(IcebergTables)
+                .where(IcebergTables.catalog_name == self.name, 
IcebergTables.table_namespace.like(namespace + "%"))

Review Comment:
   I don't think we want to have the `startsWith` here. 
   ```suggestion
                   .where(IcebergTables.catalog_name == self.name, 
IcebergTables.table_namespace == namespace)
   ```
   
   For example, this is used for the `create_namespace`. If you would create a 
namespace `production`, then you can't create a database `prod` anymore. 
Furthermore, the `%` doesn't play well with indexes.



##########
python/pyiceberg/catalog/sql.py:
##########
@@ -0,0 +1,448 @@
+from typing import (
+    List,
+    Optional,
+    Set,
+    Union,
+)
+
+from sqlalchemy import (
+    String,
+    create_engine,
+    delete,
+    insert,
+    select,
+    union,
+    update,
+)
+from sqlalchemy.exc import IntegrityError
+from sqlalchemy.orm import (
+    DeclarativeBase,
+    Mapped,
+    MappedAsDataclass,
+    Session,
+    mapped_column,
+)
+
+from pyiceberg.catalog import (
+    METADATA_LOCATION,
+    Catalog,
+    Identifier,
+    Properties,
+    PropertiesUpdateSummary,
+)
+from pyiceberg.exceptions import (
+    NamespaceAlreadyExistsError,
+    NamespaceNotEmptyError,
+    NoSuchNamespaceError,
+    NoSuchPropertyException,
+    NoSuchTableError,
+    TableAlreadyExistsError,
+)
+from pyiceberg.io import load_file_io
+from pyiceberg.partitioning import UNPARTITIONED_PARTITION_SPEC, PartitionSpec
+from pyiceberg.schema import Schema
+from pyiceberg.serializers import FromInputFile
+from pyiceberg.table import Table
+from pyiceberg.table.metadata import new_table_metadata
+from pyiceberg.table.sorting import UNSORTED_SORT_ORDER, SortOrder
+from pyiceberg.typedef import EMPTY_DICT
+
+
+class SqlCatalogBaseTable(MappedAsDataclass, DeclarativeBase):
+    pass
+
+
+class IcebergTables(SqlCatalogBaseTable):
+    __tablename__ = "iceberg_tables"
+
+    catalog_name: Mapped[str] = mapped_column(String(255), nullable=False, 
primary_key=True)
+    table_namespace: Mapped[str] = mapped_column(String(255), nullable=False, 
primary_key=True)
+    table_name: Mapped[str] = mapped_column(String(255), nullable=False, 
primary_key=True)
+    metadata_location: Mapped[str] = mapped_column(String(1000), nullable=True)
+    previous_metadata_location: Mapped[str] = mapped_column(String(1000), 
nullable=True)
+
+
+class IcebergNamespaceProperties(SqlCatalogBaseTable):
+    __tablename__ = "iceberg_namespace_properties"
+    # Catalog minimum Namespace Properties
+    NAMESPACE_MINIMAL_PROPERTIES = {"exists": "true"}
+
+    catalog_name: Mapped[str] = mapped_column(String(255), nullable=False, 
primary_key=True)
+    namespace: Mapped[str] = mapped_column(String(255), nullable=False, 
primary_key=True)
+    property_key: Mapped[str] = mapped_column(String(255), nullable=False, 
primary_key=True)
+    property_value: Mapped[str] = mapped_column(String(1000), nullable=True)
+
+
+class SqlCatalog(Catalog):
+    def __init__(self, name: str, **properties: str):
+        super().__init__(name, **properties)
+
+        if not (uri_prop := self.properties.get("uri")):
+            raise NoSuchPropertyException("SQL connection URI is required")
+        self.engine = create_engine(uri_prop, echo=True)
+
+    def initialize_tables(self) -> None:

Review Comment:
   Nit: I like to be in line here with the SQL terminology
   ```suggestion
       def create_tables(self) -> None:
   ```



-- 
This is an automated message from the Apache Git Service.
To respond to the message, please log on to GitHub and use the
URL above to go to the specific comment.

To unsubscribe, e-mail: [email protected]

For queries about this service, please contact Infrastructure at:
[email protected]


---------------------------------------------------------------------
To unsubscribe, e-mail: [email protected]
For additional commands, e-mail: [email protected]

[GitHub] [iceberg] Fokko commented on a diff in pull request #7921: Initial code for pyiceberg SqlCatalog

Reply via email to