This is an automated email from the ASF dual-hosted git repository. yiguolei pushed a commit to branch branch-2.1 in repository https://gitbox.apache.org/repos/asf/doris.git
The following commit(s) were added to refs/heads/branch-2.1 by this push: new fa45f3b95a0 [feature](paimon/iceberg)add a docker that can directly pull up all the relevant environments of paimon/iceberg/doris for 2.1 (#38009) (#38588) fa45f3b95a0 is described below commit fa45f3b95a0cca342a40d0766e63e19ff38a3bf9 Author: wuwenchi <wuwenchi...@hotmail.com> AuthorDate: Wed Jul 31 23:05:51 2024 +0800 [feature](paimon/iceberg)add a docker that can directly pull up all the relevant environments of paimon/iceberg/doris for 2.1 (#38009) (#38588) bp: #38009 --- samples/datalake/iceberg_and_paimon/README.md | 279 ++++++++++++++++++ .../data/flink-conf/flink-conf.yaml | 312 +++++++++++++++++++++ .../data/flink-conf/log4j-cli.properties | 67 +++++ .../data/flink-conf/log4j-console.properties | 70 +++++ .../data/flink-conf/log4j-session.properties | 42 +++ .../data/flink-conf/log4j.properties | 61 ++++ .../data/flink-conf/logback-console.xml | 67 +++++ .../data/flink-conf/logback-session.xml | 39 +++ .../iceberg_and_paimon/data/flink-conf/logback.xml | 58 ++++ .../iceberg_and_paimon/data/flink-conf/masters | 18 ++ .../iceberg_and_paimon/data/flink-conf/workers | 18 ++ .../iceberg_and_paimon/data/flink-conf/zoo.cfg | 36 +++ .../data/spark-conf/fairscheduler.xml.template | 31 ++ .../data/spark-conf/log4j2.properties.template | 69 +++++ .../data/spark-conf/metrics.properties.template | 210 ++++++++++++++ .../data/spark-conf/spark-defaults.conf | 43 +++ .../data/spark-conf/spark-defaults.conf.template | 27 ++ .../data/spark-conf/spark-env.sh.template | 81 ++++++ .../data/spark-conf/workers.template | 19 ++ .../data/table/customer/000000_0 | Bin 0 -> 740686 bytes .../data/table/customer/000001_0 | Bin 0 -> 739823 bytes .../data/table/customer/000002_0 | Bin 0 -> 740700 bytes .../data/table/customer/000003_0 | Bin 0 -> 739592 bytes .../datalake/iceberg_and_paimon/docker-compose.env | 22 ++ .../datalake/iceberg_and_paimon/docker-compose.yml | 173 ++++++++++++ .../iceberg_and_paimon/scripts/start_doris.sh | 60 ++++ .../datalake/iceberg_and_paimon/sql/init_doris.sql | 21 ++ .../iceberg_and_paimon/sql/init_tables.sql | 53 ++++ .../iceberg_and_paimon/sql/prepare_data.sql | 8 + samples/datalake/iceberg_and_paimon/start_all.sh | 121 ++++++++ .../iceberg_and_paimon/start_doris_client.sh | 20 ++ .../iceberg_and_paimon/start_flink_client.sh | 19 ++ .../start_spark_iceberg_client.sh | 19 ++ .../start_spark_paimon_client.sh | 19 ++ samples/datalake/iceberg_and_paimon/stop_all.sh | 19 ++ 35 files changed, 2101 insertions(+) diff --git a/samples/datalake/iceberg_and_paimon/README.md b/samples/datalake/iceberg_and_paimon/README.md new file mode 100644 index 00000000000..0d3a80775f1 --- /dev/null +++ b/samples/datalake/iceberg_and_paimon/README.md @@ -0,0 +1,279 @@ +# doris-iceberg-paimon-compose + + +## Launch Docker Compose +First, we need to set the required kernel parameter on the host machine. + +``` +sysctl -w vm.max_map_count=2000000 +``` + +We can then start all the required containers via the script. + +``` +bash start_all.sh +``` +It will start a set of Docker containers; the environment includes: +- doris +- iceberg +- paimon +- flink +- spark + +It will also automatically create an Iceberg table and a Paimon table, which we can use directly to try out Doris. + + +## Paimon table test + +Enter the Flink client. + +``` +bash start_flink_client.sh +``` + +A table has already been created there. + +```sql + +Flink SQL> use paimon.db_paimon; +[INFO] Execute statement succeed.
+ +Flink SQL> show tables; ++------------+ +| table name | ++------------+ +| customer | ++------------+ +1 row in set + +Flink SQL> show create table customer; ++------------------------------------------------------------------------+ +| result | ++------------------------------------------------------------------------+ +| CREATE TABLE `paimon`.`db_paimon`.`customer` ( + `c_custkey` INT NOT NULL, + `c_name` VARCHAR(25), + `c_address` VARCHAR(40), + `c_nationkey` INT NOT NULL, + `c_phone` CHAR(15), + `c_acctbal` DECIMAL(12, 2), + `c_mktsegment` CHAR(10), + `c_comment` VARCHAR(117), + CONSTRAINT `PK_c_custkey_c_nationkey` PRIMARY KEY (`c_custkey`, `c_nationkey`) NOT ENFORCED +) PARTITIONED BY (`c_nationkey`) +WITH ( + 'bucket' = '1', + 'path' = 's3://warehouse/wh/db_paimon.db/customer', + 'deletion-vectors.enabled' = 'true' +) + | ++-------------------------------------------------------------------------+ +1 row in set + +Flink SQL> desc customer; ++--------------+----------------+-------+-----------------------------+--------+-----------+ +| name | type | null | key | extras | watermark | ++--------------+----------------+-------+-----------------------------+--------+-----------+ +| c_custkey | INT | FALSE | PRI(c_custkey, c_nationkey) | | | +| c_name | VARCHAR(25) | TRUE | | | | +| c_address | VARCHAR(40) | TRUE | | | | +| c_nationkey | INT | FALSE | PRI(c_custkey, c_nationkey) | | | +| c_phone | CHAR(15) | TRUE | | | | +| c_acctbal | DECIMAL(12, 2) | TRUE | | | | +| c_mktsegment | CHAR(10) | TRUE | | | | +| c_comment | VARCHAR(117) | TRUE | | | | ++--------------+----------------+-------+-----------------------------+--------+-----------+ +8 rows in set +``` + +This table already has some data. + +```sql +Flink SQL> select * from customer order by c_custkey limit 4; ++-----------+--------------------+--------------------------------+-------------+-----------------+-----------+--------------+--------------------------------+ +| c_custkey | c_name | c_address | c_nationkey | c_phone | c_acctbal | c_mktsegment | c_comment | ++-----------+--------------------+--------------------------------+-------------+-----------------+-----------+--------------+--------------------------------+ +| 1 | Customer#000000001 | IVhzIApeRb ot,c,E | 15 | 25-989-741-2988 | 711.56 | BUILDING | to the even, regular platel... | +| 2 | Customer#000000002 | XSTf4,NCwDVaWNe6tEgvwfmRchLXak | 13 | 23-768-687-3665 | 121.65 | AUTOMOBILE | l accounts. blithely ironic... | +| 3 | Customer#000000003 | MG9kdTD2WBHm | 1 | 11-719-748-3364 | 7498.12 | AUTOMOBILE | deposits eat slyly ironic,... | +| 32 | Customer#000000032 | jD2xZzi UmId,DCtNBLXKj9q0Tl... | 15 | 25-430-914-2194 | 3471.53 | BUILDING | cial ideas. final, furious ... | ++-----------+--------------------+--------------------------------+-------------+-----------------+-----------+--------------+--------------------------------+ +4 rows in set + +Flink SQL> select * from customer order by c_custkey desc limit 4; ++-----------+--------------------+--------------------------------+-------------+-----------------+-----------+--------------+--------------------------------+ +| c_custkey | c_name | c_address | c_nationkey | c_phone | c_acctbal | c_mktsegment | c_comment | ++-----------+--------------------+--------------------------------+-------------+-----------------+-----------+--------------+--------------------------------+ +| 149987 | Customer#000149987 | P6z8nSIgW55cSydfa1bZ | 8 | 18-187-349-6326 | 5338.96 | HOUSEHOLD | aggle carefully quickly reg... 
| +| 149986 | Customer#000149986 | HyEJpj2jvEqt,,pA50NOvuTP | 7 | 17-654-752-5642 | 1251.17 | BUILDING | enticingly carefully carefu... | +| 149985 | Customer#000149985 | y4m,kcxXX6ZtGTJGxavBTJf52OM... | 22 | 32-595-455-4078 | 6012.98 | MACHINERY | kages affix against the bli... | +| 149984 | Customer#000149984 | ZBEyUfjRsVtUNSIv9dnnyoPYeQw... | 12 | 22-283-613-7016 | 6567.62 | HOUSEHOLD | ges integrate along the bli... | ++-----------+--------------------+--------------------------------+-------------+-----------------+-----------+--------------+--------------------------------+ +4 rows in set +``` + +Now we can query this table through Doris. + +``` +bash start_doris_client.sh +``` + +After entering the Doris client, you will find that a Paimon catalog has already been created, so the data in the Paimon table can be queried directly.
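The catalog itself is created automatically when the environment starts (presumably via `sql/init_doris.sql`, which ships with this demo). As a rough, illustrative sketch, a Paimon catalog backed by an S3-compatible warehouse is declared in Doris roughly as follows; the endpoint and credentials below are placeholders, not necessarily the values used by the containers:

```sql
-- Illustrative sketch only: the statement actually used by the demo lives in sql/init_doris.sql.
-- The warehouse path matches the table location shown above; endpoint/credentials are placeholders.
CREATE CATALOG `paimon` PROPERTIES (
    "type" = "paimon",
    "warehouse" = "s3://warehouse/wh",
    "s3.endpoint" = "http://minio:9000",
    "s3.access_key" = "admin",
    "s3.secret_key" = "password"
);
```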
+ +```sql +mysql> use paimon.db_paimon; +Reading table information for completion of table and column names +You can turn off this feature to get a quicker startup with -A + +Database changed +mysql> show tables; ++---------------------+ +| Tables_in_db_paimon | ++---------------------+ +| customer | ++---------------------+ +1 row in set (0.00 sec) + +mysql> select * from customer order by c_custkey limit 4; ++-----------+--------------------+---------------------------------------+-------------+-----------------+-----------+--------------+--------------------------------------------------------------------------------------------------------+ +| c_custkey | c_name | c_address | c_nationkey | c_phone | c_acctbal | c_mktsegment | c_comment | ++-----------+--------------------+---------------------------------------+-------------+-----------------+-----------+--------------+--------------------------------------------------------------------------------------------------------+ +| 1 | Customer#000000001 | IVhzIApeRb ot,c,E | 15 | 25-989-741-2988 | 711.56 | BUILDING | to the even, regular platelets. regular, ironic epitaphs nag e | +| 2 | Customer#000000002 | XSTf4,NCwDVaWNe6tEgvwfmRchLXak | 13 | 23-768-687-3665 | 121.65 | AUTOMOBILE | l accounts. blithely ironic theodolites integrate boldly: caref | +| 3 | Customer#000000003 | MG9kdTD2WBHm | 1 | 11-719-748-3364 | 7498.12 | AUTOMOBILE | deposits eat slyly ironic, even instructions. express foxes detect slyly. blithely even accounts abov | +| 32 | Customer#000000032 | jD2xZzi UmId,DCtNBLXKj9q0Tlp2iQ6ZcO3J | 15 | 25-430-914-2194 | 3471.53 | BUILDING | cial ideas. final, furious requests across the e | ++-----------+--------------------+---------------------------------------+-------------+-----------------+-----------+--------------+--------------------------------------------------------------------------------------------------------+ +4 rows in set (1.89 sec) + +mysql> select * from customer order by c_custkey desc limit 4; ++-----------+--------------------+---------------------------------------+-------------+-----------------+-----------+--------------+-------------------------------------------------------------------------------------------------------+ +| c_custkey | c_name | c_address | c_nationkey | c_phone | c_acctbal | c_mktsegment | c_comment | ++-----------+--------------------+---------------------------------------+-------------+-----------------+-----------+--------------+-------------------------------------------------------------------------------------------------------+ +| 149987 | Customer#000149987 | P6z8nSIgW55cSydfa1bZ | 8 | 18-187-349-6326 | 5338.96 | HOUSEHOLD | aggle carefully quickly regular ideas-- ironic, bold packages are. regular tithes cajole regular requ | +| 149986 | Customer#000149986 | HyEJpj2jvEqt,,pA50NOvuTP | 7 | 17-654-752-5642 | 1251.17 | BUILDING | enticingly carefully careful courts. furiously | +| 149985 | Customer#000149985 | y4m,kcxXX6ZtGTJGxavBTJf52OMqBK9z | 22 | 32-595-455-4078 | 6012.98 | MACHINERY | kages affix against the blithely pending foxes. slyly final packages boost | +| 149984 | Customer#000149984 | ZBEyUfjRsVtUNSIv9dnnyoPYeQwi7czgCeeeM | 12 | 22-283-613-7016 | 6567.62 | HOUSEHOLD | ges integrate along the blithely unusual | ++-----------+--------------------+---------------------------------------+-------------+-----------------+-----------+--------------+-------------------------------------------------------------------------------------------------------+ +4 rows in set (0.35 sec) +``` + +Doris can perform partition pruning on Paimon tables and speed up queries through native reading. We can verify this with `explain verbose`. + +```sql +mysql> explain verbose select * from customer where c_nationkey < 3; ++------------------------------------------------------------------------------------------------------------------------------------------------+ +| Explain String(Nereids Planner) | ++------------------------------------------------------------------------------------------------------------------------------------------------+ +| ............... |
+| 0:VPAIMON_SCAN_NODE(68) | +| table: customer | +| predicates: (c_nationkey[#3] < 3) | +| inputSplitNum=3, totalFileSize=193823, scanRanges=3 | +| partition=3/0 | +| backends: | +| 10002 | +| s3://warehouse/wh/db_paimon.db/customer/c_nationkey=1/bucket-0/data-15cee5b7-1bd7-42ca-9314-56d92c62c03b-0.orc start: 0 length: 66600 | +| s3://warehouse/wh/db_paimon.db/customer/c_nationkey=2/bucket-0/data-e98fb7ef-ec2b-4ad5-a496-713cb9481d56-0.orc start: 0 length: 64059 | +| s3://warehouse/wh/db_paimon.db/customer/c_nationkey=0/bucket-0/data-431be05d-50fa-401f-9680-d646757d0f95-0.orc start: 0 length: 63164 | +| cardinality=18751, numNodes=1 | +| pushdown agg=NONE | +| paimonNativeReadSplits=3/3 | +| PaimonSplitStats: | +| SplitStat [type=NATIVE, rowCount=771, rawFileConvertable=true, hasDeletionVector=false] | +| SplitStat [type=NATIVE, rowCount=750, rawFileConvertable=true, hasDeletionVector=false] | +| SplitStat [type=NATIVE, rowCount=750, rawFileConvertable=true, hasDeletionVector=false] | +| tuple ids: 0 | +| | +| Tuples: | +| TupleDescriptor{id=0, tbl=customer} | +| SlotDescriptor{id=0, col=c_custkey, colUniqueId=0, type=INT, nullable=true, isAutoIncrement=false, subColPath=null} | +| SlotDescriptor{id=1, col=c_name, colUniqueId=1, type=TEXT, nullable=true, isAutoIncrement=false, subColPath=null} | +| SlotDescriptor{id=2, col=c_address, colUniqueId=2, type=TEXT, nullable=true, isAutoIncrement=false, subColPath=null} | +| SlotDescriptor{id=3, col=c_nationkey, colUniqueId=3, type=INT, nullable=true, isAutoIncrement=false, subColPath=null} | +| SlotDescriptor{id=4, col=c_phone, colUniqueId=4, type=TEXT, nullable=true, isAutoIncrement=false, subColPath=null} | +| SlotDescriptor{id=5, col=c_acctbal, colUniqueId=5, type=DECIMALV3(12, 2), nullable=true, isAutoIncrement=false, subColPath=null} | +| SlotDescriptor{id=6, col=c_mktsegment, colUniqueId=6, type=TEXT, nullable=true, isAutoIncrement=false, subColPath=null} | +| SlotDescriptor{id=7, col=c_comment, colUniqueId=7, type=TEXT, nullable=true, isAutoIncrement=false, subColPath=null} | +| | +| | +| | +| Statistics | +| planed with unknown column statistics | ++------------------------------------------------------------------------------------------------------------------------------------------------+ +66 rows in set (0.17 sec) +``` + +The query plan shows that Doris reads only the three partition files whose c_nationkey is less than 3, and that the files are read natively. +In addition, Doris supports Paimon's `deletion vectors`. +First, we modify some data through Flink. + +```sql +Flink SQL> update customer set c_address='c_address_update' where c_nationkey = 1; +[INFO] Submitting SQL update statement to the cluster... +[INFO] SQL update statement has been successfully submitted to the cluster: +Job ID: ff838b7b778a94396b332b0d93c8f7ac + +``` + +After the update job completes, we can view the modified data through Doris. + +```sql +mysql> explain verbose select * from customer where c_nationkey < 3; ++------------------------------------------------------------------------------------------------------------------------------------------------+ +| Explain String(Nereids Planner) | ++------------------------------------------------------------------------------------------------------------------------------------------------+ +| ............... | +| | +| 0:VPAIMON_SCAN_NODE(68) | +| table: customer | +| predicates: (c_nationkey[#3] < 3) | +| inputSplitNum=4, totalFileSize=238324, scanRanges=4 | +| partition=3/0 | +| backends: | +| 10002 | +| s3://warehouse/wh/db_paimon.db/customer/c_nationkey=1/bucket-0/data-15cee5b7-1bd7-42ca-9314-56d92c62c03b-0.orc start: 0 length: 66600 | +| s3://warehouse/wh/db_paimon.db/customer/c_nationkey=1/bucket-0/data-5d50255a-2215-4010-b976-d5dc656f3444-0.orc start: 0 length: 44501 | +| s3://warehouse/wh/db_paimon.db/customer/c_nationkey=2/bucket-0/data-e98fb7ef-ec2b-4ad5-a496-713cb9481d56-0.orc start: 0 length: 64059 | +| s3://warehouse/wh/db_paimon.db/customer/c_nationkey=0/bucket-0/data-431be05d-50fa-401f-9680-d646757d0f95-0.orc start: 0 length: 63164 | +| cardinality=18751, numNodes=1 | +| pushdown agg=NONE | +| paimonNativeReadSplits=4/4 | +| PaimonSplitStats: | +| SplitStat [type=NATIVE, rowCount=1542, rawFileConvertable=true, hasDeletionVector=true] | +| SplitStat [type=NATIVE, rowCount=750, rawFileConvertable=true, hasDeletionVector=false] | +| SplitStat [type=NATIVE, rowCount=750, rawFileConvertable=true, hasDeletionVector=false] | +| tuple ids: 0 | +| | +| Tuples: | +| TupleDescriptor{id=0, tbl=customer} | +| SlotDescriptor{id=0, col=c_custkey, colUniqueId=0, type=INT, nullable=true, isAutoIncrement=false, subColPath=null} | +| SlotDescriptor{id=1, col=c_name, colUniqueId=1, type=TEXT, nullable=true, isAutoIncrement=false, subColPath=null} | +| SlotDescriptor{id=2, col=c_address, colUniqueId=2, type=TEXT, nullable=true, isAutoIncrement=false, subColPath=null} | +| SlotDescriptor{id=3, col=c_nationkey, colUniqueId=3, type=INT, nullable=true, isAutoIncrement=false, subColPath=null} | +| SlotDescriptor{id=4, col=c_phone, colUniqueId=4, type=TEXT, nullable=true, isAutoIncrement=false, subColPath=null} | +| SlotDescriptor{id=5, col=c_acctbal, colUniqueId=5, type=DECIMALV3(12, 2), nullable=true, isAutoIncrement=false, subColPath=null} | +| SlotDescriptor{id=6, col=c_mktsegment, colUniqueId=6, type=TEXT, nullable=true, isAutoIncrement=false, subColPath=null} | +| SlotDescriptor{id=7, col=c_comment, colUniqueId=7, type=TEXT, nullable=true, isAutoIncrement=false, subColPath=null} | +| | +| | +| | +| Statistics | +| planed with unknown column statistics | ++------------------------------------------------------------------------------------------------------------------------------------------------+ +67 rows in set (0.23 sec) +``` + +From the plan, we can see that Doris reads 4 file splits using the native method; they correspond to 3 Paimon splits, one of which carries a deletion vector. +Finally, we can read the modified data through Doris. + +```sql +mysql> select * from customer where c_nationkey=1 limit 2; ++-----------+--------------------+------------------+-------------+-----------------+-----------+--------------+--------------------------------------------------------------------------------------------------------+ +| c_custkey | c_name | c_address | c_nationkey | c_phone | c_acctbal | c_mktsegment | c_comment | ++-----------+--------------------+------------------+-------------+-----------------+-----------+--------------+--------------------------------------------------------------------------------------------------------+ +| 3 | Customer#000000003 | c_address_update | 1 | 11-719-748-3364 | 7498.12 | AUTOMOBILE | deposits eat slyly ironic, even instructions. express foxes detect slyly. 
blithely even accounts abov | +| 513 | Customer#000000513 | c_address_update | 1 | 11-861-303-6887 | 955.37 | HOUSEHOLD | press along the quickly regular instructions. regular requests against the carefully ironic s | ++-----------+--------------------+-----------------+-------------+-----------------+-----------+--------------+--------------------------------------------------------------------------------------------------------+ +2 rows in set (0.19 sec) + +``` diff --git a/samples/datalake/iceberg_and_paimon/data/flink-conf/flink-conf.yaml b/samples/datalake/iceberg_and_paimon/data/flink-conf/flink-conf.yaml new file mode 100644 index 00000000000..c1d3b55bae3 --- /dev/null +++ b/samples/datalake/iceberg_and_paimon/data/flink-conf/flink-conf.yaml @@ -0,0 +1,312 @@ +################################################################################ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +# These parameters are required for Java 17 support. +# They can be safely removed when using Java 8/11. +env.java.opts.all: --add-exports=java.base/sun.net.util=ALL-UNNAMED --add-exports=java.rmi/sun.rmi.registry=ALL-UNNAMED --add-exports=jdk.compiler/com.sun.tools.javac.api=ALL-UNNAMED --add-exports=jdk.compiler/com.sun.tools.javac.file=ALL-UNNAMED --add-exports=jdk.compiler/com.sun.tools.javac.parser=ALL-UNNAMED --add-exports=jdk.compiler/com.sun.tools.javac.tree=ALL-UNNAMED --add-exports=jdk.compiler/com.sun.tools.javac.util=ALL-UNNAMED --add-exports=java.security.jgss/sun.security.krb5= [...] + +#============================================================================== +# Common +#============================================================================== + +# The external address of the host on which the JobManager runs and can be +# reached by the TaskManagers and any clients which want to connect. This setting +# is only used in Standalone mode and may be overwritten on the JobManager side +# by specifying the --host <hostname> parameter of the bin/jobmanager.sh executable. +# In high availability mode, if you use the bin/start-cluster.sh script and setup +# the conf/masters file, this will be taken care of automatically. Yarn +# automatically configure the host name based on the hostname of the node where the +# JobManager runs. + +jobmanager.rpc.address: jobmanager + +# The RPC port where the JobManager is reachable. + +jobmanager.rpc.port: 6123 + +# The host interface the JobManager will bind to. By default, this is localhost, and will prevent +# the JobManager from communicating outside the machine/container it is running on. +# On YARN this setting will be ignored if it is set to 'localhost', defaulting to 0.0.0.0. 
+# On Kubernetes this setting will be ignored, defaulting to 0.0.0.0. +# +# To enable this, set the bind-host address to one that has access to an outside facing network +# interface, such as 0.0.0.0. + +jobmanager.bind-host: 0.0.0.0 + + +# The total process memory size for the JobManager. +# +# Note this accounts for all memory usage within the JobManager process, including JVM metaspace and other overhead. + +jobmanager.memory.process.size: 1600m + +# The host interface the TaskManager will bind to. By default, this is localhost, and will prevent +# the TaskManager from communicating outside the machine/container it is running on. +# On YARN this setting will be ignored if it is set to 'localhost', defaulting to 0.0.0.0. +# On Kubernetes this setting will be ignored, defaulting to 0.0.0.0. +# +# To enable this, set the bind-host address to one that has access to an outside facing network +# interface, such as 0.0.0.0. + +taskmanager.bind-host: 0.0.0.0 + +# The address of the host on which the TaskManager runs and can be reached by the JobManager and +# other TaskManagers. If not specified, the TaskManager will try different strategies to identify +# the address. +# +# Note this address needs to be reachable by the JobManager and forward traffic to one of +# the interfaces the TaskManager is bound to (see 'taskmanager.bind-host'). +# +# Note also that unless all TaskManagers are running on the same machine, this address needs to be +# configured separately for each TaskManager. + + +# The total process memory size for the TaskManager. +# +# Note this accounts for all memory usage within the TaskManager process, including JVM metaspace and other overhead. + +taskmanager.memory.process.size: 1728m + +# To exclude JVM metaspace and overhead, please, use total Flink memory size instead of 'taskmanager.memory.process.size'. +# It is not recommended to set both 'taskmanager.memory.process.size' and Flink memory. +# +# taskmanager.memory.flink.size: 1280m + +# The number of task slots that each TaskManager offers. Each slot runs one parallel pipeline. + +taskmanager.numberOfTaskSlots: 1 + +# The parallelism used for programs that did not specify and other parallelism. + +parallelism.default: 1 + +# The default file system scheme and authority. +# +# By default file paths without scheme are interpreted relative to the local +# root file system 'file:///'. Use this to override the default and interpret +# relative paths relative to a different file system, +# for example 'hdfs://mynamenode:12345' +# +# fs.default-scheme + +#============================================================================== +# High Availability +#============================================================================== + +# The high-availability mode. Possible options are 'NONE' or 'zookeeper'. +# +# high-availability.type: zookeeper + +# The path where metadata for master recovery is persisted. While ZooKeeper stores +# the small ground truth for checkpoint and leader election, this location stores +# the larger objects, like persisted dataflow graphs. +# +# Must be a durable file system that is accessible from all nodes +# (like HDFS, S3, Ceph, nfs, ...) +# +# high-availability.storageDir: hdfs:///flink/ha/ + +# The list of ZooKeeper quorum peers that coordinate the high-availability +# setup. This must be a list of the form: +# "host1:clientPort,host2:clientPort,..." 
(default clientPort: 2181) +# +# high-availability.zookeeper.quorum: localhost:2181 + + +# ACL options are based on https://zookeeper.apache.org/doc/r3.1.2/zookeeperProgrammers.html#sc_BuiltinACLSchemes +# It can be either "creator" (ZOO_CREATE_ALL_ACL) or "open" (ZOO_OPEN_ACL_UNSAFE) +# The default value is "open" and it can be changed to "creator" if ZK security is enabled +# +# high-availability.zookeeper.client.acl: open + +#============================================================================== +# Fault tolerance and checkpointing +#============================================================================== + +# The backend that will be used to store operator state checkpoints if +# checkpointing is enabled. Checkpointing is enabled when execution.checkpointing.interval > 0. +# +# Execution checkpointing related parameters. Please refer to CheckpointConfig and ExecutionCheckpointingOptions for more details. +# +# execution.checkpointing.interval: 3min +# execution.checkpointing.externalized-checkpoint-retention: [DELETE_ON_CANCELLATION, RETAIN_ON_CANCELLATION] +# execution.checkpointing.max-concurrent-checkpoints: 1 +# execution.checkpointing.min-pause: 0 +# execution.checkpointing.mode: [EXACTLY_ONCE, AT_LEAST_ONCE] +# execution.checkpointing.timeout: 10min +# execution.checkpointing.tolerable-failed-checkpoints: 0 +# execution.checkpointing.unaligned: false +# +# Supported backends are 'hashmap', 'rocksdb', or the +# <class-name-of-factory>. +# +# state.backend.type: hashmap + +# Directory for checkpoints filesystem, when using any of the default bundled +# state backends. +# +# state.checkpoints.dir: hdfs://namenode-host:port/flink-checkpoints + +# Default target directory for savepoints, optional. +# +# state.savepoints.dir: hdfs://namenode-host:port/flink-savepoints + +# Flag to enable/disable incremental checkpoints for backends that +# support incremental checkpoints (like the RocksDB state backend). +# +# state.backend.incremental: false + +# The failover strategy, i.e., how the job computation recovers from task failures. +# Only restart tasks that may have been affected by the task failure, which typically includes +# downstream tasks and potentially upstream tasks if their produced data is no longer available for consumption. + +jobmanager.execution.failover-strategy: region + +#============================================================================== +# Rest & web frontend +#============================================================================== + +# The port to which the REST client connects to. If rest.bind-port has +# not been specified, then the server will bind to this port as well. +# +#rest.port: 8081 + +# The address to which the REST client will connect to +# +rest.address: 0.0.0.0 + +# Port range for the REST and web server to bind to. +# +#rest.bind-port: 8080-8090 + +# The address that the REST & web server binds to +# By default, this is localhost, which prevents the REST & web server from +# being able to communicate outside of the machine/container it is running on. +# +# To enable this, set the bind address to one that has access to outside-facing +# network interface, such as 0.0.0.0. +# +rest.bind-address: 0.0.0.0 + +# Flag to specify whether job submission is enabled from the web-based +# runtime monitor. Uncomment to disable. + +#web.submit.enable: false + +# Flag to specify whether job cancellation is enabled from the web-based +# runtime monitor. Uncomment to disable. 
+ +#web.cancel.enable: false + +#============================================================================== +# Advanced +#============================================================================== + +# Override the directories for temporary files. If not specified, the +# system-specific Java temporary directory (java.io.tmpdir property) is taken. +# +# For framework setups on Yarn, Flink will automatically pick up the +# containers' temp directories without any need for configuration. +# +# Add a delimited list for multiple directories, using the system directory +# delimiter (colon ':' on unix) or a comma, e.g.: +# /data1/tmp:/data2/tmp:/data3/tmp +# +# Note: Each directory entry is read from and written to by a different I/O +# thread. You can include the same directory multiple times in order to create +# multiple I/O threads against that directory. This is for example relevant for +# high-throughput RAIDs. +# +# io.tmp.dirs: /tmp + +# The classloading resolve order. Possible values are 'child-first' (Flink's default) +# and 'parent-first' (Java's default). +# +# Child first classloading allows users to use different dependency/library +# versions in their application than those in the classpath. Switching back +# to 'parent-first' may help with debugging dependency issues. +# +# classloader.resolve-order: child-first + +# The amount of memory going to the network stack. These numbers usually need +# no tuning. Adjusting them may be necessary in case of an "Insufficient number +# of network buffers" error. The default min is 64MB, the default max is 1GB. +# +# taskmanager.memory.network.fraction: 0.1 +# taskmanager.memory.network.min: 64mb +# taskmanager.memory.network.max: 1gb + +#============================================================================== +# Flink Cluster Security Configuration +#============================================================================== + +# Kerberos authentication for various components - Hadoop, ZooKeeper, and connectors - +# may be enabled in four steps: +# 1. configure the local krb5.conf file +# 2. provide Kerberos credentials (either a keytab or a ticket cache w/ kinit) +# 3. make the credentials available to various JAAS login contexts +# 4. configure the connector to use JAAS/SASL + +# The below configure how Kerberos credentials are provided. A keytab will be used instead of +# a ticket cache if the keytab path and principal are set. 
+ +# security.kerberos.login.use-ticket-cache: true +# security.kerberos.login.keytab: /path/to/kerberos/keytab +# security.kerberos.login.principal: flink-user + +# The configuration below defines which JAAS login contexts + +# security.kerberos.login.contexts: Client,KafkaClient + +#============================================================================== +# ZK Security Configuration +#============================================================================== + +# Below configurations are applicable if ZK ensemble is configured for security + +# Override below configuration to provide custom ZK service name if configured +# zookeeper.sasl.service-name: zookeeper + +# The configuration below must match one of the values set in "security.kerberos.login.contexts" +# zookeeper.sasl.login-context-name: Client + +#============================================================================== +# HistoryServer +#============================================================================== + +# The HistoryServer is started and stopped via bin/historyserver.sh (start|stop) + +# Directory to upload completed jobs to. Add this directory to the list of +# monitored directories of the HistoryServer as well (see below). +#jobmanager.archive.fs.dir: hdfs:///completed-jobs/ + +# The address under which the web-based HistoryServer listens. +#historyserver.web.address: 0.0.0.0 + +# The port under which the web-based HistoryServer listens. +#historyserver.web.port: 8082 + +# Comma separated list of directories to monitor for completed jobs. +#historyserver.archive.fs.dir: hdfs:///completed-jobs/ + +# Interval in milliseconds for refreshing the monitored directories. +#historyserver.archive.fs.refresh-interval: 10000 + +blob.server.port: 6124 +query.server.port: 6125 diff --git a/samples/datalake/iceberg_and_paimon/data/flink-conf/log4j-cli.properties b/samples/datalake/iceberg_and_paimon/data/flink-conf/log4j-cli.properties new file mode 100644 index 00000000000..80af4c72643 --- /dev/null +++ b/samples/datalake/iceberg_and_paimon/data/flink-conf/log4j-cli.properties @@ -0,0 +1,67 @@ +################################################################################ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +# Allows this configuration to be modified at runtime. The file will be checked every 30 seconds. 
+monitorInterval=30 + +rootLogger.level = INFO +rootLogger.appenderRef.file.ref = FileAppender + +# Log all infos in the given file +appender.file.name = FileAppender +appender.file.type = FILE +appender.file.append = false +appender.file.fileName = ${sys:log.file} +appender.file.layout.type = PatternLayout +appender.file.layout.pattern = %d{yyyy-MM-dd HH:mm:ss,SSS} %-5p %-60c %x - %m%n + +# Log output from org.apache.flink.yarn to the console. This is used by the +# CliFrontend class when using a per-job YARN cluster. +logger.yarn.name = org.apache.flink.yarn +logger.yarn.level = INFO +logger.yarn.appenderRef.console.ref = ConsoleAppender +logger.yarncli.name = org.apache.flink.yarn.cli.FlinkYarnSessionCli +logger.yarncli.level = INFO +logger.yarncli.appenderRef.console.ref = ConsoleAppender +logger.hadoop.name = org.apache.hadoop +logger.hadoop.level = WARN +logger.hadoop.appenderRef.console.ref = ConsoleAppender + +# Make sure hive logs go to the file. +logger.hive.name = org.apache.hadoop.hive +logger.hive.level = INFO +logger.hive.additivity = false +logger.hive.appenderRef.file.ref = FileAppender + +# Log output from org.apache.flink.kubernetes to the console. +logger.kubernetes.name = org.apache.flink.kubernetes +logger.kubernetes.level = INFO +logger.kubernetes.appenderRef.console.ref = ConsoleAppender + +appender.console.name = ConsoleAppender +appender.console.type = CONSOLE +appender.console.layout.type = PatternLayout +appender.console.layout.pattern = %d{yyyy-MM-dd HH:mm:ss,SSS} %-5p %-60c %x - %m%n + +# suppress the warning that hadoop native libraries are not loaded (irrelevant for the client) +logger.hadoopnative.name = org.apache.hadoop.util.NativeCodeLoader +logger.hadoopnative.level = OFF + +# Suppress the irrelevant (wrong) warnings from the Netty channel handler +logger.netty.name = org.jboss.netty.channel.DefaultChannelPipeline +logger.netty.level = OFF diff --git a/samples/datalake/iceberg_and_paimon/data/flink-conf/log4j-console.properties b/samples/datalake/iceberg_and_paimon/data/flink-conf/log4j-console.properties new file mode 100644 index 00000000000..c29441618f5 --- /dev/null +++ b/samples/datalake/iceberg_and_paimon/data/flink-conf/log4j-console.properties @@ -0,0 +1,70 @@ +################################################################################ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +# Allows this configuration to be modified at runtime. The file will be checked every 30 seconds. 
+monitorInterval=30 + +# This affects logging for both user code and Flink +rootLogger.level = INFO +rootLogger.appenderRef.console.ref = ConsoleAppender +rootLogger.appenderRef.rolling.ref = RollingFileAppender + +# Uncomment this if you want to _only_ change Flink's logging +#logger.flink.name = org.apache.flink +#logger.flink.level = INFO + +# The following lines keep the log level of common libraries/connectors on +# log level INFO. The root logger does not override this. You have to manually +# change the log levels here. +logger.pekko.name = org.apache.pekko +logger.pekko.level = INFO +logger.kafka.name= org.apache.kafka +logger.kafka.level = INFO +logger.hadoop.name = org.apache.hadoop +logger.hadoop.level = WARN +logger.zookeeper.name = org.apache.zookeeper +logger.zookeeper.level = INFO +logger.shaded_zookeeper.name = org.apache.flink.shaded.zookeeper3 +logger.shaded_zookeeper.level = INFO + +# Log all infos to the console +appender.console.name = ConsoleAppender +appender.console.type = CONSOLE +appender.console.layout.type = PatternLayout +appender.console.layout.pattern = %d{yyyy-MM-dd HH:mm:ss,SSS} %-5p %-60c %x - %m%n +appender.console.filter.threshold.type = ThresholdFilter +appender.console.filter.threshold.level = ${sys:console.log.level:-ALL} + +# Log all infos in the given rolling file +appender.rolling.name = RollingFileAppender +appender.rolling.type = RollingFile +appender.rolling.append = true +appender.rolling.fileName = ${sys:log.file} +appender.rolling.filePattern = ${sys:log.file}.%i +appender.rolling.layout.type = PatternLayout +appender.rolling.layout.pattern = %d{yyyy-MM-dd HH:mm:ss,SSS} %-5p %-60c %x - %m%n +appender.rolling.policies.type = Policies +appender.rolling.policies.size.type = SizeBasedTriggeringPolicy +appender.rolling.policies.size.size=100MB +appender.rolling.policies.startup.type = OnStartupTriggeringPolicy +appender.rolling.strategy.type = DefaultRolloverStrategy +appender.rolling.strategy.max = ${env:MAX_LOG_FILE_NUMBER:-10} + +# Suppress the irrelevant (wrong) warnings from the Netty channel handler +logger.netty.name = org.jboss.netty.channel.DefaultChannelPipeline +logger.netty.level = OFF diff --git a/samples/datalake/iceberg_and_paimon/data/flink-conf/log4j-session.properties b/samples/datalake/iceberg_and_paimon/data/flink-conf/log4j-session.properties new file mode 100644 index 00000000000..136a4a6a298 --- /dev/null +++ b/samples/datalake/iceberg_and_paimon/data/flink-conf/log4j-session.properties @@ -0,0 +1,42 @@ +################################################################################ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +# Allows this configuration to be modified at runtime. 
The file will be checked every 30 seconds. +monitorInterval=30 + +rootLogger.level = INFO +rootLogger.appenderRef.console.ref = ConsoleAppender + +appender.console.name = ConsoleAppender +appender.console.type = CONSOLE +appender.console.layout.type = PatternLayout +appender.console.layout.pattern = %d{yyyy-MM-dd HH:mm:ss,SSS} %-5p %-60c %x - %m%n + +# Suppress the irrelevant (wrong) warnings from the Netty channel handler +logger.netty.name = org.jboss.netty.channel.DefaultChannelPipeline +logger.netty.level = OFF +logger.zookeeper.name = org.apache.zookeeper +logger.zookeeper.level = WARN +logger.shaded_zookeeper.name = org.apache.flink.shaded.zookeeper3 +logger.shaded_zookeeper.level = WARN +logger.curator.name = org.apache.flink.shaded.org.apache.curator.framework +logger.curator.level = WARN +logger.runtimeutils.name= org.apache.flink.runtime.util.ZooKeeperUtils +logger.runtimeutils.level = WARN +logger.runtimeleader.name = org.apache.flink.runtime.leaderretrieval.ZooKeeperLeaderRetrievalDriver +logger.runtimeleader.level = WARN diff --git a/samples/datalake/iceberg_and_paimon/data/flink-conf/log4j.properties b/samples/datalake/iceberg_and_paimon/data/flink-conf/log4j.properties new file mode 100644 index 00000000000..ba5be64bff1 --- /dev/null +++ b/samples/datalake/iceberg_and_paimon/data/flink-conf/log4j.properties @@ -0,0 +1,61 @@ +################################################################################ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +# Allows this configuration to be modified at runtime. The file will be checked every 30 seconds. +monitorInterval=10 + +# This affects logging for both user code and Flink +rootLogger.level = INFO +rootLogger.appenderRef.file.ref = MainAppender + +# Uncomment this if you want to _only_ change Flink's logging +#logger.flink.name = org.apache.flink +#logger.flink.level = INFO + +# The following lines keep the log level of common libraries/connectors on +# log level INFO. The root logger does not override this. You have to manually +# change the log levels here. 
+logger.pekko.name = org.apache.pekko +logger.pekko.level = INFO +logger.kafka.name= org.apache.kafka +logger.kafka.level = INFO +logger.hadoop.name = org.apache.hadoop +logger.hadoop.level = WARN +logger.zookeeper.name = org.apache.zookeeper +logger.zookeeper.level = INFO +logger.shaded_zookeeper.name = org.apache.flink.shaded.zookeeper3 +logger.shaded_zookeeper.level = INFO + +# Log all infos in the given file +appender.main.name = MainAppender +appender.main.type = RollingFile +appender.main.append = true +appender.main.fileName = ${sys:log.file} +appender.main.filePattern = ${sys:log.file}.%i +appender.main.layout.type = PatternLayout +appender.main.layout.pattern = %d{yyyy-MM-dd HH:mm:ss,SSS} %-5p %-60c %x - %m%n +appender.main.policies.type = Policies +appender.main.policies.size.type = SizeBasedTriggeringPolicy +appender.main.policies.size.size = 100MB +appender.main.policies.startup.type = OnStartupTriggeringPolicy +appender.main.strategy.type = DefaultRolloverStrategy +appender.main.strategy.max = ${env:MAX_LOG_FILE_NUMBER:-10} + +# Suppress the irrelevant (wrong) warnings from the Netty channel handler +logger.netty.name = org.jboss.netty.channel.DefaultChannelPipeline +logger.netty.level = OFF diff --git a/samples/datalake/iceberg_and_paimon/data/flink-conf/logback-console.xml b/samples/datalake/iceberg_and_paimon/data/flink-conf/logback-console.xml new file mode 100644 index 00000000000..4be057b63cf --- /dev/null +++ b/samples/datalake/iceberg_and_paimon/data/flink-conf/logback-console.xml @@ -0,0 +1,67 @@ +<!-- + ~ Licensed to the Apache Software Foundation (ASF) under one + ~ or more contributor license agreements. See the NOTICE file + ~ distributed with this work for additional information + ~ regarding copyright ownership. The ASF licenses this file + ~ to you under the Apache License, Version 2.0 (the + ~ "License"); you may not use this file except in compliance + ~ with the License. You may obtain a copy of the License at + ~ + ~ http://www.apache.org/licenses/LICENSE-2.0 + ~ + ~ Unless required by applicable law or agreed to in writing, software + ~ distributed under the License is distributed on an "AS IS" BASIS, + ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + ~ See the License for the specific language governing permissions and + ~ limitations under the License. 
+ --> + +<configuration> + <appender name="console" class="ch.qos.logback.core.ConsoleAppender"> + <encoder> + <pattern>%d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{60} %X{sourceThread} - %msg%n</pattern> + </encoder> + <filter class="ch.qos.logback.classic.filter.ThresholdFilter"> + <level>${console.log.level:-ALL}</level> + </filter> + </appender> + + <appender name="rolling" class="ch.qos.logback.core.rolling.RollingFileAppender"> + <file>${log.file}</file> + <append>false</append> + + <rollingPolicy class="ch.qos.logback.core.rolling.FixedWindowRollingPolicy"> + <fileNamePattern>${log.file}.%i</fileNamePattern> + <minIndex>1</minIndex> + <maxIndex>10</maxIndex> + </rollingPolicy> + + <triggeringPolicy class="ch.qos.logback.core.rolling.SizeBasedTriggeringPolicy"> + <maxFileSize>100MB</maxFileSize> + </triggeringPolicy> + + <encoder> + <pattern>%d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{60} %X{sourceThread} - %msg%n</pattern> + </encoder> + </appender> + + <!-- This affects logging for both user code and Flink --> + <root level="INFO"> + <appender-ref ref="console"/> + <appender-ref ref="rolling"/> + </root> + + <!-- Uncomment this if you want to only change Flink's logging --> + <!--<logger name="org.apache.flink" level="INFO"/>--> + + <!-- The following lines keep the log level of common libraries/connectors on + log level INFO. The root logger does not override this. You have to manually + change the log levels here. --> + <logger name="org.apache.pekko" level="INFO"/> + <logger name="org.apache.kafka" level="INFO"/> + <logger name="org.apache.hadoop" level="WARN"/> + <logger name="org.apache.zookeeper" level="INFO"/> + + <!-- Suppress the irrelevant (wrong) warnings from the Netty channel handler --> + <logger name="org.jboss.netty.channel.DefaultChannelPipeline" level="ERROR"/> +</configuration> diff --git a/samples/datalake/iceberg_and_paimon/data/flink-conf/logback-session.xml b/samples/datalake/iceberg_and_paimon/data/flink-conf/logback-session.xml new file mode 100644 index 00000000000..636b78a3973 --- /dev/null +++ b/samples/datalake/iceberg_and_paimon/data/flink-conf/logback-session.xml @@ -0,0 +1,39 @@ +<!-- + ~ Licensed to the Apache Software Foundation (ASF) under one + ~ or more contributor license agreements. See the NOTICE file + ~ distributed with this work for additional information + ~ regarding copyright ownership. The ASF licenses this file + ~ to you under the Apache License, Version 2.0 (the + ~ "License"); you may not use this file except in compliance + ~ with the License. You may obtain a copy of the License at + ~ + ~ http://www.apache.org/licenses/LICENSE-2.0 + ~ + ~ Unless required by applicable law or agreed to in writing, software + ~ distributed under the License is distributed on an "AS IS" BASIS, + ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + ~ See the License for the specific language governing permissions and + ~ limitations under the License. 
+ --> + +<configuration> + <appender name="file" class="ch.qos.logback.core.FileAppender"> + <file>${log.file}</file> + <append>false</append> + <encoder> + <pattern>%d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{60} %X{sourceThread} - %msg%n</pattern> + </encoder> + </appender> + + <appender name="console" class="ch.qos.logback.core.ConsoleAppender"> + <encoder> + <pattern>%d{yyyy-MM-dd HH:mm:ss} %-5level %logger{60} %X{sourceThread} - %msg%n</pattern> + </encoder> + </appender> + + <logger name="ch.qos.logback" level="WARN" /> + <root level="INFO"> + <appender-ref ref="file"/> + <appender-ref ref="console"/> + </root> +</configuration> diff --git a/samples/datalake/iceberg_and_paimon/data/flink-conf/logback.xml b/samples/datalake/iceberg_and_paimon/data/flink-conf/logback.xml new file mode 100644 index 00000000000..6472172d4e8 --- /dev/null +++ b/samples/datalake/iceberg_and_paimon/data/flink-conf/logback.xml @@ -0,0 +1,58 @@ +<!-- + ~ Licensed to the Apache Software Foundation (ASF) under one + ~ or more contributor license agreements. See the NOTICE file + ~ distributed with this work for additional information + ~ regarding copyright ownership. The ASF licenses this file + ~ to you under the Apache License, Version 2.0 (the + ~ "License"); you may not use this file except in compliance + ~ with the License. You may obtain a copy of the License at + ~ + ~ http://www.apache.org/licenses/LICENSE-2.0 + ~ + ~ Unless required by applicable law or agreed to in writing, software + ~ distributed under the License is distributed on an "AS IS" BASIS, + ~ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + ~ See the License for the specific language governing permissions and + ~ limitations under the License. + --> + +<configuration> + <appender name="file" class="ch.qos.logback.core.FileAppender"> + <file>${log.file}</file> + <append>false</append> + <encoder> + <pattern>%d{yyyy-MM-dd HH:mm:ss.SSS} [%thread] %-5level %logger{60} %X{sourceThread} - %msg%n</pattern> + </encoder> + </appender> + + <!-- This affects logging for both user code and Flink --> + <root level="INFO"> + <appender-ref ref="file"/> + </root> + + <!-- Uncomment this if you want to only change Flink's logging --> + <!--<logger name="org.apache.flink" level="INFO">--> + <!--<appender-ref ref="file"/>--> + <!--</logger>--> + + <!-- The following lines keep the log level of common libraries/connectors on + log level INFO. The root logger does not override this. You have to manually + change the log levels here. --> + <logger name="org.apache.pekko" level="INFO"> + <appender-ref ref="file"/> + </logger> + <logger name="org.apache.kafka" level="INFO"> + <appender-ref ref="file"/> + </logger> + <logger name="org.apache.hadoop" level="WARN"> + <appender-ref ref="file"/> + </logger> + <logger name="org.apache.zookeeper" level="INFO"> + <appender-ref ref="file"/> + </logger> + + <!-- Suppress the irrelevant (wrong) warnings from the Netty channel handler --> + <logger name="org.jboss.netty.channel.DefaultChannelPipeline" level="ERROR"> + <appender-ref ref="file"/> + </logger> +</configuration> diff --git a/samples/datalake/iceberg_and_paimon/data/flink-conf/masters b/samples/datalake/iceberg_and_paimon/data/flink-conf/masters new file mode 100644 index 00000000000..9346df33d40 --- /dev/null +++ b/samples/datalake/iceberg_and_paimon/data/flink-conf/masters @@ -0,0 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +localhost:8081 diff --git a/samples/datalake/iceberg_and_paimon/data/flink-conf/workers b/samples/datalake/iceberg_and_paimon/data/flink-conf/workers new file mode 100644 index 00000000000..7938b5c6067 --- /dev/null +++ b/samples/datalake/iceberg_and_paimon/data/flink-conf/workers @@ -0,0 +1,18 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +localhost diff --git a/samples/datalake/iceberg_and_paimon/data/flink-conf/zoo.cfg b/samples/datalake/iceberg_and_paimon/data/flink-conf/zoo.cfg new file mode 100644 index 00000000000..f59899746c3 --- /dev/null +++ b/samples/datalake/iceberg_and_paimon/data/flink-conf/zoo.cfg @@ -0,0 +1,36 @@ +################################################################################ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +################################################################################ + +# The number of milliseconds of each tick +tickTime=2000 + +# The number of ticks that the initial synchronization phase can take +initLimit=10 + +# The number of ticks that can pass between sending a request and getting an acknowledgement +syncLimit=5 + +# The directory where the snapshot is stored. 
+# dataDir=/tmp/zookeeper + +# The port at which the clients will connect +clientPort=2181 + +# ZooKeeper quorum peers +server.1=localhost:2888:3888 +# server.2=host:peer-port:leader-port diff --git a/samples/datalake/iceberg_and_paimon/data/spark-conf/fairscheduler.xml.template b/samples/datalake/iceberg_and_paimon/data/spark-conf/fairscheduler.xml.template new file mode 100644 index 00000000000..385b2e772d2 --- /dev/null +++ b/samples/datalake/iceberg_and_paimon/data/spark-conf/fairscheduler.xml.template @@ -0,0 +1,31 @@ +<?xml version="1.0"?> + +<!-- + Licensed to the Apache Software Foundation (ASF) under one or more + contributor license agreements. See the NOTICE file distributed with + this work for additional information regarding copyright ownership. + The ASF licenses this file to You under the Apache License, Version 2.0 + (the "License"); you may not use this file except in compliance with + the License. You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. +--> + +<allocations> + <pool name="production"> + <schedulingMode>FAIR</schedulingMode> + <weight>1</weight> + <minShare>2</minShare> + </pool> + <pool name="test"> + <schedulingMode>FIFO</schedulingMode> + <weight>2</weight> + <minShare>3</minShare> + </pool> +</allocations> diff --git a/samples/datalake/iceberg_and_paimon/data/spark-conf/log4j2.properties.template b/samples/datalake/iceberg_and_paimon/data/spark-conf/log4j2.properties.template new file mode 100644 index 00000000000..ab96e03baed --- /dev/null +++ b/samples/datalake/iceberg_and_paimon/data/spark-conf/log4j2.properties.template @@ -0,0 +1,69 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Set everything to be logged to the console +rootLogger.level = info +rootLogger.appenderRef.stdout.ref = console + +# In the pattern layout configuration below, we specify an explicit `%ex` conversion +# pattern for logging Throwables. If this was omitted, then (by default) Log4J would +# implicitly add an `%xEx` conversion pattern which logs stacktraces with additional +# class packaging information. That extra information can sometimes add a substantial +# performance overhead, so we disable it in our default logging config. +# For more information, see SPARK-39361. 
+appender.console.type = Console +appender.console.name = console +appender.console.target = SYSTEM_ERR +appender.console.layout.type = PatternLayout +appender.console.layout.pattern = %d{yy/MM/dd HH:mm:ss} %p %c{1}: %m%n%ex + +# Set the default spark-shell/spark-sql log level to WARN. When running the +# spark-shell/spark-sql, the log level for these classes is used to overwrite +# the root logger's log level, so that the user can have different defaults +# for the shell and regular Spark apps. +logger.repl.name = org.apache.spark.repl.Main +logger.repl.level = warn + +logger.thriftserver.name = org.apache.spark.sql.hive.thriftserver.SparkSQLCLIDriver +logger.thriftserver.level = warn + +# Settings to quiet third party logs that are too verbose +logger.jetty1.name = org.sparkproject.jetty +logger.jetty1.level = warn +logger.jetty2.name = org.sparkproject.jetty.util.component.AbstractLifeCycle +logger.jetty2.level = error +logger.replexprTyper.name = org.apache.spark.repl.SparkIMain$exprTyper +logger.replexprTyper.level = info +logger.replSparkILoopInterpreter.name = org.apache.spark.repl.SparkILoop$SparkILoopInterpreter +logger.replSparkILoopInterpreter.level = info +logger.parquet1.name = org.apache.parquet +logger.parquet1.level = error +logger.parquet2.name = parquet +logger.parquet2.level = error + +# SPARK-9183: Settings to avoid annoying messages when looking up nonexistent UDFs in SparkSQL with Hive support +logger.RetryingHMSHandler.name = org.apache.hadoop.hive.metastore.RetryingHMSHandler +logger.RetryingHMSHandler.level = fatal +logger.FunctionRegistry.name = org.apache.hadoop.hive.ql.exec.FunctionRegistry +logger.FunctionRegistry.level = error + +# For deploying Spark ThriftServer +# SPARK-34128: Suppress undesirable TTransportException warnings involved in THRIFT-4805 +appender.console.filter.1.type = RegexFilter +appender.console.filter.1.regex = .*Thrift error occurred during processing of message.* +appender.console.filter.1.onMatch = deny +appender.console.filter.1.onMismatch = neutral diff --git a/samples/datalake/iceberg_and_paimon/data/spark-conf/metrics.properties.template b/samples/datalake/iceberg_and_paimon/data/spark-conf/metrics.properties.template new file mode 100644 index 00000000000..f52d33fd642 --- /dev/null +++ b/samples/datalake/iceberg_and_paimon/data/spark-conf/metrics.properties.template @@ -0,0 +1,210 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# syntax: [instance].sink|source.[name].[options]=[value] + +# This file configures Spark's internal metrics system. The metrics system is +# divided into instances which correspond to internal components. +# Each instance can be configured to report its metrics to one or more sinks. 
+# Accepted values for [instance] are "master", "worker", "executor", "driver", +# and "applications". A wildcard "*" can be used as an instance name, in +# which case all instances will inherit the supplied property. +# +# Within an instance, a "source" specifies a particular set of grouped metrics. +# there are two kinds of sources: +# 1. Spark internal sources, like MasterSource, WorkerSource, etc, which will +# collect a Spark component's internal state. Each instance is paired with a +# Spark source that is added automatically. +# 2. Common sources, like JvmSource, which will collect low level state. +# These can be added through configuration options and are then loaded +# using reflection. +# +# A "sink" specifies where metrics are delivered to. Each instance can be +# assigned one or more sinks. +# +# The sink|source field specifies whether the property relates to a sink or +# source. +# +# The [name] field specifies the name of source or sink. +# +# The [options] field is the specific property of this source or sink. The +# source or sink is responsible for parsing this property. +# +# Notes: +# 1. To add a new sink, set the "class" option to a fully qualified class +# name (see examples below). +# 2. Some sinks involve a polling period. The minimum allowed polling period +# is 1 second. +# 3. Wildcard properties can be overridden by more specific properties. +# For example, master.sink.console.period takes precedence over +# *.sink.console.period. +# 4. A metrics specific configuration +# "spark.metrics.conf=${SPARK_HOME}/conf/metrics.properties" should be +# added to Java properties using -Dspark.metrics.conf=xxx if you want to +# customize metrics system. You can also put the file in ${SPARK_HOME}/conf +# and it will be loaded automatically. +# 5. The MetricsServlet sink is added by default as a sink in the master, +# worker and driver, and you can send HTTP requests to the "/metrics/json" +# endpoint to get a snapshot of all the registered metrics in JSON format. +# For master, requests to the "/metrics/master/json" and +# "/metrics/applications/json" endpoints can be sent separately to get +# metrics snapshots of the master instance and applications. This +# MetricsServlet does not have to be configured. +# 6. The metrics system can also be configured using Spark configuration +# parameters. The relevant parameter names are formed by adding the +# prefix "spark.metrics.conf." to the configuration entries detailed in +# this file (see examples below). + +## List of available common sources and their properties. + +# org.apache.spark.metrics.source.JvmSource +# Note: Currently, JvmSource is the only available common source. +# It can be added to an instance by setting the "class" option to its +# fully qualified class name (see examples below). + +## List of available sinks and their properties. 
+ +# org.apache.spark.metrics.sink.ConsoleSink +# Name: Default: Description: +# period 10 Poll period +# unit seconds Unit of the poll period + +# org.apache.spark.metrics.sink.CSVSink +# Name: Default: Description: +# period 10 Poll period +# unit seconds Unit of the poll period +# directory /tmp Where to store CSV files + +# org.apache.spark.metrics.sink.GangliaSink +# Name: Default: Description: +# host NONE Hostname or multicast group of the Ganglia server, +# must be set +# port NONE Port of the Ganglia server(s), must be set +# period 10 Poll period +# unit seconds Unit of the poll period +# ttl 1 TTL of messages sent by Ganglia +# dmax 0 Lifetime in seconds of metrics (0 never expired) +# mode multicast Ganglia network mode ('unicast' or 'multicast') + +# org.apache.spark.metrics.sink.JmxSink + +# org.apache.spark.metrics.sink.MetricsServlet +# Name: Default: Description: +# path VARIES* Path prefix from the web server root +# sample false Whether to show entire set of samples for histograms +# ('false' or 'true') +# +# * Default path is /metrics/json for all instances except the master. The +# master has two paths: +# /metrics/applications/json # App information +# /metrics/master/json # Master information + +# org.apache.spark.metrics.sink.PrometheusServlet +# Name: Default: Description: +# path VARIES* Path prefix from the web server root +# +# * Default path is /metrics/prometheus for all instances except the master. The +# master has two paths: +# /metrics/applications/prometheus # App information +# /metrics/master/prometheus # Master information + +# org.apache.spark.metrics.sink.GraphiteSink +# Name: Default: Description: +# host NONE Hostname of the Graphite server, must be set +# port NONE Port of the Graphite server, must be set +# period 10 Poll period +# unit seconds Unit of the poll period +# prefix EMPTY STRING Prefix to prepend to every metric's name +# protocol tcp Protocol ("tcp" or "udp") to use +# regex NONE Optional filter to send only metrics matching this regex string + +# org.apache.spark.metrics.sink.StatsdSink +# Name: Default: Description: +# host 127.0.0.1 Hostname or IP of StatsD server +# port 8125 Port of StatsD server +# period 10 Poll period +# unit seconds Units of poll period +# prefix EMPTY STRING Prefix to prepend to metric name + +## Examples +# Enable JmxSink for all instances by class name +#*.sink.jmx.class=org.apache.spark.metrics.sink.JmxSink + +# Enable ConsoleSink for all instances by class name +#*.sink.console.class=org.apache.spark.metrics.sink.ConsoleSink + +# Enable StatsdSink for all instances by class name +#*.sink.statsd.class=org.apache.spark.metrics.sink.StatsdSink +#*.sink.statsd.prefix=spark + +# Polling period for the ConsoleSink +#*.sink.console.period=10 +# Unit of the polling period for the ConsoleSink +#*.sink.console.unit=seconds + +# Polling period for the ConsoleSink specific for the master instance +#master.sink.console.period=15 +# Unit of the polling period for the ConsoleSink specific for the master +# instance +#master.sink.console.unit=seconds + +# Enable CsvSink for all instances by class name +#*.sink.csv.class=org.apache.spark.metrics.sink.CsvSink + +# Polling period for the CsvSink +#*.sink.csv.period=1 +# Unit of the polling period for the CsvSink +#*.sink.csv.unit=minutes + +# Polling directory for CsvSink +#*.sink.csv.directory=/tmp/ + +# Polling period for the CsvSink specific for the worker instance +#worker.sink.csv.period=10 +# Unit of the polling period for the CsvSink specific for the worker 
instance +#worker.sink.csv.unit=minutes + +# Enable Slf4jSink for all instances by class name +#*.sink.slf4j.class=org.apache.spark.metrics.sink.Slf4jSink + +# Polling period for the Slf4JSink +#*.sink.slf4j.period=1 +# Unit of the polling period for the Slf4jSink +#*.sink.slf4j.unit=minutes + +# Example configuration for Graphite sink +#*.sink.graphite.class=org.apache.spark.metrics.sink.GraphiteSink +#*.sink.graphite.host=<graphiteEndPoint_hostName> +#*.sink.graphite.port=<listening_port> +#*.sink.graphite.period=10 +#*.sink.graphite.unit=seconds +#*.sink.graphite.prefix=<optional_value> + +# Enable JvmSource for instance master, worker, driver and executor +#master.source.jvm.class=org.apache.spark.metrics.source.JvmSource + +#worker.source.jvm.class=org.apache.spark.metrics.source.JvmSource + +#driver.source.jvm.class=org.apache.spark.metrics.source.JvmSource + +#executor.source.jvm.class=org.apache.spark.metrics.source.JvmSource + +# Example configuration for PrometheusServlet +#*.sink.prometheusServlet.class=org.apache.spark.metrics.sink.PrometheusServlet +#*.sink.prometheusServlet.path=/metrics/prometheus +#master.sink.prometheusServlet.path=/metrics/master/prometheus +#applications.sink.prometheusServlet.path=/metrics/applications/prometheus diff --git a/samples/datalake/iceberg_and_paimon/data/spark-conf/spark-defaults.conf b/samples/datalake/iceberg_and_paimon/data/spark-conf/spark-defaults.conf new file mode 100755 index 00000000000..925cdf02455 --- /dev/null +++ b/samples/datalake/iceberg_and_paimon/data/spark-conf/spark-defaults.conf @@ -0,0 +1,43 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Default system properties included when running spark-submit. +# This is useful for setting default environmental settings. 
+ +# Example: +# spark.sql.extensions org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions +spark.sql.catalog.demo org.apache.iceberg.spark.SparkCatalog +spark.sql.catalog.demo.type rest +spark.sql.catalog.demo.uri http://rest:8181 +spark.sql.catalog.demo.io-impl org.apache.iceberg.aws.s3.S3FileIO +spark.sql.catalog.demo.warehouse s3://warehouse/wh/ +spark.sql.catalog.demo.s3.endpoint http://minio:9000 + +spark.sql.defaultCatalog demo +spark.eventLog.enabled true +spark.eventLog.dir /home/iceberg/spark-events +spark.history.fs.logDirectory /home/iceberg/spark-events +spark.sql.catalogImplementation in-memory + +# spark.sql.extensions org.apache.paimon.spark.extensions.PaimonSparkSessionExtensions +spark.sql.catalog.paimon org.apache.paimon.spark.SparkCatalog +spark.sql.catalog.paimon.warehouse s3://warehouse/wh +spark.sql.catalog.paimon.s3.endpoint http://minio:9000 +spark.sql.catalog.paimon.s3.access-key admin +spark.sql.catalog.paimon.s3.secret-key password +spark.sql.catalog.paimon.s3.region us-east-1 +spark.sql.catalog.paimon.warehouse s3://warehouse/wh \ No newline at end of file diff --git a/samples/datalake/iceberg_and_paimon/data/spark-conf/spark-defaults.conf.template b/samples/datalake/iceberg_and_paimon/data/spark-conf/spark-defaults.conf.template new file mode 100644 index 00000000000..19cba6e71ed --- /dev/null +++ b/samples/datalake/iceberg_and_paimon/data/spark-conf/spark-defaults.conf.template @@ -0,0 +1,27 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Default system properties included when running spark-submit. +# This is useful for setting default environmental settings. + +# Example: +# spark.master spark://master:7077 +# spark.eventLog.enabled true +# spark.eventLog.dir hdfs://namenode:8021/directory +# spark.serializer org.apache.spark.serializer.KryoSerializer +# spark.driver.memory 5g +# spark.executor.extraJavaOptions -XX:+PrintGCDetails -Dkey=value -Dnumbers="one two three" diff --git a/samples/datalake/iceberg_and_paimon/data/spark-conf/spark-env.sh.template b/samples/datalake/iceberg_and_paimon/data/spark-conf/spark-env.sh.template new file mode 100755 index 00000000000..e9491995e72 --- /dev/null +++ b/samples/datalake/iceberg_and_paimon/data/spark-conf/spark-env.sh.template @@ -0,0 +1,81 @@ +#!/usr/bin/env bash + +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# This file is sourced when running various Spark programs. +# Copy it as spark-env.sh and edit that to configure Spark for your site. + +# Options read when launching programs locally with +# ./bin/run-example or ./bin/spark-submit +# - HADOOP_CONF_DIR, to point Spark towards Hadoop configuration files +# - SPARK_LOCAL_IP, to set the IP address Spark binds to on this node +# - SPARK_PUBLIC_DNS, to set the public dns name of the driver program + +# Options read by executors and drivers running inside the cluster +# - SPARK_LOCAL_IP, to set the IP address Spark binds to on this node +# - SPARK_PUBLIC_DNS, to set the public DNS name of the driver program +# - SPARK_LOCAL_DIRS, storage directories to use on this node for shuffle and RDD data +# - MESOS_NATIVE_JAVA_LIBRARY, to point to your libmesos.so if you use Mesos + +# Options read in any mode +# - SPARK_CONF_DIR, Alternate conf dir. (Default: ${SPARK_HOME}/conf) +# - SPARK_EXECUTOR_CORES, Number of cores for the executors (Default: 1). +# - SPARK_EXECUTOR_MEMORY, Memory per Executor (e.g. 1000M, 2G) (Default: 1G) +# - SPARK_DRIVER_MEMORY, Memory for Driver (e.g. 1000M, 2G) (Default: 1G) + +# Options read in any cluster manager using HDFS +# - HADOOP_CONF_DIR, to point Spark towards Hadoop configuration files + +# Options read in YARN client/cluster mode +# - YARN_CONF_DIR, to point Spark towards YARN configuration files when you use YARN + +# Options for the daemons used in the standalone deploy mode +# - SPARK_MASTER_HOST, to bind the master to a different IP address or hostname +# - SPARK_MASTER_PORT / SPARK_MASTER_WEBUI_PORT, to use non-default ports for the master +# - SPARK_MASTER_OPTS, to set config properties only for the master (e.g. "-Dx=y") +# - SPARK_WORKER_CORES, to set the number of cores to use on this machine +# - SPARK_WORKER_MEMORY, to set how much total memory workers have to give executors (e.g. 1000m, 2g) +# - SPARK_WORKER_PORT / SPARK_WORKER_WEBUI_PORT, to use non-default ports for the worker +# - SPARK_WORKER_DIR, to set the working directory of worker processes +# - SPARK_WORKER_OPTS, to set config properties only for the worker (e.g. "-Dx=y") +# - SPARK_DAEMON_MEMORY, to allocate to the master, worker and history server themselves (default: 1g). +# - SPARK_HISTORY_OPTS, to set config properties only for the history server (e.g. "-Dx=y") +# - SPARK_SHUFFLE_OPTS, to set config properties only for the external shuffle service (e.g. "-Dx=y") +# - SPARK_DAEMON_JAVA_OPTS, to set config properties for all daemons (e.g. "-Dx=y") +# - SPARK_DAEMON_CLASSPATH, to set the classpath for all daemons +# - SPARK_PUBLIC_DNS, to set the public dns name of the master or workers + +# Options for launcher +# - SPARK_LAUNCHER_OPTS, to set config properties and Java options for the launcher (e.g. "-Dx=y") + +# Generic options for the daemons used in the standalone deploy mode +# - SPARK_CONF_DIR Alternate conf dir. (Default: ${SPARK_HOME}/conf) +# - SPARK_LOG_DIR Where log files are stored. (Default: ${SPARK_HOME}/logs) +# - SPARK_LOG_MAX_FILES Max log files of Spark daemons can rotate to. Default is 5. 
+# - SPARK_PID_DIR Where the pid file is stored. (Default: /tmp) +# - SPARK_IDENT_STRING A string representing this instance of spark. (Default: $USER) +# - SPARK_NICENESS The scheduling priority for daemons. (Default: 0) +# - SPARK_NO_DAEMONIZE Run the proposed command in the foreground. It will not output a PID file. +# Options for native BLAS, like Intel MKL, OpenBLAS, and so on. +# You might get better performance to enable these options if using native BLAS (see SPARK-21305). +# - MKL_NUM_THREADS=1 Disable multi-threading of Intel MKL +# - OPENBLAS_NUM_THREADS=1 Disable multi-threading of OpenBLAS + +# Options for beeline +# - SPARK_BEELINE_OPTS, to set config properties only for the beeline cli (e.g. "-Dx=y") +# - SPARK_BEELINE_MEMORY, Memory for beeline (e.g. 1000M, 2G) (Default: 1G) diff --git a/samples/datalake/iceberg_and_paimon/data/spark-conf/workers.template b/samples/datalake/iceberg_and_paimon/data/spark-conf/workers.template new file mode 100644 index 00000000000..be42a638230 --- /dev/null +++ b/samples/datalake/iceberg_and_paimon/data/spark-conf/workers.template @@ -0,0 +1,19 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# A Spark Worker will be started on each of the machines listed below. 
+localhost \ No newline at end of file diff --git a/samples/datalake/iceberg_and_paimon/data/table/customer/000000_0 b/samples/datalake/iceberg_and_paimon/data/table/customer/000000_0 new file mode 100644 index 00000000000..4521df9f977 Binary files /dev/null and b/samples/datalake/iceberg_and_paimon/data/table/customer/000000_0 differ diff --git a/samples/datalake/iceberg_and_paimon/data/table/customer/000001_0 b/samples/datalake/iceberg_and_paimon/data/table/customer/000001_0 new file mode 100644 index 00000000000..c3f4625c765 Binary files /dev/null and b/samples/datalake/iceberg_and_paimon/data/table/customer/000001_0 differ diff --git a/samples/datalake/iceberg_and_paimon/data/table/customer/000002_0 b/samples/datalake/iceberg_and_paimon/data/table/customer/000002_0 new file mode 100644 index 00000000000..bd8aeb3b419 Binary files /dev/null and b/samples/datalake/iceberg_and_paimon/data/table/customer/000002_0 differ diff --git a/samples/datalake/iceberg_and_paimon/data/table/customer/000003_0 b/samples/datalake/iceberg_and_paimon/data/table/customer/000003_0 new file mode 100644 index 00000000000..1cb94ee0708 Binary files /dev/null and b/samples/datalake/iceberg_and_paimon/data/table/customer/000003_0 differ diff --git a/samples/datalake/iceberg_and_paimon/docker-compose.env b/samples/datalake/iceberg_and_paimon/docker-compose.env new file mode 100644 index 00000000000..5ea287509a0 --- /dev/null +++ b/samples/datalake/iceberg_and_paimon/docker-compose.env @@ -0,0 +1,22 @@ +#!/bin/bash +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +REST_CATALOG_PORT=18181 +MINIO_UI_PORT=19002 +MINIO_API_PORT=19001 +DORIS_QUERY_PORT=19031 diff --git a/samples/datalake/iceberg_and_paimon/docker-compose.yml b/samples/datalake/iceberg_and_paimon/docker-compose.yml new file mode 100644 index 00000000000..739b2bfbed7 --- /dev/null +++ b/samples/datalake/iceberg_and_paimon/docker-compose.yml @@ -0,0 +1,173 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +version: "3" + +services: + spark: + image: tabulario/spark-iceberg + container_name: doris-iceberg-paimon-spark + hostname: demo-spark-iceberg + build: spark/ + volumes: + - ./packages/jars/paimon-spark-3.5-0.8.0.jar:/opt/spark/jars/paimon-spark-3.5-0.8.0.jar + - ./packages/jars/paimon-s3-0.8.0.jar:/opt/spark/jars/paimon-s3-0.8.0.jar + - ./data/table:/opt/data + - ./data/spark-conf:/opt/spark/conf + - ./sql/prepare_data.sql:/opt/sql/prepare_data.sql + depends_on: + - rest + - minio + environment: + - AWS_ACCESS_KEY_ID=admin + - AWS_SECRET_ACCESS_KEY=password + - AWS_REGION=us-east-1 + networks: + - demo-iceberg + + rest: + image: tabulario/iceberg-rest + container_name: doris-iceberg-paimon-iceberg-rest + ports: + - ${REST_CATALOG_PORT}:8181 + environment: + - AWS_ACCESS_KEY_ID=admin + - AWS_SECRET_ACCESS_KEY=password + - AWS_REGION=us-east-1 + - CATALOG_WAREHOUSE=s3://warehouse/wh/ + - CATALOG_IO__IMPL=org.apache.iceberg.aws.s3.S3FileIO + - CATALOG_S3_ENDPOINT=http://minio:9000 + networks: + - demo-iceberg + volumes: + - './packages/jdk1.8.0_202:/opt/jdk1.8.0_202' + - './packages/doris-bin:/opt/doris-bin' + - './scripts:/opt/scripts' + + minio: + image: minio/minio + container_name: doris-iceberg-paimon-minio + ports: + - ${MINIO_API_PORT}:9000 + - ${MINIO_UI_PORT}:9001 + environment: + - MINIO_ROOT_USER=admin + - MINIO_ROOT_PASSWORD=password + - MINIO_DOMAIN=minio + networks: + demo-iceberg: + aliases: + - warehouse.minio + command: ["server", "/data", "--console-address", ":9001"] + + mc: + depends_on: + - minio + image: minio/mc + container_name: doris-iceberg-paimon-mc + environment: + - AWS_ACCESS_KEY_ID=admin + - AWS_SECRET_ACCESS_KEY=password + - AWS_REGION=us-east-1 + networks: + - demo-iceberg + entrypoint: > + /bin/sh -c " + until (/usr/bin/mc config host add minio http://minio:9000 admin password) do echo '...waiting...' 
&& sleep 1; done; + /usr/bin/mc rm -r --force minio/warehouse; + /usr/bin/mc mb minio/warehouse; + /usr/bin/mc policy set public minio/warehouse; + tail -f /dev/null + " + + jobmanager: + image: flink:1.18.0 + container_name: doris-iceberg-paimon-jobmanager + environment: + - JOB_MANAGER_RPC_ADDRESS=jobmanager + - AWS_ACCESS_KEY_ID=admin + - AWS_SECRET_ACCESS_KEY=password + - AWS_REGION=us-east-1 + ports: + - "8082:8081" + command: jobmanager + depends_on: + - rest + - minio + volumes: + - ./packages/jars/flink-connector-jdbc-3.1.2-1.18.jar:/opt/flink/lib/flink-connector-jdbc-3.1.2-1.18.jar + - ./packages/jars/flink-shaded-hadoop-2-uber-2.8.3-10.0.jar:/opt/flink/lib/flink-shaded-hadoop-2-uber-2.8.3-10.0.jar + - ./packages/jars/flink-s3-fs-hadoop-1.18.0.jar:/opt/flink/plugins/s3-fs-hadoop/flink-s3-fs-hadoop-1.18.0.jar + - ./packages/jars/iceberg-flink-runtime-1.18-1.5.2.jar:/opt/flink/lib/iceberg-flink-runtime-1.18-1.5.2.jar + - ./packages/jars/iceberg-aws-bundle-1.5.2.jar:/opt/flink/lib/iceberg-aws-bundle-1.5.2.jar + - ./packages/jars/paimon-flink-1.18-0.8.0.jar:/opt/flink/lib/paimon-flink-1.18-0.8.0.jar + - ./packages/jars/paimon-s3-0.8.0.jar:/opt/flink/lib/paimon-s3-0.8.0.jar + - ./sql/init_tables.sql:/opt/flink/sql/init_tables.sql + - ./data/flink-conf:/opt/flink/conf + networks: + - demo-iceberg + deploy: + replicas: 1 + + taskmanager: + image: flink:1.18.0 + environment: + - JOB_MANAGER_RPC_ADDRESS=jobmanager + - AWS_ACCESS_KEY_ID=admin + - AWS_SECRET_ACCESS_KEY=password + - AWS_REGION=us-east-1 + depends_on: + - jobmanager + command: taskmanager + volumes: + - ./packages/jars/flink-connector-jdbc-3.1.2-1.18.jar:/opt/flink/lib/flink-connector-jdbc-3.1.2-1.18.jar + - ./packages/jars/flink-shaded-hadoop-2-uber-2.8.3-10.0.jar:/opt/flink/lib/flink-shaded-hadoop-2-uber-2.8.3-10.0.jar + - ./packages/jars/flink-s3-fs-hadoop-1.18.0.jar:/opt/flink/plugins/s3-fs-hadoop/flink-s3-fs-hadoop-1.18.0.jar + - ./packages/jars/iceberg-flink-runtime-1.18-1.5.2.jar:/opt/flink/lib/iceberg-flink-runtime-1.18-1.5.2.jar + - ./packages/jars/iceberg-aws-bundle-1.5.2.jar:/opt/flink/lib/iceberg-aws-bundle-1.5.2.jar + - ./packages/jars/paimon-flink-1.18-0.8.0.jar:/opt/flink/lib/paimon-flink-1.18-0.8.0.jar + - ./packages/jars/paimon-s3-0.8.0.jar:/opt/flink/lib/paimon-s3-0.8.0.jar + networks: + - demo-iceberg + deploy: + replicas: 2 + + doris: + image: mysql:8.0.18 + container_name: doris-iceberg-paimon-doris + networks: + - demo-iceberg + hostname: doris + ports: + - ${DORIS_QUERY_PORT}:9030 + environment: + - AWS_ACCESS_KEY_ID=admin + - AWS_SECRET_ACCESS_KEY=password + - AWS_REGION=us-east-1 + - LD_LIBRARY_PATH=/opt/doris/be/lib + - JAVA_HOME=/opt/jdk8 + volumes: + - ./packages/jdk1.8.0_202:/opt/jdk8 + - ./packages/doris-bin:/opt/doris-bin + - ./sql/init_doris.sql:/opt/doris-bin/init_doris.sql + - ./scripts:/opt/scripts + command: bin/bash /opt/scripts/start_doris.sh + +networks: + demo-iceberg: + ipam: + driver: default diff --git a/samples/datalake/iceberg_and_paimon/scripts/start_doris.sh b/samples/datalake/iceberg_and_paimon/scripts/start_doris.sh new file mode 100644 index 00000000000..1953843e9a0 --- /dev/null +++ b/samples/datalake/iceberg_and_paimon/scripts/start_doris.sh @@ -0,0 +1,60 @@ +#!/usr/bin/env bash + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +export JAVA_HOME=/opt/jdk8 +export PATH=${JAVA_HOME}/bin:${PATH} + +echo 'start to copy...' +cp -r /opt/doris-bin /opt/doris + +echo 'start fe...' +rm -rf /opt/doris/fe/doris-meta/* +/opt/doris/fe/bin/start_fe.sh --daemon + +echo 'start be...' +rm -rf /opt/doris/be/storage/* +/opt/doris/be/bin/start_be.sh --daemon + +while [[ ! -f "/opt/doris/fe/log/fe.log" ]]; do + echo "wait log..." + sleep 2 +done + +QE=$(grep "QE service start." /opt/doris/fe/log/fe.log) +while [[ -z "${QE}" ]]; do + echo "wait fe..." + sleep 2 + QE=$(grep "QE service start." /opt/doris/fe/log/fe.log) +done + +echo 'doris is started' + +MYSQL_ERROR=$(mysql -u root -P 9030 -h doris </opt/doris-bin/init_doris.sql 2>&1) +ERR=$(echo "${MYSQL_ERROR}" | grep "Can't connect to MySQL") +echo "${ERR}" +while [[ -n "${ERR}" ]]; do + echo "wait mysql..." + sleep 2 + MYSQL_ERROR=$(mysql -u root -P 9030 -h doris </opt/doris-bin/init_doris.sql 2>&1) + ERR=$(echo "${MYSQL_ERROR}" | grep "Can't connect to MySQL") +done + +echo 'doris is inited' + +tail -F /dev/null diff --git a/samples/datalake/iceberg_and_paimon/sql/init_doris.sql b/samples/datalake/iceberg_and_paimon/sql/init_doris.sql new file mode 100644 index 00000000000..b18234972c1 --- /dev/null +++ b/samples/datalake/iceberg_and_paimon/sql/init_doris.sql @@ -0,0 +1,21 @@ +ALTER SYSTEM ADD BACKEND '127.0.0.1:9050'; + +CREATE CATALOG `iceberg` PROPERTIES ( + "type" = "iceberg", + "iceberg.catalog.type" = "rest", + "uri"="http://rest:8181", + "warehouse" = "s3://warehouse/", + "s3.endpoint"="http://minio:9000", + "s3.access_key"="admin", + "s3.secret_key"="password", + "s3.region"="us-east-1" +); + +CREATE CATALOG `paimon` PROPERTIES ( + "type" = "paimon", + "warehouse" = "s3://warehouse/wh/", + "s3.endpoint"="http://minio:9000", + "s3.access_key"="admin", + "s3.secret_key"="password", + "s3.region"="us-east-1" +); diff --git a/samples/datalake/iceberg_and_paimon/sql/init_tables.sql b/samples/datalake/iceberg_and_paimon/sql/init_tables.sql new file mode 100644 index 00000000000..96434fe5810 --- /dev/null +++ b/samples/datalake/iceberg_and_paimon/sql/init_tables.sql @@ -0,0 +1,53 @@ +SET 'sql-client.execution.result-mode' = 'tableau'; +SET 'execution.runtime-mode' = 'batch'; + + +CREATE CATALOG iceberg WITH ( + 'type'='iceberg', + 'catalog-type'='rest', + 'uri'='http://rest:8181/', + 's3.endpoint'='http://minio:9000', + 'warehouse'='s3://warehouse/wh/' +); + +create database if not exists iceberg.db_iceberg; + + +CREATE TABLE if not exists iceberg.db_iceberg.tb_iceberg ( + id BIGINT, + val string, + PRIMARY KEY (id) NOT ENFORCED +) WITH ( +'write.upsert.enabled'='true', +'upsert-enabled'='true', +'write.delete.mode'='merge-on-read', +'write.update.mode'='merge-on-read' +); + + +CREATE CATALOG `paimon` WITH ( + 'type' = 'paimon', + 'warehouse' = 's3://warehouse/wh', + 's3.endpoint'='http://minio:9000', + 's3.access-key' = 'admin', + 's3.secret-key' = 'password', + 's3.region' = 'us-east-1' +); + + +create 
database if not exists paimon.db_paimon; + +CREATE TABLE if not exists paimon.db_paimon.customer ( + `c_custkey` int, + `c_name` varchar(25), + `c_address` varchar(40), + `c_nationkey` int, + `c_phone` char(15), + `c_acctbal` decimal(12,2), + `c_mktsegment` char(10), + `c_comment` varchar(117), + PRIMARY KEY (c_custkey, c_nationkey) NOT ENFORCED +) PARTITIONED BY (c_nationkey) WITH ( + 'deletion-vectors.enabled' = 'true', + 'bucket'='1' +); diff --git a/samples/datalake/iceberg_and_paimon/sql/prepare_data.sql b/samples/datalake/iceberg_and_paimon/sql/prepare_data.sql new file mode 100644 index 00000000000..f97d0698054 --- /dev/null +++ b/samples/datalake/iceberg_and_paimon/sql/prepare_data.sql @@ -0,0 +1,8 @@ + +CREATE TEMPORARY VIEW customer_files +USING org.apache.spark.sql.parquet +OPTIONS ( + path "file:///opt/data/customer/" +); + +insert overwrite paimon.db_paimon.customer select c_custkey,c_name, c_address, c_nationkey, c_phone, c_acctbal, c_mktsegment, c_comment from customer_files; \ No newline at end of file diff --git a/samples/datalake/iceberg_and_paimon/start_all.sh b/samples/datalake/iceberg_and_paimon/start_all.sh new file mode 100644 index 00000000000..bc90a820f28 --- /dev/null +++ b/samples/datalake/iceberg_and_paimon/start_all.sh @@ -0,0 +1,121 @@ +#!/usr/bin/env bash + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +set -e + +DORIS_PACKAGE=apache-doris-2.1.4-bin-x64 +DORIS_DOWNLOAD_URL=https://apache-doris-releases.oss-accelerate.aliyuncs.com + +download_source_file() { + local FILE_PATH="$1" + local EXPECTED_MD5="$2" + local DOWNLOAD_URL="$3" + + echo "solve for ${FILE_PATH} ..." + + if [[ -f "${FILE_PATH}" ]]; then + local FILE_MD5 + echo "compare md5sum ..." + FILE_MD5=$(md5sum "${FILE_PATH}" | awk '{ print $1 }') + + if [[ "${FILE_MD5}" = "${EXPECTED_MD5}" ]]; then + echo "${FILE_PATH} is ready!" + else + echo "${FILE_PATH} is broken, Redownloading ..." + rm "${FILE_PATH}" + wget "${DOWNLOAD_URL}"/"${FILE_PATH}" + fi + else + echo "downloading ${FILE_PATH} ..." + wget "${DOWNLOAD_URL}"/"${FILE_PATH}" + fi +} + +curdir="$(cd "$(dirname "${BASH_SOURCE[0]}")" &>/dev/null && pwd)" +cd "${curdir}" || exit + +if [[ ! 
-d "packages" ]]; then + mkdir packages +fi +cd packages || exit + +download_source_file "${DORIS_PACKAGE}.tar.gz" "a4d8bc9730aca3a51294e87d7d5b3e8e" "${DORIS_DOWNLOAD_URL}" +download_source_file "jdk-8u202-linux-x64.tar.gz" "0029351f7a946f6c05b582100c7d45b7" "https://repo.huaweicloud.com/java/jdk/8u202-b08" +download_source_file "iceberg-aws-bundle-1.5.2.jar" "7087ac697254f8067d0f813521542263" "https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-aws-bundle/1.5.2" +download_source_file "iceberg-flink-runtime-1.18-1.5.2.jar" "8e895288e6770eea69ea05ffbc918c1b" "https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-flink-runtime-1.18/1.5.2" +download_source_file "flink-connector-jdbc-3.1.2-1.18.jar" "5c99b637721dd339e10725b81ccedb60" "https://repo1.maven.org/maven2/org/apache/flink/flink-connector-jdbc/3.1.2-1.18" +download_source_file "paimon-s3-0.8.0.jar" "3e510c634a21cbcdca4fd3b85786a20c" "https://repo1.maven.org/maven2/org/apache/paimon/paimon-s3/0.8.0" +download_source_file "paimon-flink-1.18-0.8.0.jar" "f590d94af1b923a7c68152b558d5b25b" "https://repo1.maven.org/maven2/org/apache/paimon/paimon-flink-1.18/0.8.0" +download_source_file "paimon-spark-3.5-0.8.0.jar" "963d0c17d69034ecf77816f64863fc51" "https://repo1.maven.org/maven2/org/apache/paimon/paimon-spark-3.5/0.8.0" +download_source_file "flink-shaded-hadoop-2-uber-2.8.3-10.0.jar" "f6f0be5b9cbebfd43e38121b209f4ecc" "https://repo1.maven.org/maven2/org/apache/flink/flink-shaded-hadoop-2-uber/2.8.3-10.0" +download_source_file "flink-s3-fs-hadoop-1.18.0.jar" "60b75e0fdc5ed05f1213b593c4b66556" "https://repo1.maven.org/maven2/org/apache/flink/flink-s3-fs-hadoop/1.18.0" + +if [[ ! -f "jdk1.8.0_202/SUCCESS" ]]; then + echo "Prepare jdk8 environment" + if [[ -d "jdk1.8.0_202" ]]; then + echo "Remove broken jdk1.8.0_202" + rm -rf jdk1.8.0_202 + fi + echo "Unpackage jdk1.8.0_202" + tar xzf jdk-8u202-linux-x64.tar.gz + touch jdk1.8.0_202/SUCCESS +fi + +if [[ ! -f "doris-bin/SUCCESS" ]]; then + echo "Prepare ${DORIS_PACKAGE} environment" + if [[ -d "doris-bin" ]]; then + echo "Remove broken ${DORIS_PACKAGE}" + rm -rf doris-bin + fi + echo "Unpackage ${DORIS_PACKAGE}" + tar xzf "${DORIS_PACKAGE}".tar.gz + mv "${DORIS_PACKAGE}" doris-bin + touch doris-bin/SUCCESS +fi + +if [[ ! -f "jars/SUCCESS" ]]; then + echo "Prepare jars environment" + if [[ -d "jars" ]]; then + echo "Remove broken jars" + rm -rf jars + fi + mkdir jars + cp ./*.jar jars/ + touch jars/SUCCESS +fi + +cd ../ + +echo "Start docker-compose..." +sudo docker compose -f docker-compose.yml --env-file docker-compose.env up -d + +echo "Start init iceberg and paimon tables..." +sudo docker exec -it doris-iceberg-paimon-jobmanager sql-client.sh -f /opt/flink/sql/init_tables.sql | tee -a init.log >/dev/null + +echo "Start prepare data for tables..." +sudo docker exec -it doris-iceberg-paimon-spark spark-sql --conf spark.sql.extensions=org.apache.paimon.spark.extensions.PaimonSparkSessionExtensions -f /opt/sql/prepare_data.sql | tee -a init.log >/dev/null + +echo "=============================================================================" +echo "Success to launch doris+iceberg+paimon+flink+spark+minio environments!" 
+echo "You can:" +echo " 'bash start_doris_client.sh' to login into doris" +echo " 'bash start_flink_client.sh' to login into flink" +echo " 'bash start_spark_paimon_client.sh' to login into spark for paimon" +echo " 'bash start_spark_iceberg_client.sh' to login into spark for iceberg" +echo "=============================================================================" diff --git a/samples/datalake/iceberg_and_paimon/start_doris_client.sh b/samples/datalake/iceberg_and_paimon/start_doris_client.sh new file mode 100644 index 00000000000..e94c0462a36 --- /dev/null +++ b/samples/datalake/iceberg_and_paimon/start_doris_client.sh @@ -0,0 +1,20 @@ +#!/usr/bin/env bash + +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +sudo docker exec -it doris-iceberg-paimon-doris sh -c "mysql -u root -h doris -P 9030" diff --git a/samples/datalake/iceberg_and_paimon/start_flink_client.sh b/samples/datalake/iceberg_and_paimon/start_flink_client.sh new file mode 100644 index 00000000000..c69e22b31ce --- /dev/null +++ b/samples/datalake/iceberg_and_paimon/start_flink_client.sh @@ -0,0 +1,19 @@ +#!/usr/bin/env bash +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +sudo docker exec -it doris-iceberg-paimon-jobmanager sql-client.sh -i /opt/flink/sql/init_tables.sql diff --git a/samples/datalake/iceberg_and_paimon/start_spark_iceberg_client.sh b/samples/datalake/iceberg_and_paimon/start_spark_iceberg_client.sh new file mode 100644 index 00000000000..0865046e599 --- /dev/null +++ b/samples/datalake/iceberg_and_paimon/start_spark_iceberg_client.sh @@ -0,0 +1,19 @@ +#!/usr/bin/env bash +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +sudo docker exec -it doris-iceberg-paimon-spark spark-sql --conf spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions diff --git a/samples/datalake/iceberg_and_paimon/start_spark_paimon_client.sh b/samples/datalake/iceberg_and_paimon/start_spark_paimon_client.sh new file mode 100644 index 00000000000..b995ffe79d1 --- /dev/null +++ b/samples/datalake/iceberg_and_paimon/start_spark_paimon_client.sh @@ -0,0 +1,19 @@ +#!/usr/bin/env bash +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +sudo docker exec -it doris-iceberg-paimon-spark spark-sql --conf spark.sql.extensions=org.apache.paimon.spark.extensions.PaimonSparkSessionExtensions diff --git a/samples/datalake/iceberg_and_paimon/stop_all.sh b/samples/datalake/iceberg_and_paimon/stop_all.sh new file mode 100644 index 00000000000..63c24475607 --- /dev/null +++ b/samples/datalake/iceberg_and_paimon/stop_all.sh @@ -0,0 +1,19 @@ +#!/usr/bin/env bash +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +sudo docker compose -f docker-compose.yml --env-file docker-compose.env down
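
As a quick sanity check after `bash start_all.sh`, the Doris FE added by this change is also reachable from the host through the mapped query port (`DORIS_QUERY_PORT=19031` in docker-compose.env, forwarded to port 9030 inside the container). This is a minimal sketch, assuming a MySQL client is installed on the host and the default ports and passwordless `root` account from the scripts above are unchanged:

```bash
# Connect to the Doris FE from the host; docker-compose.yml maps
# ${DORIS_QUERY_PORT} (19031 by default) to the FE query port 9030.
mysql -u root -h 127.0.0.1 -P 19031 -e "SHOW CATALOGS;"

# Query the Paimon table populated by prepare_data.sql through the
# 'paimon' catalog created in init_doris.sql.
mysql -u root -h 127.0.0.1 -P 19031 -e "SELECT count(*) FROM paimon.db_paimon.customer;"
```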