From 227b6225edd77112d95ff5d92ace6fa06836fc7d Mon Sep 17 00:00:00 2001
From: Sammy Sidhu
Date: Wed, 25 Oct 2023 01:25:33 -0700
Subject: [PATCH] [CHORE] bring up fixtures for iceberg (#1527)

* Adds fixtures (Minio / Iceberg Rest server / Spark) to enable iceberg
  integration tests
---
 .github/workflows/nightlies-tests.yml         |   2 +-
 .github/workflows/python-package.yml          |   2 +-
 .../retry_server => iceberg}/__init__.py      |   0
 .../iceberg/docker-compose/Dockerfile         |  71 ++++
 .../iceberg/docker-compose/check-license      |  86 +++++
 .../docker-compose/docker-compose-azurite.yml |  26 ++
 .../docker-compose-gcs-server.yml             |  30 ++
 .../iceberg/docker-compose/docker-compose.yml |  88 +++++
 .../iceberg/docker-compose/entrypoint.sh      |  25 ++
 .../iceberg/docker-compose/provision.py       | 324 ++++++++++++++++++
 .../iceberg/docker-compose/run-azurite.sh     |  33 ++
 .../iceberg/docker-compose/run-gcs-server.sh  |  33 ++
 .../iceberg/docker-compose/run-minio.sh       |  33 ++
 .../docker-compose/spark-defaults.conf        |  29 ++
 .../{ => io}/docker-compose/Dockerfile.nginx  |   0
 .../docker-compose/Dockerfile.s3_retry_server |   0
 .../docker-compose/docker-compose.yml         |   0
 .../nginx-serve-static-files.conf             |   0
 .../docker-compose/retry_server}/__init__.py  |   0
 .../docker-compose/retry_server/main.py       |   0
 .../retry-server-requirements.txt             |   0
 .../retry_server/routers}/__init__.py         |   0
 .../routers/get_retries_parquet_bucket.py     |   0
 .../routers/head_retries_parquet_bucket.py    |   0
 .../routers/rate_limited_echo_gets_bucket.py  |   0
 .../retry_server/utils/__init__.py            |   0
 .../retry_server/utils/parquet_generation.py  |   0
 .../retry_server/utils/responses.py           |   0
 28 files changed, 780 insertions(+), 2 deletions(-)
 rename tests/integration/{docker-compose/retry_server => iceberg}/__init__.py (100%)
 create mode 100644 tests/integration/iceberg/docker-compose/Dockerfile
 create mode 100755 tests/integration/iceberg/docker-compose/check-license
 create mode 100644 tests/integration/iceberg/docker-compose/docker-compose-azurite.yml
 create mode 100644 tests/integration/iceberg/docker-compose/docker-compose-gcs-server.yml
 create mode 100644 tests/integration/iceberg/docker-compose/docker-compose.yml
 create mode 100755 tests/integration/iceberg/docker-compose/entrypoint.sh
 create mode 100644 tests/integration/iceberg/docker-compose/provision.py
 create mode 100755 tests/integration/iceberg/docker-compose/run-azurite.sh
 create mode 100644 tests/integration/iceberg/docker-compose/run-gcs-server.sh
 create mode 100755 tests/integration/iceberg/docker-compose/run-minio.sh
 create mode 100644 tests/integration/iceberg/docker-compose/spark-defaults.conf
 rename tests/integration/{ => io}/docker-compose/Dockerfile.nginx (100%)
 rename tests/integration/{ => io}/docker-compose/Dockerfile.s3_retry_server (100%)
 rename tests/integration/{ => io}/docker-compose/docker-compose.yml (100%)
 rename tests/integration/{ => io}/docker-compose/nginx-serve-static-files.conf (100%)
 rename tests/integration/{docker-compose/retry_server/routers => io/docker-compose/retry_server}/__init__.py (100%)
 rename tests/integration/{ => io}/docker-compose/retry_server/main.py (100%)
 rename tests/integration/{ => io}/docker-compose/retry_server/retry-server-requirements.txt (100%)
 rename tests/integration/{docker-compose/retry_server/utils => io/docker-compose/retry_server/routers}/__init__.py (100%)
 rename tests/integration/{ => io}/docker-compose/retry_server/routers/get_retries_parquet_bucket.py (100%)
 rename tests/integration/{ => io}/docker-compose/retry_server/routers/head_retries_parquet_bucket.py (100%)
 rename tests/integration/{ => io}/docker-compose/retry_server/routers/rate_limited_echo_gets_bucket.py (100%)
 create mode 100644 tests/integration/io/docker-compose/retry_server/utils/__init__.py
 rename tests/integration/{ => io}/docker-compose/retry_server/utils/parquet_generation.py (100%)
 rename tests/integration/{ => io}/docker-compose/retry_server/utils/responses.py (100%)

diff --git a/.github/workflows/nightlies-tests.yml b/.github/workflows/nightlies-tests.yml
index 2bdb565ae9..e7e59e8ff6 100644
--- a/.github/workflows/nightlies-tests.yml
+++ b/.github/workflows/nightlies-tests.yml
@@ -111,7 +111,7 @@ jobs:
     - name: Spin up IO services
       uses: isbang/compose-action@v1.5.1
       with:
-        compose-file: ./tests/integration/docker-compose/docker-compose.yml
+        compose-file: ./tests/integration/io/docker-compose/docker-compose.yml
         down-flags: --volumes
     - name: Run IO integration tests
       run: |
diff --git a/.github/workflows/python-package.yml b/.github/workflows/python-package.yml
index fcd44f21c9..8cde988412 100644
--- a/.github/workflows/python-package.yml
+++ b/.github/workflows/python-package.yml
@@ -307,7 +307,7 @@ jobs:
     - name: Spin up IO services
       uses: isbang/compose-action@v1.5.1
       with:
-        compose-file: ./tests/integration/docker-compose/docker-compose.yml
+        compose-file: ./tests/integration/io/docker-compose/docker-compose.yml
         down-flags: --volumes
     - name: Run IO integration tests
       run: |
diff --git a/tests/integration/docker-compose/retry_server/__init__.py b/tests/integration/iceberg/__init__.py
similarity index 100%
rename from tests/integration/docker-compose/retry_server/__init__.py
rename to tests/integration/iceberg/__init__.py
diff --git a/tests/integration/iceberg/docker-compose/Dockerfile b/tests/integration/iceberg/docker-compose/Dockerfile
new file mode 100644
index 0000000000..77ed84ed4f
--- /dev/null
+++ b/tests/integration/iceberg/docker-compose/Dockerfile
@@ -0,0 +1,71 @@
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+FROM python:3.9-bullseye
+
+RUN apt-get -qq update && \
+    apt-get -qq install -y --no-install-recommends \
+    sudo \
+    curl \
+    vim \
+    unzip \
+    openjdk-11-jdk \
+    build-essential \
+    software-properties-common \
+    ssh && \
+    apt-get -qq clean && \
+    rm -rf /var/lib/apt/lists/*
+
+# Optional env variables
+ENV SPARK_HOME=${SPARK_HOME:-"/opt/spark"}
+ENV HADOOP_HOME=${HADOOP_HOME:-"/opt/hadoop"}
+ENV PYTHONPATH=$SPARK_HOME/python:$SPARK_HOME/python/lib/py4j-0.10.9.7-src.zip:$PYTHONPATH
+
+RUN mkdir -p ${HADOOP_HOME} && mkdir -p ${SPARK_HOME} && mkdir -p /home/iceberg/spark-events
+WORKDIR ${SPARK_HOME}
+
+ENV SPARK_VERSION=3.4.1
+ENV ICEBERG_SPARK_RUNTIME_VERSION=3.4_2.12
+ENV ICEBERG_VERSION=1.4.0
+ENV AWS_SDK_VERSION=2.20.18
+ENV PYICEBERG_VERSION=0.4.0
+
+RUN curl --retry 3 -s -C - https://dlcdn.apache.org/spark/spark-${SPARK_VERSION}/spark-${SPARK_VERSION}-bin-hadoop3.tgz -o spark-${SPARK_VERSION}-bin-hadoop3.tgz \
+    && tar xzf spark-${SPARK_VERSION}-bin-hadoop3.tgz --directory /opt/spark --strip-components 1 \
+    && rm -rf spark-${SPARK_VERSION}-bin-hadoop3.tgz
+
+# Download iceberg spark runtime
+RUN curl -s https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}/${ICEBERG_VERSION}/iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}-${ICEBERG_VERSION}.jar -Lo iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}-${ICEBERG_VERSION}.jar \
+    && mv iceberg-spark-runtime-${ICEBERG_SPARK_RUNTIME_VERSION}-${ICEBERG_VERSION}.jar /opt/spark/jars
+
+# Download AWS bundle
+RUN curl -s https://repo1.maven.org/maven2/org/apache/iceberg/iceberg-aws-bundle/${ICEBERG_VERSION}/iceberg-aws-bundle-${ICEBERG_VERSION}.jar -Lo iceberg-aws-bundle-${ICEBERG_VERSION}.jar \
+    && mv iceberg-aws-bundle-${ICEBERG_VERSION}.jar /opt/spark/jars
+
+COPY spark-defaults.conf /opt/spark/conf
+ENV PATH="/opt/spark/sbin:/opt/spark/bin:${PATH}"
+
+RUN chmod u+x /opt/spark/sbin/* && \
+    chmod u+x /opt/spark/bin/*
+
+RUN pip3 install -q ipython
+
+RUN pip3 install "pyiceberg[s3fs]==${PYICEBERG_VERSION}"
+
+COPY entrypoint.sh .
+COPY provision.py .
+
+ENTRYPOINT ["./entrypoint.sh"]
+CMD ["notebook"]
diff --git a/tests/integration/iceberg/docker-compose/check-license b/tests/integration/iceberg/docker-compose/check-license
new file mode 100755
index 0000000000..6b1a9dfff2
--- /dev/null
+++ b/tests/integration/iceberg/docker-compose/check-license
@@ -0,0 +1,86 @@
+#!/usr/bin/env bash
+
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+
+acquire_rat_jar () {
+
+  URL="https://repo.maven.apache.org/maven2/org/apache/rat/apache-rat/${RAT_VERSION}/apache-rat-${RAT_VERSION}.jar"
+
+  JAR="$rat_jar"
+
+  # Download rat launch jar if it hasn't been downloaded yet
+  if [ ! -f "$JAR" ]; then
+    # Download
+    printf "Attempting to fetch rat\n"
+    JAR_DL="${JAR}.part"
+    if [ $(command -v curl) ]; then
+      curl -L --silent "${URL}" > "$JAR_DL" && mv "$JAR_DL" "$JAR"
+    elif [ $(command -v wget) ]; then
+      wget --quiet ${URL} -O "$JAR_DL" && mv "$JAR_DL" "$JAR"
+    else
+      printf "You do not have curl or wget installed, please install rat manually.\n"
+      exit -1
+    fi
+  fi
+
+  unzip -tq "$JAR" &> /dev/null
+  if [ $? -ne 0 ]; then
+    # We failed to download
+    rm "$JAR"
+    printf "Our attempt to download rat locally to ${JAR} failed. Please install rat manually.\n"
+    exit -1
+  fi
+}
+
+# Go to the Spark project root directory
+FWDIR="$(cd "`dirname "$0"`"/..; pwd)"
+cd "$FWDIR"
+
+if test -x "$JAVA_HOME/bin/java"; then
+  declare java_cmd="$JAVA_HOME/bin/java"
+else
+  declare java_cmd=java
+fi
+
+export RAT_VERSION=0.15
+export rat_jar="$FWDIR"/lib/apache-rat-${RAT_VERSION}.jar
+mkdir -p "$FWDIR"/lib
+
+[[ -f "$rat_jar" ]] || acquire_rat_jar || {
+  echo "Download failed. Obtain the rat jar manually and place it at $rat_jar"
+  exit 1
+}
+
+mkdir -p build
+$java_cmd -jar "$rat_jar" -E "$FWDIR"/dev/.rat-excludes -d "$FWDIR" > build/rat-results.txt
+
+if [ $? -ne 0 ]; then
+  echo "RAT exited abnormally"
+  exit 1
+fi
+
+ERRORS="$(cat build/rat-results.txt | grep -e "??")"
+
+if test ! -z "$ERRORS"; then
+  echo "Could not find Apache license headers in the following files:"
+  echo "$ERRORS"
+  exit 1
+else
+  echo -e "RAT checks passed."
+fi
diff --git a/tests/integration/iceberg/docker-compose/docker-compose-azurite.yml b/tests/integration/iceberg/docker-compose/docker-compose-azurite.yml
new file mode 100644
index 0000000000..8f2f7ebf92
--- /dev/null
+++ b/tests/integration/iceberg/docker-compose/docker-compose-azurite.yml
@@ -0,0 +1,26 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+version: '3'
+
+services:
+  azurite:
+    image: mcr.microsoft.com/azure-storage/azurite
+    container_name: azurite
+    hostname: azurite
+    ports:
+      - 10000:10000
+    command: [azurite-blob, --loose, --blobHost, 0.0.0.0]
diff --git a/tests/integration/iceberg/docker-compose/docker-compose-gcs-server.yml b/tests/integration/iceberg/docker-compose/docker-compose-gcs-server.yml
new file mode 100644
index 0000000000..a767af6385
--- /dev/null
+++ b/tests/integration/iceberg/docker-compose/docker-compose-gcs-server.yml
@@ -0,0 +1,30 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+version: '3'
+
+services:
+  gcs-server:
+    image: fsouza/fake-gcs-server
+    container_name: gcs-server
+    ports:
+      - 4443:4443
+    entrypoint: >
+      /bin/sh -c "
+      mkdir -p /data/warehouse;
+      /bin/fake-gcs-server -data /data -scheme http;
+      exit 0;
+      "
diff --git a/tests/integration/iceberg/docker-compose/docker-compose.yml b/tests/integration/iceberg/docker-compose/docker-compose.yml
new file mode 100644
index 0000000000..53b96dff23
--- /dev/null
+++ b/tests/integration/iceberg/docker-compose/docker-compose.yml
@@ -0,0 +1,88 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+version: '3'
+
+services:
+  spark-iceberg:
+    image: python-integration
+    container_name: pyiceberg-spark
+    build: .
+    networks:
+      iceberg_net:
+    depends_on:
+      - rest
+      - minio
+    volumes:
+      - ./warehouse:/home/iceberg/warehouse
+    environment:
+      - AWS_ACCESS_KEY_ID=admin
+      - AWS_SECRET_ACCESS_KEY=password
+      - AWS_REGION=us-east-1
+    ports:
+      - 8888:8888
+      - 8080:8080
+    links:
+      - rest:rest
+      - minio:minio
+  rest:
+    image: tabulario/iceberg-rest
+    container_name: pyiceberg-rest
+    networks:
+      iceberg_net:
+    ports:
+      - 8181:8181
+    environment:
+      - AWS_ACCESS_KEY_ID=admin
+      - AWS_SECRET_ACCESS_KEY=password
+      - AWS_REGION=us-east-1
+      - CATALOG_WAREHOUSE=s3://warehouse/
+      - CATALOG_IO__IMPL=org.apache.iceberg.aws.s3.S3FileIO
+      - CATALOG_S3_ENDPOINT=http://minio:9000
+  minio:
+    image: minio/minio
+    container_name: pyiceberg-minio
+    environment:
+      - MINIO_ROOT_USER=admin
+      - MINIO_ROOT_PASSWORD=password
+      - MINIO_DOMAIN=minio
+    networks:
+      iceberg_net:
+        aliases:
+          - warehouse.minio
+    ports:
+      - 9000:9000
+    command: [server, /data]
+  mc:
+    depends_on:
+      - minio
+    image: minio/mc
+    container_name: pyiceberg-mc
+    networks:
+      iceberg_net:
+    environment:
+      - AWS_ACCESS_KEY_ID=admin
+      - AWS_SECRET_ACCESS_KEY=password
+      - AWS_REGION=us-east-1
+    entrypoint: >
+      /bin/sh -c "
+      until (/usr/bin/mc config host add minio http://minio:9000 admin password) do echo '...waiting...' && sleep 1; done;
+      /usr/bin/mc mb minio/warehouse;
+      /usr/bin/mc policy set public minio/warehouse;
+      tail -f /dev/null
+      "
+networks:
+  iceberg_net:
diff --git a/tests/integration/iceberg/docker-compose/entrypoint.sh b/tests/integration/iceberg/docker-compose/entrypoint.sh
new file mode 100755
index 0000000000..52a2d2e660
--- /dev/null
+++ b/tests/integration/iceberg/docker-compose/entrypoint.sh
@@ -0,0 +1,25 @@
+#!/bin/bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+start-master.sh -p 7077
+start-worker.sh spark://spark-iceberg:7077
+start-history-server.sh
+python3 provision.py
+tail -f /dev/null
diff --git a/tests/integration/iceberg/docker-compose/provision.py b/tests/integration/iceberg/docker-compose/provision.py
new file mode 100644
index 0000000000..e1a7a6c0de
--- /dev/null
+++ b/tests/integration/iceberg/docker-compose/provision.py
@@ -0,0 +1,324 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from __future__ import annotations
+
+from pyiceberg.catalog import load_catalog
+from pyiceberg.schema import Schema
+from pyiceberg.types import FixedType, NestedField, UUIDType
+from pyspark.sql import SparkSession
+from pyspark.sql.functions import current_date, date_add, expr
+
+spark = SparkSession.builder.getOrCreate()
+
+spark.sql(
+    """
+  CREATE DATABASE IF NOT EXISTS default;
+"""
+)
+
+schema = Schema(
+    NestedField(field_id=1, name="uuid_col", field_type=UUIDType(), required=False),
+    NestedField(field_id=2, name="fixed_col", field_type=FixedType(25), required=False),
+)
+
+catalog = load_catalog(
+    "local",
+    **{
+        "type": "rest",
+        "uri": "http://rest:8181",
+        "s3.endpoint": "http://minio:9000",
+        "s3.access-key-id": "admin",
+        "s3.secret-access-key": "password",
+    },
+)
+
+catalog.create_table(identifier="default.test_uuid_and_fixed_unpartitioned", schema=schema)
+
+spark.sql(
+    """
+    INSERT INTO default.test_uuid_and_fixed_unpartitioned VALUES
+    ('102cb62f-e6f8-4eb0-9973-d9b012ff0967', CAST('1234567890123456789012345' AS BINARY)),
+    ('ec33e4b2-a834-4cc3-8c4a-a1d3bfc2f226', CAST('1231231231231231231231231' AS BINARY)),
+    ('639cccce-c9d2-494a-a78c-278ab234f024', CAST('12345678901234567ass12345' AS BINARY)),
+    ('c1b0d8e0-0b0e-4b1e-9b0a-0e0b0d0c0a0b', CAST('asdasasdads12312312312111' AS BINARY)),
+    ('923dae77-83d6-47cd-b4b0-d383e64ee57e', CAST('qweeqwwqq1231231231231111' AS BINARY));
+    """
+)
+
+spark.sql(
+    """
+  CREATE OR REPLACE TABLE default.test_null_nan
+  USING iceberg
+  AS SELECT
+    1            AS idx,
+    float('NaN') AS col_numeric
+UNION ALL SELECT
+    2            AS idx,
+    null         AS col_numeric
+UNION ALL SELECT
+    3            AS idx,
+    1            AS col_numeric
+"""
+)
+
+spark.sql(
+    """
+  CREATE OR REPLACE TABLE default.test_null_nan_rewritten
+  USING iceberg
+  AS SELECT * FROM default.test_null_nan
+"""
+)
+
+spark.sql(
+    """
+CREATE OR REPLACE TABLE default.test_limit as
+  SELECT * LATERAL VIEW explode(ARRAY(1, 2, 3, 4, 5, 6, 7, 8, 9, 10)) AS idx;
+"""
+)
+
+spark.sql(
+    """
+CREATE OR REPLACE TABLE default.test_positional_mor_deletes (
+    dt     date,
+    number integer,
+    letter string
+)
+USING iceberg
+TBLPROPERTIES (
+    'write.delete.mode'='merge-on-read',
+    'write.update.mode'='merge-on-read',
+    'write.merge.mode'='merge-on-read',
+    'format-version'='2'
+);
+"""
+)
+
+# Partitioning is not really needed, but there is a bug:
+# https://github.com/apache/iceberg/pull/7685
+spark.sql(
+    """
+  ALTER TABLE default.test_positional_mor_deletes ADD PARTITION FIELD years(dt) AS dt_years
+"""
+)
+
+spark.sql(
+    """
+INSERT INTO default.test_positional_mor_deletes
+VALUES
+    (CAST('2023-03-01' AS date), 1, 'a'),
+    (CAST('2023-03-02' AS date), 2, 'b'),
+    (CAST('2023-03-03' AS date), 3, 'c'),
+    (CAST('2023-03-04' AS date), 4, 'd'),
+    (CAST('2023-03-05' AS date), 5, 'e'),
+    (CAST('2023-03-06' AS date), 6, 'f'),
+    (CAST('2023-03-07' AS date), 7, 'g'),
+    (CAST('2023-03-08' AS date), 8, 'h'),
+    (CAST('2023-03-09' AS date), 9, 'i'),
+    (CAST('2023-03-10' AS date), 10, 'j'),
+    (CAST('2023-03-11' AS date), 11, 'k'),
+    (CAST('2023-03-12' AS date), 12, 'l');
+"""
+)
+
+spark.sql(
+    """
+ALTER TABLE default.test_positional_mor_deletes CREATE TAG tag_12
+    """
+)
+
+spark.sql(
+    """
+ALTER TABLE default.test_positional_mor_deletes CREATE BRANCH without_5
+    """
+)
+
+spark.sql(
+    """
+DELETE FROM default.test_positional_mor_deletes.branch_without_5 WHERE number = 5
+    """
+)
+
+
+spark.sql(
+    """
+DELETE FROM default.test_positional_mor_deletes WHERE number = 9
+"""
+)
+
+spark.sql(
+    """
+  CREATE OR REPLACE TABLE default.test_positional_mor_double_deletes (
+    dt     date,
+    number integer,
+    letter string
+  )
+  USING iceberg
+  TBLPROPERTIES (
+    'write.delete.mode'='merge-on-read',
+    'write.update.mode'='merge-on-read',
+    'write.merge.mode'='merge-on-read',
+    'format-version'='2'
+  );
+"""
+)
+
+# Partitioning is not really needed, but there is a bug:
+# https://github.com/apache/iceberg/pull/7685
+spark.sql(
+    """
+  ALTER TABLE default.test_positional_mor_double_deletes ADD PARTITION FIELD years(dt) AS dt_years
+"""
+)
+
+spark.sql(
+    """
+INSERT INTO default.test_positional_mor_double_deletes
+VALUES
+    (CAST('2023-03-01' AS date), 1, 'a'),
+    (CAST('2023-03-02' AS date), 2, 'b'),
+    (CAST('2023-03-03' AS date), 3, 'c'),
+    (CAST('2023-03-04' AS date), 4, 'd'),
+    (CAST('2023-03-05' AS date), 5, 'e'),
+    (CAST('2023-03-06' AS date), 6, 'f'),
+    (CAST('2023-03-07' AS date), 7, 'g'),
+    (CAST('2023-03-08' AS date), 8, 'h'),
+    (CAST('2023-03-09' AS date), 9, 'i'),
+    (CAST('2023-03-10' AS date), 10, 'j'),
+    (CAST('2023-03-11' AS date), 11, 'k'),
+    (CAST('2023-03-12' AS date), 12, 'l');
+"""
+)
+
+spark.sql(
+    """
+  DELETE FROM default.test_positional_mor_double_deletes WHERE number = 9
+"""
+)
+
+spark.sql(
+    """
+  DELETE FROM default.test_positional_mor_double_deletes WHERE letter == 'f'
+"""
+)
+
+all_types_dataframe = (
+    spark.range(0, 5, 1, 5)
+    .withColumnRenamed("id", "longCol")
+    .withColumn("intCol", expr("CAST(longCol AS INT)"))
+    .withColumn("floatCol", expr("CAST(longCol AS FLOAT)"))
+    .withColumn("doubleCol", expr("CAST(longCol AS DOUBLE)"))
+    .withColumn("dateCol", date_add(current_date(), 1))
+    .withColumn("timestampCol", expr("TO_TIMESTAMP(dateCol)"))
+    .withColumn("stringCol", expr("CAST(dateCol AS STRING)"))
+    .withColumn("booleanCol", expr("longCol > 5"))
+    .withColumn("binaryCol", expr("CAST(longCol AS BINARY)"))
+    .withColumn("byteCol", expr("CAST(longCol AS BYTE)"))
+    .withColumn("decimalCol", expr("CAST(longCol AS DECIMAL(10, 2))"))
+    .withColumn("shortCol", expr("CAST(longCol AS SHORT)"))
+    .withColumn("mapCol", expr("MAP(longCol, decimalCol)"))
+    .withColumn("arrayCol", expr("ARRAY(longCol)"))
+    .withColumn("structCol", expr("STRUCT(mapCol, arrayCol)"))
+)
+
+all_types_dataframe.writeTo("default.test_all_types").tableProperty("format-version", "2").partitionedBy(
+    "intCol"
+).createOrReplace()
+
+for table_name, partition in [
+    ("test_partitioned_by_identity", "ts"),
+    ("test_partitioned_by_years", "years(dt)"),
+    ("test_partitioned_by_months", "months(dt)"),
+    ("test_partitioned_by_days", "days(ts)"),
+    ("test_partitioned_by_hours", "hours(ts)"),
+    ("test_partitioned_by_truncate", "truncate(1, letter)"),
+    ("test_partitioned_by_bucket", "bucket(16, number)"),
+]:
+    spark.sql(
+        f"""
+      CREATE OR REPLACE TABLE default.{table_name} (
+        dt     date,
+        ts     timestamp,
+        number integer,
+        letter string
+      )
+      USING iceberg;
+    """
+    )
+
+    spark.sql(f"ALTER TABLE default.{table_name} ADD PARTITION FIELD {partition}")
+
+    spark.sql(
+        f"""
+    INSERT INTO default.{table_name}
+    VALUES
+        (CAST('2022-03-01' AS date), CAST('2022-03-01 01:22:00' AS timestamp), 1, 'a'),
+        (CAST('2022-03-02' AS date), CAST('2022-03-02 02:22:00' AS timestamp), 2, 'b'),
+        (CAST('2022-03-03' AS date), CAST('2022-03-03 03:22:00' AS timestamp), 3, 'c'),
+        (CAST('2022-03-04' AS date), CAST('2022-03-04 04:22:00' AS timestamp), 4, 'd'),
+        (CAST('2023-03-05' AS date), CAST('2023-03-05 05:22:00' AS timestamp), 5, 'e'),
+        (CAST('2023-03-06' AS date), CAST('2023-03-06 06:22:00' AS timestamp), 6, 'f'),
+        (CAST('2023-03-07' AS date), CAST('2023-03-07 07:22:00' AS timestamp), 7, 'g'),
+        (CAST('2023-03-08' AS date), CAST('2023-03-08 08:22:00' AS timestamp), 8, 'h'),
+        (CAST('2023-03-09' AS date), CAST('2023-03-09 09:22:00' AS timestamp), 9, 'i'),
+        (CAST('2023-03-10' AS date), CAST('2023-03-10 10:22:00' AS timestamp), 10, 'j'),
+        (CAST('2023-03-11' AS date), CAST('2023-03-11 11:22:00' AS timestamp), 11, 'k'),
+        (CAST('2023-03-12' AS date), CAST('2023-03-12 12:22:00' AS timestamp), 12, 'l');
+    """
+    )
+
+# There is an issue with CREATE OR REPLACE
+# https://github.com/apache/iceberg/issues/8756
+spark.sql(
+    """
+DROP TABLE IF EXISTS default.test_table_version
+"""
+)
+
+spark.sql(
+    """
+CREATE TABLE default.test_table_version (
+    dt     date,
+    number integer,
+    letter string
+)
+USING iceberg
+TBLPROPERTIES (
+    'format-version'='1'
+);
+"""
+)
+
+spark.sql(
+    """
+CREATE TABLE default.test_table_sanitized_character (
+    `letter/abc` string
+)
+USING iceberg
+TBLPROPERTIES (
+    'format-version'='1'
+);
+"""
+)
+
+spark.sql(
+    f"""
+INSERT INTO default.test_table_sanitized_character
+VALUES
+    ('123')
+"""
+)
diff --git a/tests/integration/iceberg/docker-compose/run-azurite.sh b/tests/integration/iceberg/docker-compose/run-azurite.sh
new file mode 100755
index 0000000000..c218155894
--- /dev/null
+++ b/tests/integration/iceberg/docker-compose/run-azurite.sh
@@ -0,0 +1,33 @@
+#!/bin/bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+set -ex
+
+if [ $(docker ps -q --filter "name=azurite" --filter "status=running" ) ]; then
+    echo "Azurite backend running"
+else
+    docker-compose -f dev/docker-compose-azurite.yml kill
+    docker-compose -f dev/docker-compose-azurite.yml up -d
+    while [ -z $(docker ps -q --filter "name=azurite" --filter "status=running" ) ]
+    do
+        echo "Waiting for Azurite"
+        sleep 1
+    done
+fi
diff --git a/tests/integration/iceberg/docker-compose/run-gcs-server.sh b/tests/integration/iceberg/docker-compose/run-gcs-server.sh
new file mode 100644
index 0000000000..289d89009a
--- /dev/null
+++ b/tests/integration/iceberg/docker-compose/run-gcs-server.sh
@@ -0,0 +1,33 @@
+#!/bin/bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+set -ex
+
+if [ $(docker ps -q --filter "name=gcs-server" --filter "status=running" ) ]; then
+    echo "Fake GCS Server running"
+else
+    docker-compose -f dev/docker-compose-gcs-server.yml kill
+    docker-compose -f dev/docker-compose-gcs-server.yml up -d
+    while [ -z $(docker ps -q --filter "name=gcs-server" --filter "status=running" ) ]
+    do
+        echo "Waiting for Fake GCS Server"
+        sleep 1
+    done
+fi
diff --git a/tests/integration/iceberg/docker-compose/run-minio.sh b/tests/integration/iceberg/docker-compose/run-minio.sh
new file mode 100755
index 0000000000..0db37012e7
--- /dev/null
+++ b/tests/integration/iceberg/docker-compose/run-minio.sh
@@ -0,0 +1,33 @@
+#!/bin/bash
+#
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+set -ex
+
+if [ $(docker ps -q --filter "name=pyiceberg-minio" --filter "status=running" ) ]; then
+    echo "Minio backend running"
+else
+    docker-compose -f dev/docker-compose.yml kill
+    docker-compose -f dev/docker-compose.yml up -d
+    while [ -z $(docker ps -q --filter "name=pyiceberg-minio" --filter "status=running" ) ]
+    do
+        echo "Waiting for Minio"
+        sleep 1
+    done
+fi
diff --git a/tests/integration/iceberg/docker-compose/spark-defaults.conf b/tests/integration/iceberg/docker-compose/spark-defaults.conf
new file mode 100644
index 0000000000..56c345432a
--- /dev/null
+++ b/tests/integration/iceberg/docker-compose/spark-defaults.conf
@@ -0,0 +1,29 @@
+#
+# Licensed to the Apache Software Foundation (ASF) under one or more
+# contributor license agreements. See the NOTICE file distributed with
+# this work for additional information regarding copyright ownership.
+# The ASF licenses this file to You under the Apache License, Version 2.0
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+#    http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+spark.sql.extensions                  org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions
+spark.sql.catalog.demo                org.apache.iceberg.spark.SparkCatalog
+spark.sql.catalog.demo.type           rest
+spark.sql.catalog.demo.uri            http://rest:8181
+spark.sql.catalog.demo.io-impl        org.apache.iceberg.aws.s3.S3FileIO
+spark.sql.catalog.demo.warehouse      s3://warehouse/wh/
+spark.sql.catalog.demo.s3.endpoint    http://minio:9000
+spark.sql.defaultCatalog              demo
+spark.eventLog.enabled                true
+spark.eventLog.dir                    /home/iceberg/spark-events
+spark.history.fs.logDirectory         /home/iceberg/spark-events
+spark.sql.catalogImplementation       in-memory
diff --git a/tests/integration/docker-compose/Dockerfile.nginx b/tests/integration/io/docker-compose/Dockerfile.nginx
similarity index 100%
rename from tests/integration/docker-compose/Dockerfile.nginx
rename to tests/integration/io/docker-compose/Dockerfile.nginx
diff --git a/tests/integration/docker-compose/Dockerfile.s3_retry_server b/tests/integration/io/docker-compose/Dockerfile.s3_retry_server
similarity index 100%
rename from tests/integration/docker-compose/Dockerfile.s3_retry_server
rename to tests/integration/io/docker-compose/Dockerfile.s3_retry_server
diff --git a/tests/integration/docker-compose/docker-compose.yml b/tests/integration/io/docker-compose/docker-compose.yml
similarity index 100%
rename from tests/integration/docker-compose/docker-compose.yml
rename to tests/integration/io/docker-compose/docker-compose.yml
diff --git a/tests/integration/docker-compose/nginx-serve-static-files.conf b/tests/integration/io/docker-compose/nginx-serve-static-files.conf
similarity index 100%
rename from tests/integration/docker-compose/nginx-serve-static-files.conf
rename to tests/integration/io/docker-compose/nginx-serve-static-files.conf
diff --git a/tests/integration/docker-compose/retry_server/routers/__init__.py b/tests/integration/io/docker-compose/retry_server/__init__.py
similarity index 100%
rename from tests/integration/docker-compose/retry_server/routers/__init__.py
rename to tests/integration/io/docker-compose/retry_server/__init__.py
diff --git a/tests/integration/docker-compose/retry_server/main.py b/tests/integration/io/docker-compose/retry_server/main.py
similarity index 100%
rename from tests/integration/docker-compose/retry_server/main.py
rename to tests/integration/io/docker-compose/retry_server/main.py
diff --git a/tests/integration/docker-compose/retry_server/retry-server-requirements.txt b/tests/integration/io/docker-compose/retry_server/retry-server-requirements.txt
similarity index 100%
rename from tests/integration/docker-compose/retry_server/retry-server-requirements.txt
rename to tests/integration/io/docker-compose/retry_server/retry-server-requirements.txt
diff --git a/tests/integration/docker-compose/retry_server/utils/__init__.py b/tests/integration/io/docker-compose/retry_server/routers/__init__.py
similarity index 100%
rename from tests/integration/docker-compose/retry_server/utils/__init__.py
rename to tests/integration/io/docker-compose/retry_server/routers/__init__.py
diff --git a/tests/integration/docker-compose/retry_server/routers/get_retries_parquet_bucket.py b/tests/integration/io/docker-compose/retry_server/routers/get_retries_parquet_bucket.py
similarity index 100%
rename from tests/integration/docker-compose/retry_server/routers/get_retries_parquet_bucket.py
rename to tests/integration/io/docker-compose/retry_server/routers/get_retries_parquet_bucket.py
diff --git a/tests/integration/docker-compose/retry_server/routers/head_retries_parquet_bucket.py b/tests/integration/io/docker-compose/retry_server/routers/head_retries_parquet_bucket.py
similarity index 100%
rename from tests/integration/docker-compose/retry_server/routers/head_retries_parquet_bucket.py
rename to tests/integration/io/docker-compose/retry_server/routers/head_retries_parquet_bucket.py
diff --git a/tests/integration/docker-compose/retry_server/routers/rate_limited_echo_gets_bucket.py b/tests/integration/io/docker-compose/retry_server/routers/rate_limited_echo_gets_bucket.py
similarity index 100%
rename from tests/integration/docker-compose/retry_server/routers/rate_limited_echo_gets_bucket.py
rename to tests/integration/io/docker-compose/retry_server/routers/rate_limited_echo_gets_bucket.py
diff --git a/tests/integration/io/docker-compose/retry_server/utils/__init__.py b/tests/integration/io/docker-compose/retry_server/utils/__init__.py
new file mode 100644
index 0000000000..e69de29bb2
diff --git a/tests/integration/docker-compose/retry_server/utils/parquet_generation.py b/tests/integration/io/docker-compose/retry_server/utils/parquet_generation.py
similarity index 100%
rename from tests/integration/docker-compose/retry_server/utils/parquet_generation.py
rename to tests/integration/io/docker-compose/retry_server/utils/parquet_generation.py
diff --git a/tests/integration/docker-compose/retry_server/utils/responses.py b/tests/integration/io/docker-compose/retry_server/utils/responses.py
similarity index 100%
rename from tests/integration/docker-compose/retry_server/utils/responses.py
rename to tests/integration/io/docker-compose/retry_server/utils/responses.py
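A minimal sketch of how these iceberg fixtures might be brought up locally (not part of the patch; it assumes Docker and docker-compose are installed, uses the compose file added above, and mirrors the `down-flags: --volumes` that CI passes for the IO services):

    # Build the python-integration image and start Minio, the Iceberg REST catalog, and Spark;
    # the spark-iceberg entrypoint then runs provision.py to create the test tables.
    docker-compose -f tests/integration/iceberg/docker-compose/docker-compose.yml up -d --build

    # Tear everything down (including the Minio volume) when finished.
    docker-compose -f tests/integration/iceberg/docker-compose/docker-compose.yml down --volumes

Once the containers are healthy, the REST catalog is reachable on localhost:8181 and Minio on localhost:9000, which is where the iceberg integration tests would point.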