diff --git a/docker/datahub-ingestion-base/Dockerfile b/docker/datahub-ingestion-base/Dockerfile index 220c6f7d448af..bfd4ee1143f5e 100644 --- a/docker/datahub-ingestion-base/Dockerfile +++ b/docker/datahub-ingestion-base/Dockerfile @@ -5,7 +5,7 @@ ARG BASE_IMAGE=base ARG ALPINE_REPO_URL=http://dl-cdn.alpinelinux.org/alpine ARG GITHUB_REPO_URL=https://github.com ARG DEBIAN_REPO_URL=https://deb.debian.org/debian -ARG PIP_MIRROR_URL=null +ARG PIP_MIRROR_URL=https://pypi.python.org/simple FROM golang:1-alpine3.18 AS dockerize-binary @@ -26,15 +26,18 @@ RUN go install github.com/jwilder/dockerize@$DOCKERIZE_VERSION FROM python:3.10 as base -ARG DEBIAN_REPO_URL -ARG PIP_MIRROR_URL ARG GITHUB_REPO_URL ENV DEBIAN_FRONTEND noninteractive -# Optionally set corporate mirror for apk and pip +# Optionally set corporate mirror for deb +ARG DEBIAN_REPO_URL RUN if [ "${DEBIAN_REPO_URL}" != "http://deb.debian.org/debian" ] ; then sed -i "s#http.*://deb.debian.org/debian#${DEBIAN_REPO_URL}#g" /etc/apt/sources.list.d/debian.sources ; fi -RUN if [ "${PIP_MIRROR_URL}" != "null" ] ; then pip config set global.index-url ${PIP_MIRROR_URL} ; fi + +# Optionally set corporate mirror for pip +ARG PIP_MIRROR_URL +RUN if [ "${PIP_MIRROR_URL}" != "https://pypi.python.org/simple" ] ; then pip config set global.index-url ${PIP_MIRROR_URL} ; fi +ENV UV_INDEX_URL=${PIP_MIRROR_URL} RUN apt-get update && apt-get install -y -qq \ python3-ldap \ @@ -67,8 +70,7 @@ USER datahub ENV VIRTUAL_ENV=/datahub-ingestion/.venv ENV PATH="${VIRTUAL_ENV}/bin:$PATH" RUN python3 -m venv $VIRTUAL_ENV && \ - uv pip install --no-cache -r requirements.txt && \ - pip uninstall -y acryl-datahub + uv pip install --no-cache -r requirements.txt ENTRYPOINT [ "/entrypoint.sh" ] diff --git a/docker/datahub-ingestion/Dockerfile b/docker/datahub-ingestion/Dockerfile index 6c8829557837c..3f29417dca0d7 100644 --- a/docker/datahub-ingestion/Dockerfile +++ b/docker/datahub-ingestion/Dockerfile @@ -2,12 +2,22 @@ ARG APP_ENV=full ARG BASE_IMAGE=acryldata/datahub-ingestion-base ARG DOCKER_VERSION=head -ARG PIP_MIRROR_URL=null ARG DEBIAN_REPO_URL=https://deb.debian.org/debian +ARG PIP_MIRROR_URL=https://pypi.python.org/simple FROM $BASE_IMAGE:$DOCKER_VERSION as base + +# Optionally set corporate mirror for deb +USER 0 +ARG DEBIAN_REPO_URL +RUN if [ "${DEBIAN_REPO_URL}" != "http://deb.debian.org/debian" ] ; then sed -i "s#http.*://deb.debian.org/debian#${DEBIAN_REPO_URL}#g" /etc/apt/sources.list.d/debian.sources ; fi USER datahub +# Optionally set corporate mirror for pip +ARG PIP_MIRROR_URL +RUN if [ "${PIP_MIRROR_URL}" != "https://pypi.python.org/simple" ] ; then pip config set global.index-url ${PIP_MIRROR_URL} ; fi +ENV UV_INDEX_URL=${PIP_MIRROR_URL} + COPY --chown=datahub ./metadata-ingestion /datahub-ingestion COPY --chown=datahub ./metadata-ingestion-modules/airflow-plugin /datahub-ingestion/airflow-plugin @@ -19,23 +29,17 @@ RUN sed -i.bak "s/__version__ = \"1\!0.0.0.dev0\"/__version__ = \"$(echo $RELEAS cat airflow-plugin/src/datahub_airflow_plugin/__init__.py | grep __version__ FROM base as slim-install -ARG PIP_MIRROR_URL -RUN if [ "${PIP_MIRROR_URL}" != "null" ] ; then pip config set global.index-url ${PIP_MIRROR_URL} ; fi RUN uv pip install --no-cache "acryl-datahub[base,datahub-rest,datahub-kafka,snowflake,bigquery,redshift,mysql,postgres,hive,clickhouse,glue,dbt,looker,lookml,tableau,powerbi,superset,datahub-business-glossary] @ ." FROM base as full-install-build -ARG PIP_MIRROR_URL -ARG DEBIAN_REPO_URL USER 0 -RUN if [ "${DEBIAN_REPO_URL}" != "http://deb.debian.org/debian" ] ; then sed -i "s#http.*://deb.debian.org/debian#${DEBIAN_REPO_URL}#g" /etc/apt/sources.list.d/debian.sources ; fi RUN apt-get update && apt-get install -y -qq maven USER datahub COPY ./docker/datahub-ingestion/pyspark_jars.sh . -RUN if [ "${PIP_MIRROR_URL}" != "null" ] ; then pip config set global.index-url ${PIP_MIRROR_URL} ; fi RUN uv pip install --no-cache "acryl-datahub[base,all] @ ." "acryl-datahub-airflow-plugin[plugin-v2] @ ./airflow-plugin" && \ datahub --version RUN ./pyspark_jars.sh diff --git a/docker/datahub-ingestion/Dockerfile-slim-only b/docker/datahub-ingestion/Dockerfile-slim-only index ba43bd3c3c6be..a5f2a93e8a27b 100644 --- a/docker/datahub-ingestion/Dockerfile-slim-only +++ b/docker/datahub-ingestion/Dockerfile-slim-only @@ -1,26 +1,25 @@ # Defining environment ARG BASE_IMAGE=acryldata/datahub-ingestion-base ARG DOCKER_VERSION=head-slim -ARG PIP_MIRROR_URL=null +ARG PIP_MIRROR_URL=https://pypi.python.org/simple FROM $BASE_IMAGE:$DOCKER_VERSION as base -USER 0 +USER datahub + +# Optionally set corporate mirror for apk and pip +ARG PIP_MIRROR_URL +RUN if [ "${PIP_MIRROR_URL}" != "https://pypi.python.org/simple" ] ; then pip config set global.index-url ${PIP_MIRROR_URL} ; fi +ENV UV_INDEX_URL=${PIP_MIRROR_URL} -COPY ./metadata-ingestion /datahub-ingestion +COPY --chown=datahub ./metadata-ingestion /datahub-ingestion ARG RELEASE_VERSION WORKDIR /datahub-ingestion RUN sed -i.bak "s/__version__ = \"1\!0.0.0.dev0\"/__version__ = \"$(echo $RELEASE_VERSION|sed s/-/+/)\"/" src/datahub/__init__.py && \ - cat src/datahub/__init__.py && \ - chown -R datahub /datahub-ingestion - -USER datahub + cat src/datahub/__init__.py FROM base as slim-install -ARG PIP_MIRROR_URL - -RUN if [ "${PIP_MIRROR_URL}" != "null" ] ; then pip config set global.index-url ${PIP_MIRROR_URL} ; fi RUN uv pip install --no-cache "acryl-datahub[base,datahub-rest,datahub-kafka,snowflake,bigquery,redshift,mysql,postgres,hive,clickhouse,glue,dbt,looker,lookml,tableau,powerbi,superset,datahub-business-glossary] @ ." && \ datahub --version