Skip to content

Commit

Permalink
feat(docker): respect pip mirrors with uv (datahub-project#9963)
Browse files Browse the repository at this point in the history
  • Loading branch information
hsheth2 authored and Alexander Sukhoborov committed Mar 1, 2024
1 parent 4aa1dc3 commit 5a516f8
Show file tree
Hide file tree
Showing 3 changed files with 29 additions and 24 deletions.
16 changes: 9 additions & 7 deletions docker/datahub-ingestion-base/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ ARG BASE_IMAGE=base
ARG ALPINE_REPO_URL=http://dl-cdn.alpinelinux.org/alpine
ARG GITHUB_REPO_URL=https://github.com
ARG DEBIAN_REPO_URL=https://deb.debian.org/debian
ARG PIP_MIRROR_URL=null
ARG PIP_MIRROR_URL=https://pypi.python.org/simple

FROM golang:1-alpine3.18 AS dockerize-binary

Expand All @@ -26,15 +26,18 @@ RUN go install github.com/jwilder/dockerize@$DOCKERIZE_VERSION

FROM python:3.10 as base

ARG DEBIAN_REPO_URL
ARG PIP_MIRROR_URL
ARG GITHUB_REPO_URL

ENV DEBIAN_FRONTEND noninteractive

# Optionally set corporate mirror for apk and pip
# Optionally set corporate mirror for deb
ARG DEBIAN_REPO_URL
RUN if [ "${DEBIAN_REPO_URL}" != "http://deb.debian.org/debian" ] ; then sed -i "s#http.*://deb.debian.org/debian#${DEBIAN_REPO_URL}#g" /etc/apt/sources.list.d/debian.sources ; fi
RUN if [ "${PIP_MIRROR_URL}" != "null" ] ; then pip config set global.index-url ${PIP_MIRROR_URL} ; fi

# Optionally set corporate mirror for pip
ARG PIP_MIRROR_URL
RUN if [ "${PIP_MIRROR_URL}" != "https://pypi.python.org/simple" ] ; then pip config set global.index-url ${PIP_MIRROR_URL} ; fi
ENV UV_INDEX_URL=${PIP_MIRROR_URL}

RUN apt-get update && apt-get install -y -qq \
python3-ldap \
Expand Down Expand Up @@ -67,8 +70,7 @@ USER datahub
ENV VIRTUAL_ENV=/datahub-ingestion/.venv
ENV PATH="${VIRTUAL_ENV}/bin:$PATH"
RUN python3 -m venv $VIRTUAL_ENV && \
uv pip install --no-cache -r requirements.txt && \
pip uninstall -y acryl-datahub
uv pip install --no-cache -r requirements.txt

ENTRYPOINT [ "/entrypoint.sh" ]

Expand Down
18 changes: 11 additions & 7 deletions docker/datahub-ingestion/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,22 @@
ARG APP_ENV=full
ARG BASE_IMAGE=acryldata/datahub-ingestion-base
ARG DOCKER_VERSION=head
ARG PIP_MIRROR_URL=null
ARG DEBIAN_REPO_URL=https://deb.debian.org/debian
ARG PIP_MIRROR_URL=https://pypi.python.org/simple

FROM $BASE_IMAGE:$DOCKER_VERSION as base

# Optionally set corporate mirror for deb
USER 0
ARG DEBIAN_REPO_URL
RUN if [ "${DEBIAN_REPO_URL}" != "http://deb.debian.org/debian" ] ; then sed -i "s#http.*://deb.debian.org/debian#${DEBIAN_REPO_URL}#g" /etc/apt/sources.list.d/debian.sources ; fi
USER datahub

# Optionally set corporate mirror for pip
ARG PIP_MIRROR_URL
RUN if [ "${PIP_MIRROR_URL}" != "https://pypi.python.org/simple" ] ; then pip config set global.index-url ${PIP_MIRROR_URL} ; fi
ENV UV_INDEX_URL=${PIP_MIRROR_URL}

COPY --chown=datahub ./metadata-ingestion /datahub-ingestion
COPY --chown=datahub ./metadata-ingestion-modules/airflow-plugin /datahub-ingestion/airflow-plugin

Expand All @@ -19,23 +29,17 @@ RUN sed -i.bak "s/__version__ = \"1\!0.0.0.dev0\"/__version__ = \"$(echo $RELEAS
cat airflow-plugin/src/datahub_airflow_plugin/__init__.py | grep __version__

FROM base as slim-install
ARG PIP_MIRROR_URL

RUN if [ "${PIP_MIRROR_URL}" != "null" ] ; then pip config set global.index-url ${PIP_MIRROR_URL} ; fi
RUN uv pip install --no-cache "acryl-datahub[base,datahub-rest,datahub-kafka,snowflake,bigquery,redshift,mysql,postgres,hive,clickhouse,glue,dbt,looker,lookml,tableau,powerbi,superset,datahub-business-glossary] @ ."

FROM base as full-install-build
ARG PIP_MIRROR_URL
ARG DEBIAN_REPO_URL

USER 0
RUN if [ "${DEBIAN_REPO_URL}" != "http://deb.debian.org/debian" ] ; then sed -i "s#http.*://deb.debian.org/debian#${DEBIAN_REPO_URL}#g" /etc/apt/sources.list.d/debian.sources ; fi
RUN apt-get update && apt-get install -y -qq maven

USER datahub
COPY ./docker/datahub-ingestion/pyspark_jars.sh .

RUN if [ "${PIP_MIRROR_URL}" != "null" ] ; then pip config set global.index-url ${PIP_MIRROR_URL} ; fi
RUN uv pip install --no-cache "acryl-datahub[base,all] @ ." "acryl-datahub-airflow-plugin[plugin-v2] @ ./airflow-plugin" && \
datahub --version
RUN ./pyspark_jars.sh
Expand Down
19 changes: 9 additions & 10 deletions docker/datahub-ingestion/Dockerfile-slim-only
Original file line number Diff line number Diff line change
@@ -1,26 +1,25 @@
# Defining environment
ARG BASE_IMAGE=acryldata/datahub-ingestion-base
ARG DOCKER_VERSION=head-slim
ARG PIP_MIRROR_URL=null
ARG PIP_MIRROR_URL=https://pypi.python.org/simple

FROM $BASE_IMAGE:$DOCKER_VERSION as base
USER 0
USER datahub

# Optionally set corporate mirror for apk and pip
ARG PIP_MIRROR_URL
RUN if [ "${PIP_MIRROR_URL}" != "https://pypi.python.org/simple" ] ; then pip config set global.index-url ${PIP_MIRROR_URL} ; fi
ENV UV_INDEX_URL=${PIP_MIRROR_URL}

COPY ./metadata-ingestion /datahub-ingestion
COPY --chown=datahub ./metadata-ingestion /datahub-ingestion

ARG RELEASE_VERSION
WORKDIR /datahub-ingestion
RUN sed -i.bak "s/__version__ = \"1\!0.0.0.dev0\"/__version__ = \"$(echo $RELEASE_VERSION|sed s/-/+/)\"/" src/datahub/__init__.py && \
cat src/datahub/__init__.py && \
chown -R datahub /datahub-ingestion

USER datahub
cat src/datahub/__init__.py

FROM base as slim-install

ARG PIP_MIRROR_URL

RUN if [ "${PIP_MIRROR_URL}" != "null" ] ; then pip config set global.index-url ${PIP_MIRROR_URL} ; fi
RUN uv pip install --no-cache "acryl-datahub[base,datahub-rest,datahub-kafka,snowflake,bigquery,redshift,mysql,postgres,hive,clickhouse,glue,dbt,looker,lookml,tableau,powerbi,superset,datahub-business-glossary] @ ." && \
datahub --version

Expand Down

0 comments on commit 5a516f8

Please sign in to comment.