diff --git a/docker/dockerfile.merlin b/docker/dockerfile.merlin index 8f9aa3df8..77e08edbe 100644 --- a/docker/dockerfile.merlin +++ b/docker/dockerfile.merlin @@ -3,10 +3,12 @@ ARG TRITON_VERSION=23.06 ARG DLFW_VERSION=23.06 ARG FULL_IMAGE=nvcr.io/nvidia/tritonserver:${TRITON_VERSION}-py3 +ARG SDK_IMAGE=nvcr.io/nvidia/tritonserver:${TRITON_VERSION}-py3-sdk ARG BASE_IMAGE=nvcr.io/nvidia/tritonserver:${TRITON_VERSION}-py3-min ARG DLFW_IMAGE=nvcr.io/nvidia/tensorflow:${TRITON_VERSION}-tf2-py3 FROM ${FULL_IMAGE} as triton +FROM ${SDK_IMAGE} as sdk FROM ${DLFW_IMAGE} as dlfw FROM ${BASE_IMAGE} as build @@ -118,8 +120,9 @@ COPY --chown=1000:1000 --from=triton /opt/tritonserver/lib lib/ COPY --chown=1000:1000 --from=triton /opt/tritonserver/include include/ COPY --chown=1000:1000 --from=triton /opt/tritonserver/repoagents/ repoagents/ COPY --chown=1000:1000 --from=triton /opt/tritonserver/backends/python backends/ -# NOTE 2023-07: fil-backend is not available on ARM. -COPY --chown=1000:1000 --from=triton /opt/tritonserver/backends/fil* backends/fil/ +# NOTE 2023-09: fil-backend is not available on ARM. Some docker versions flag an error if there is +# not a single source file to copy. To avoid this, we also specify a small dummy file. +COPY --chown=1000:1000 --from=triton /opt/tritonserver/LICENSE /opt/tritonserver/backends/fil/* backends/fil/ COPY --chown=1000:1000 --from=triton /usr/bin/serve /usr/bin/. ENV PATH=/opt/tritonserver/bin:${PATH}: @@ -187,11 +190,12 @@ RUN ARCH=$([ "${TARGETARCH}" = "arm64" ] && echo "sbsa" || echo "x86_64") && \ python3 \ python3-pip \ python3-dev \ + python3-libnvinfer \ rapidjson-dev \ tree \ wget \ zlib1g-dev \ - # Required to build RocksDB and RdKafka.. + # Required to build RocksDB and RdKafka. 
libgflags-dev \ libbz2-dev \ libsnappy-dev \ @@ -208,11 +212,6 @@ RUN ARCH=$([ "${TARGETARCH}" = "arm64" ] && echo "sbsa" || echo "x86_64") && \ openssh-server \ # [ HugeCTR ] libaio-dev && \ - # NOTE: libnvinfer is installed anyway, just Python bindings are missing on ARM. - if [[ "$TARGETARCH" != "arm64" ]]; then \ - # TensorRT dependencies - apt install -y --no-install-recommends python3-libnvinfer \ - ; fi && \ apt autoremove -y && \ apt clean && \ rm -rf /var/lib/apt/lists/* @@ -225,7 +224,7 @@ ENV LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${JAVA_HOME}/lib:${JAVA_HOME}/lib/server # Binaries COPY --chown=1000:1000 --from=build /usr/local/bin/cmake /usr/local/bin/ COPY --chown=1000:1000 --from=build /usr/local/bin/pytest /usr/local/bin/ -COPY --chown=1000:1000 --from=build /usr/local/bin/perf_* /usr/local/bin/ +COPY --chown=1000:1000 --from=sdk /usr/local/bin/perf_* /usr/local/bin/ # Triton Server WORKDIR /opt/tritonserver @@ -237,8 +236,9 @@ COPY --chown=1000:1000 --from=triton /opt/tritonserver/lib lib/ COPY --chown=1000:1000 --from=triton /opt/tritonserver/include include/ COPY --chown=1000:1000 --from=triton /opt/tritonserver/repoagents/ repoagents/ COPY --chown=1000:1000 --from=triton /opt/tritonserver/backends/python backends/python/ -# NOTE 2023-07: fil-backend is not available on ARM. -COPY --chown=1000:1000 --from=triton /opt/tritonserver/backends/fil* backends/fil/ +# NOTE 2023-09: fil-backend is not available on ARM. Some docker versions flag an error if there is +# not a single source file to copy. To avoid this, we also specify a small dummy file. +COPY --chown=1000:1000 --from=triton /opt/tritonserver/LICENSE /opt/tritonserver/backends/fil/* backends/fil/ COPY --chown=1000:1000 --from=triton /opt/tritonserver/backends/tensorrt backends/tensorrt/ COPY --chown=1000:1000 --from=triton /usr/bin/serve /usr/bin/. 
COPY --chown=1000:1000 --from=triton /usr/lib/*-linux-gnu/libdcgm.so.2 /tmp @@ -362,7 +362,7 @@ ENV PATH=${PATH}:${HADOOP_HOME}/bin:${HADOOP_HOME}/sbin \ YARN_NODEMANAGER_USER=root \ # Tackles with ThreadReaper stack overflow issues: https://bugs.openjdk.java.net/browse/JDK-8153057 LIBHDFS_OPTS='-Djdk.lang.processReaperUseDefaultStackSize=true' \ - # Tackles with JVM setting error signals that UCX library will check (GitLab issue #425). + # Tackles with JVM setting error signals that the UCX library checks (GitLab issue #425). UCX_ERROR_SIGNALS='' \ CLASSPATH=${CLASSPATH}:\ ${HADOOP_HOME}/etc/hadoop/*:\ @@ -389,7 +389,7 @@ ENV PATH=$PATH:${HUGECTR_HOME}/bin \ LD_LIBRARY_PATH=${LD_LIBRARY_PATH}:${HUGECTR_HOME}/lib RUN if [ "${HUGECTR_DEV_MODE}" == "false" ]; then \ - # Install HugeCTR inference which is dependency for hps_backenc + # Install HugeCTR inference which is a dependency for hps_backend git clone --branch ${HUGECTR_VER} --depth 1 https://${_CI_JOB_TOKEN}${_HUGECTR_REPO} /hugectr && \ cd /hugectr && \ git submodule update --init --recursive && \ diff --git a/docker/dockerfile.tf b/docker/dockerfile.tf index e86129268..b61adf156 100644 --- a/docker/dockerfile.tf +++ b/docker/dockerfile.tf @@ -42,7 +42,6 @@ ARG _CI_JOB_TOKEN="" ARG HUGECTR_VER=main ENV LD_LIBRARY_PATH=/usr/local/lib/python${PYTHON_VERSION}/dist-packages/tensorflow:$LD_LIBRARY_PATH \ - LIBRARY_PATH=${HUGECTR_HOME}/lib:$LIBRARY_PATH \ SOK_COMPILE_UNIT_TEST=ON RUN mkdir -p /usr/local/nvidia/lib64 && \ @@ -55,6 +54,9 @@ ARG INSTALL_DISTRIBUTED_EMBEDDINGS=false ARG TFDE_VER=v23.03.00 RUN if [ "$HUGECTR_DEV_MODE" == "false" ]; then \ + export HUGECTR_HOME=/usr/local/hugectr && \ + rm -rf ${HUGECTR_HOME}/lib/libgmock* ${HUGECTR_HOME}/lib/pkgconfig/gmock* ${HUGECTR_HOME}/include/gmock && \ + rm -rf ${HUGECTR_HOME}/lib/libgtest* ${HUGECTR_HOME}/lib/pkgconfig/gtest* ${HUGECTR_HOME}/include/gtest && \ git clone --branch ${HUGECTR_VER} --depth 1 --recurse-submodules --shallow-submodules 
https://${_CI_JOB_TOKEN}${_HUGECTR_REPO} /hugectr && \ pushd /hugectr && \ rm -rf .git/modules && \