diff --git a/public_dropin_notebook_environments/python311_notebook/Dockerfile b/public_dropin_notebook_environments/python311_notebook/Dockerfile
new file mode 100644
index 000000000..da55acc62
--- /dev/null
+++ b/public_dropin_notebook_environments/python311_notebook/Dockerfile
@@ -0,0 +1,175 @@
+# Copyright 2023 DataRobot, Inc. and its affiliates.
+# All rights reserved.
+# DataRobot, Inc. Confidential.
+# This is unpublished proprietary source code of DataRobot, Inc.
+# and its affiliates.
+# The copyright notice above does not evidence any actual or intended
+# publication of such source code.
+
+
+################### !NOTA BENE! #######################
+# All the files, parameters and packages are necessary #
+# for the proper functioning of Notebooks. #
+# If needed, you can include any system package #
+# that will be installed through microdnf or #
+# add a required package to the requirements.txt file. #
+# Please note that removing predefined packages #
+# may result in issues with Notebooks functionality. #
+###########################################################
+
+ARG WORKDIR=/etc/system/kernel
+ARG AGENTDIR=/etc/system/kernel/agent
+ARG VENV_PATH=${WORKDIR}/.venv
+
+ARG UNAME=notebooks
+ARG UID=10101
+ARG GID=10101
+
+# You can specify a different python version here;
+# be sure that the package is available in the microdnf repo.
+# To check, use these bash commands:
+# ```bash
+# docker run --rm -it registry.access.redhat.com/ubi8/ubi-minimal:8.7 bash
+# microdnf repoquery python3*
+# ```
+ARG PYTHON_VERSION=3.11
+ARG PYTHON_EXACT_VERSION=3.11.7
+
+FROM registry.access.redhat.com/ubi8/ubi-minimal:8.7 AS base
+# some globally required dependencies
+
+ARG UNAME
+ARG UID
+ARG GID
+ARG WORKDIR
+ARG AGENTDIR
+ARG VENV_PATH
+ARG PYTHON_VERSION
+ARG PYTHON_EXACT_VERSION
+
+# Set the SHELL option -o pipefail before RUN with a pipe in it.
+# Rationale: https://github.com/hadolint/hadolint/wiki/DL4006 +SHELL ["/bin/bash", "-o", "pipefail", "-c"] + +# Add any package that will be installed on system level here: +RUN microdnf update \ + && microdnf install -y python$PYTHON_VERSION-$PYTHON_EXACT_VERSION python$PYTHON_VERSION-devel-$PYTHON_EXACT_VERSION \ + gcc-8.5.0 gcc-c++-8.5.0 glibc-devel-2.28 libffi-devel-3.1 graphviz-2.40.1 python$PYTHON_VERSION-pip \ + openblas-0.3.15 python$PYTHON_VERSION-scipy shadow-utils-2:4.6 passwd-0.80 git-2.43.0 openssh-server tar-2:1.30 gzip-1.9 unzip-6.0 zip-3.0 wget-1.19.5 \ + java-11-openjdk-headless-11.0.23.0.9-3.el8 vim-minimal-2:8.0.1763 nano-2.9.8 \ + && pip3 install -U --no-cache-dir pip==23.1.2 setuptools==68.2.2 \ + && curl -sS https://webi.sh/gh | sh && cp ~/.local/bin/gh /usr/bin/ \ + && microdnf clean all + +ENV PYTHONUNBUFFERED=1 \ + PYTHONDONTWRITEBYTECODE=1 \ + VENV_PATH=${VENV_PATH} \ + PIP_NO_CACHE_DIR=1 \ + NOTEBOOKS_KERNEL="python" + +ENV PATH="$VENV_PATH/bin:$PATH" \ + PYTHONPATH="/home/notebooks/.ipython/extensions:/home/notebooks/storage" + +RUN python3 -m venv ${VENV_PATH} +WORKDIR ${WORKDIR} + +COPY ./agent/agent.py ./agent/cgroup_watchers.py ${AGENTDIR}/ +COPY ./jupyter_kernel_gateway_config.py ./start_server.sh ${WORKDIR}/ +COPY ./ipython_config.py /etc/ipython/ +COPY ./extensions /etc/ipython/extensions + +# Adding SSHD requirements +COPY ./sshd_config /etc/ssh/ +RUN cp -a /etc/ssh /etc/ssh.cache && rm -rf /var/cache/apk/* +RUN mkdir /etc/authorized_keys + +# Removing pip leftovers to not have trivy complain +RUN rm -rf /lib/python3.9/site-packages/pip-20.2.4.dist-info && \ + rm -rf /etc/system/kernel/.venv/lib/python3.9/site-packages/pip-20.2.4.dist-info && \ + rm -rf /lib/python3.8/site-packages/pip-19.3.1.dist-info && \ + rm -rf /etc/system/kernel/.venv/lib/python3.8/site-packages/pip-19.3.1.dist-info + +# Custom user to run the image from + +RUN groupadd -g $GID -o $UNAME && \ + useradd -l -m -u $UID -g $GID -o -s /bin/bash $UNAME + +# Prompt customizations +COPY ./setup-prompt.sh /etc/profile.d/setup-prompt.sh + +# remove microdnf +RUN microdnf remove microdnf + +# additional setup scripts +COPY ./setup-ssh.sh ./common-user-limits.sh ./setup-venv.sh ${WORKDIR}/ + +# Adding SSHD requirements +RUN chown -R $UNAME:$UNAME ${WORKDIR} ${VENV_PATH} /home/notebooks /etc/ssh /etc/authorized_keys \ + # sshd prep + && touch /etc/profile.d/notebooks-load-env.sh \ + && chown -R $UNAME:$UNAME /etc/profile.d/notebooks-load-env.sh \ + # Limit max processes + && touch /etc/profile.d/bash-profile-load.sh \ + && chown -R $UNAME:$UNAME /etc/profile.d/bash-profile-load.sh + +USER $UNAME + +# Jupyter Gateway port +EXPOSE 8888 +# sshd port +EXPOSE 22 + +FROM base AS minimal +# this stage has only bare minimal of dependencies installed to optimize build time for the local development + +ARG WORKDIR +ARG VENV_PATH + +COPY ./dr_requirements.txt ./agent/requirements-agent.txt ${WORKDIR}/ +RUN python3 -m pip install --no-cache-dir -r ${WORKDIR}/dr_requirements.txt \ + && python3 -m pip install --no-cache-dir -r ${WORKDIR}/requirements-agent.txt \ + && rm ${WORKDIR}/dr_requirements.txt \ + && rm ${WORKDIR}/requirements-agent.txt \ + && rm ${VENV_PATH}/share/jupyter/kernels/python3/kernel.json \ + && chmod a+x ${WORKDIR}/start_server.sh + +# Monitoring agent port +EXPOSE 8889 + +FROM minimal AS builder +# this stage includes all data science dependencies we want to have in the kernel runtime out of the box + +ARG WORKDIR +ARG VENV_PATH +ARG PYTHON_VERSION + +COPY ./kernel.json 
${VENV_PATH}/share/jupyter/kernels/python3/
+COPY ./requirements.txt ${WORKDIR}/
+RUN pip3 install --no-cache-dir -r ${WORKDIR}/requirements.txt \
+    && rm ${WORKDIR}/requirements.txt
+
+FROM base AS kernel
+# this stage is what is actually run as the kernel image, and it is clean of all build junk
+
+ARG UNAME
+
+ARG WORKDIR
+
+ARG GIT_COMMIT
+
+LABEL com.datarobot.repo-name="notebooks"
+LABEL com.datarobot.repo-sha=$GIT_COMMIT
+
+# Removing pip leftovers to not have trivy complain
+RUN rm -rf /lib/python3.9/site-packages/pip-20.2.4.dist-info && \
+    rm -rf "${VENV_PATH}"/lib/python3.9/site-packages/pip-20.2.4.dist-info && \
+    rm -rf /lib/python3.8/site-packages/pip-19.3.1.dist-info && \
+    rm -rf "${VENV_PATH}"/lib/python3.8/site-packages/pip-19.3.1.dist-info && \
+    rm -rf "${VENV_PATH}"/lib/python3.9/site-packages/setuptools-50.3.2.dist-info && \
+    rm -rf "${VENV_PATH}"/lib/python3.8/site-packages/setuptools-41.6.0.dist-info && \
+    rm -rf "${VENV_PATH}"/lib/python3.9/site-packages/setuptools-68.2.2.dist-info
+
+
+RUN chown -R $UNAME:$UNAME ${WORKDIR} /home/notebooks
+
+COPY --from=builder --chown=$UNAME $WORKDIR $WORKDIR
diff --git a/public_dropin_notebook_environments/python311_notebook/README.md b/public_dropin_notebook_environments/python311_notebook/README.md
new file mode 100644
index 000000000..ad61d859a
--- /dev/null
+++ b/public_dropin_notebook_environments/python311_notebook/README.md
@@ -0,0 +1,19 @@
+# Python 3.11 Notebook Drop-In Template Environment
+
+This template environment can be used to create custom Python 3.11 notebook environments.
+
+## Supported Libraries
+
+This environment has been built for Python 3.11 and includes commonly used OSS machine learning and data science libraries.
+For specific version information, see [requirements](requirements.txt).
+
+## Instructions
+
+1. Update [requirements](requirements.txt) to add your custom libraries supported by Python 3.11.
+2. From the terminal, run `tar -czvf py311_notebook_dropin.tar.gz -C /path/to/public_dropin_notebook_environments/python311_notebook/ .`
+3. Using either the API or the UI, create a new Custom Environment with the tarball created in step 2.
+
+### Using this environment in notebooks
+
+Upon successful build, the custom environment can be used in notebooks by selecting it
+from `ENVIRONMENT` settings > `Image` in the notebook sidebar.
\ No newline at end of file
diff --git a/public_dropin_notebook_environments/python311_notebook/agent/README.md b/public_dropin_notebook_environments/python311_notebook/agent/README.md
new file mode 100644
index 000000000..731a699df
--- /dev/null
+++ b/public_dropin_notebook_environments/python311_notebook/agent/README.md
@@ -0,0 +1,2 @@
+This folder contains dependencies required to use this custom environment for DataRobot Notebooks.
+Please do not modify or delete this folder from your Docker context.
diff --git a/public_dropin_notebook_environments/python311_notebook/agent/agent.py b/public_dropin_notebook_environments/python311_notebook/agent/agent.py
new file mode 100644
index 000000000..d24863b4d
--- /dev/null
+++ b/public_dropin_notebook_environments/python311_notebook/agent/agent.py
@@ -0,0 +1,62 @@
+# Copyright 2022 DataRobot, Inc. and its affiliates.
+# All rights reserved.
+# DataRobot, Inc. Confidential.
+# This is unpublished proprietary source code of DataRobot, Inc.
+# and its affiliates.
+# The copyright notice above does not evidence any actual or intended
+# publication of such source code.
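+
+# This module runs a small FastAPI app whose "/ws" websocket endpoint broadcasts
+# the kernel container's CPU and memory utilization every 3 seconds. It reads
+# cgroup v1 counters via CGroupWatcher and falls back to host-level psutil
+# metrics (DummyWatcher) when cgroup v1 files are not available.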
+
+import asyncio
+
+from websockets.exceptions import ConnectionClosedOK, ConnectionClosedError
+
+from cgroup_watchers import (
+    CGroupFileReader,
+    CGroupWatcher,
+    DummyWatcher,
+    SystemWatcher,
+    CGroupVersionUnsupported,
+)
+from fastapi import FastAPI, WebSocket
+import logging
+import ecs_logging
+
+logger = logging.getLogger("kernel_agent")
+
+logger.setLevel(logging.DEBUG)
+handler = logging.StreamHandler()
+handler.setFormatter(ecs_logging.StdlibFormatter())
+logger.addHandler(handler)
+
+app = FastAPI()
+
+try:
+    watcher = CGroupWatcher(CGroupFileReader(), SystemWatcher())
+except CGroupVersionUnsupported:
+    logger.warning("CGroup Version Unsupported. Dummy utilization will be broadcast")
+    watcher = DummyWatcher()
+
+
+@app.websocket_route("/ws")
+async def websocket_endpoint(websocket: WebSocket):
+    await websocket.accept()
+
+    try:
+        while True:
+            await websocket.send_json(
+                {
+                    "cpu_percent": watcher.cpu_usage_percentage(),
+                    "mem_percent": watcher.memory_usage_percentage(),
+                }
+            )
+
+            await asyncio.sleep(3)
+    except ConnectionClosedError:
+        logger.warning(
+            "utilization consumer disconnected",
+            extra={"connection": websocket.client},
+            exc_info=True,
+        )
+    except ConnectionClosedOK:
+        # https://github.com/encode/starlette/issues/759
+        logger.info("utilization consumer disconnected", extra={"connection": websocket.client})
diff --git a/public_dropin_notebook_environments/python311_notebook/agent/cgroup_watchers.py b/public_dropin_notebook_environments/python311_notebook/agent/cgroup_watchers.py
new file mode 100644
index 000000000..e84ad01d3
--- /dev/null
+++ b/public_dropin_notebook_environments/python311_notebook/agent/cgroup_watchers.py
@@ -0,0 +1,172 @@
+# Copyright 2022 DataRobot, Inc. and its affiliates.
+# All rights reserved.
+# DataRobot, Inc. Confidential.
+# This is unpublished proprietary source code of DataRobot, Inc.
+# and its affiliates.
+# The copyright notice above does not evidence any actual or intended
+# publication of such source code.
+import re
+import time
+from pathlib import Path
+
+import psutil  # type: ignore
+
+# Parts of this code have been reused from this repo:
+# https://github.com/neptune-ai/neptune-client/blob/master/LICENSE
+
+NANO_SECS = 10**9
+
+
+class CGroupVersionUnsupported(Exception):
+    """There are two versions of cgroups; the agent is compatible with v1 only.
+    This error is raised when the agent is run under v2."""
+
+
+class SystemWatcher:
+    @staticmethod
+    def cpu_count() -> int:
+        return psutil.cpu_count()
+
+    @staticmethod
+    def cpu_percent() -> float:
+        return psutil.cpu_percent()
+
+    @staticmethod
+    def virtual_memory():
+        return psutil.virtual_memory()
+
+
+class CGroupFileReader:
+    def __init__(self) -> None:
+        cgroup_memory_dir = self._cgroup_mount_dir(subsystem="memory")
+        cgroup_cpu_dir = self._cgroup_mount_dir(subsystem="cpu")
+        cgroup_cpuacct_dir = self._cgroup_mount_dir(subsystem="cpuacct")
+
+        self._memory_usage_file = cgroup_memory_dir / "memory.stat"
+        self._memory_limit_file = cgroup_memory_dir / "memory.limit_in_bytes"
+
+        self._cpu_period_file = cgroup_cpu_dir / "cpu.cfs_period_us"
+        self._cpu_quota_file = cgroup_cpu_dir / "cpu.cfs_quota_us"
+
+        self._cpuacct_usage_file = cgroup_cpuacct_dir / "cpuacct.usage"
+
+    def memory_usage_in_bytes(self) -> int:
+        memory_stat_str = self._memory_usage_file.read_text()
+        total_rss_str = next(
+            iter([stat for stat in memory_stat_str.split("\n") if stat.startswith("total_rss")]),
+            "0",
+        )
+        total_rss = int(total_rss_str.split(" ")[-1])
+        return total_rss
+
+    def memory_limit_in_bytes(self) -> int:
+        return self._read_metric(self._memory_limit_file)
+
+    def cpu_quota_micros(self) -> int:
+        return self._read_metric(self._cpu_quota_file)
+
+    def cpu_period_micros(self) -> int:
+        return self._read_metric(self._cpu_period_file)
+
+    def cpuacct_usage_nanos(self) -> int:
+        return self._read_metric(self._cpuacct_usage_file)
+
+    def _read_metric(self, filename: Path) -> int:
+        with open(filename) as f:
+            return int(f.read())
+
+    def _cgroup_mount_dir(self, subsystem: str) -> Path:
+        """
+        :param subsystem: cgroup subsystem like memory, cpu etc.
+        :return: directory where subsystem is mounted
+        """
+        try:
+            with open("/proc/mounts", "r") as f:
+                for line in f.readlines():
+                    split_line = re.split(r"\s+", line)
+                    mount_dir = split_line[1]
+
+                    if "cgroup" in mount_dir:
+                        dirname = mount_dir.split("/")[-1]
+                        subsystems = dirname.split(",")
+
+                        if subsystem in subsystems:
+                            return Path(mount_dir)
+        except FileNotFoundError:
+            ...
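+        # No cgroup v1 mount exposes the requested subsystem (or /proc/mounts is
+        # missing), which most likely means a cgroup v2 host, so report it as unsupported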
+ + raise CGroupVersionUnsupported + + +class BaseWatcher: + def cpu_usage_percentage(self) -> float: + raise NotImplementedError + + def memory_usage_percentage(self) -> float: + raise NotImplementedError + + +class CGroupWatcher(BaseWatcher): + def __init__(self, cgroup_file_reader: CGroupFileReader, system_watcher: SystemWatcher) -> None: + self._cgroup_file_reader = cgroup_file_reader + self._system_watcher = system_watcher + + self._last_cpu_usage_ts_nanos = 0.0 + self._last_cpu_cum_usage_nanos = 0.0 + + def memory_usage_in_bytes(self) -> float: + return self._cgroup_file_reader.memory_usage_in_bytes() + + def memory_limit_in_bytes(self) -> float: + cgroup_mem_limit = self._cgroup_file_reader.memory_limit_in_bytes() + total_virtual_memory = self._system_watcher.virtual_memory().total + return min(cgroup_mem_limit, total_virtual_memory) + + def memory_usage_percentage(self) -> float: + return round(self.memory_usage_in_bytes() / self.memory_limit_in_bytes() * 100, 2) + + def cpu_usage_limit_in_cores(self) -> float: + cpu_quota_micros = self._cgroup_file_reader.cpu_quota_micros() + + if cpu_quota_micros == -1: + return float(self._system_watcher.cpu_count()) + else: + cpu_period_micros = self._cgroup_file_reader.cpu_period_micros() + return float(cpu_quota_micros) / float(cpu_period_micros) + + def cpu_usage_percentage(self) -> float: + current_timestamp_nanos = time.time() * NANO_SECS + cpu_cum_usage_nanos = self._cgroup_file_reader.cpuacct_usage_nanos() + + if self._is_first_measurement(): + current_usage = 0.0 + else: + usage_diff = cpu_cum_usage_nanos - self._last_cpu_cum_usage_nanos + time_diff = current_timestamp_nanos - self._last_cpu_usage_ts_nanos + current_usage = ( + float(usage_diff) / float(time_diff) / self.cpu_usage_limit_in_cores() * 100.0 + ) + + self._last_cpu_usage_ts_nanos = current_timestamp_nanos + self._last_cpu_cum_usage_nanos = cpu_cum_usage_nanos + + # In case the cpu usage exceeds the limit, we need to limit it + return round(self._limit(current_usage, lower_limit=0.0, upper_limit=100.0), 2) + + def _is_first_measurement(self) -> bool: + return self._last_cpu_usage_ts_nanos is None or self._last_cpu_cum_usage_nanos is None + + @staticmethod + def _limit(value: float, lower_limit: float, upper_limit: float) -> float: + return max(lower_limit, min(value, upper_limit)) + + +class DummyWatcher(BaseWatcher): + def __init__(self): + self._system_watcher = SystemWatcher() + + def cpu_usage_percentage(self) -> float: + return self._system_watcher.cpu_percent() + + def memory_usage_percentage(self) -> float: + return self._system_watcher.virtual_memory().percent diff --git a/public_dropin_notebook_environments/python311_notebook/agent/requirements-agent.txt b/public_dropin_notebook_environments/python311_notebook/agent/requirements-agent.txt new file mode 100644 index 000000000..1a5d741a2 --- /dev/null +++ b/public_dropin_notebook_environments/python311_notebook/agent/requirements-agent.txt @@ -0,0 +1,2 @@ +fastapi[All]==0.109.2 +psutil==5.9.1 diff --git a/public_dropin_notebook_environments/python311_notebook/common-user-limits.sh b/public_dropin_notebook_environments/python311_notebook/common-user-limits.sh new file mode 100644 index 000000000..d07156902 --- /dev/null +++ b/public_dropin_notebook_environments/python311_notebook/common-user-limits.sh @@ -0,0 +1,9 @@ +#!/bin/bash + +echo "Generating common bash profile..." +{ + echo "#!/bin/bash" + echo "# Setting user process limits." 
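+  # -Su and -Hu set the soft and hard caps on the number of user processes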
+ echo "ulimit -Su 2048" + echo "ulimit -Hu 2048" +} > /etc/profile.d/bash-profile-load.sh \ No newline at end of file diff --git a/public_dropin_notebook_environments/python311_notebook/dr_requirements.txt b/public_dropin_notebook_environments/python311_notebook/dr_requirements.txt new file mode 100644 index 000000000..f47c0bded --- /dev/null +++ b/public_dropin_notebook_environments/python311_notebook/dr_requirements.txt @@ -0,0 +1,8 @@ +setuptools==68.2.2 +ecs-logging==2.0.0 +jupyter-client==7.4.9 +jupyter_kernel_gateway==2.5.2 +jupyter_core==5.2.0 +ipykernel==6.28.0 +pandas==1.5.1 +mistune==2.0.4 diff --git a/public_dropin_notebook_environments/python311_notebook/env_info.json b/public_dropin_notebook_environments/python311_notebook/env_info.json new file mode 100644 index 000000000..ed85038fd --- /dev/null +++ b/public_dropin_notebook_environments/python311_notebook/env_info.json @@ -0,0 +1,11 @@ +{ + "id": "6583d56f5627082b3cff990e", + "name": "[DataRobot] Python 3.11 Notebook Drop-In", + "description": "This template environment can be used to create Python 3.11 notebook environments.", + "programmingLanguage": "python", + "environmentVersionId": "664f2c8d705fc73cfa4fcbc9", + "isPublic": true, + "useCases": [ + "notebook" + ] +} diff --git a/public_dropin_notebook_environments/python311_notebook/extensions/README.md b/public_dropin_notebook_environments/python311_notebook/extensions/README.md new file mode 100644 index 000000000..731a699df --- /dev/null +++ b/public_dropin_notebook_environments/python311_notebook/extensions/README.md @@ -0,0 +1,2 @@ +This folder contains dependencies required to use this custom environment for DataRobot Notebooks. +Please do not modify or delete this folder from your Docker context. diff --git a/public_dropin_notebook_environments/python311_notebook/extensions/dataframe_formatter.py b/public_dropin_notebook_environments/python311_notebook/extensions/dataframe_formatter.py new file mode 100644 index 000000000..c76837929 --- /dev/null +++ b/public_dropin_notebook_environments/python311_notebook/extensions/dataframe_formatter.py @@ -0,0 +1,332 @@ +# Copyright 2022 DataRobot, Inc. and its affiliates. +# All rights reserved. +# DataRobot, Inc. Confidential. +# This is unpublished proprietary source code of DataRobot, Inc. +# and its affiliates. +# The copyright notice above does not evidence any actual or intended +# publication of such source code. 
+ +# -*- coding: utf-8 -*- +import json +import sys +import traceback +from enum import Enum +from typing import Any, Callable, Dict, List, Optional, Union, cast + +from IPython.core.formatters import BaseFormatter +from IPython.core.magic import Magics +from pydantic import BaseModel +from traitlets import ObjectName, Unicode + +is_pandas_loaded = True + +try: + from pandas import DataFrame, DatetimeIndex, io +except ImportError: + is_pandas_loaded = False + + +class Entity(BaseModel): + """ + Base class for data transfer objects + """ + + class Config: + allow_population_by_field_name = True + + +class DataframePaginationAttributes(Entity): + limit: int + offset: int + + +class DataframeAggregationParams(Entity): + group_by: str + aggregate_by: str + aggregation_func: str + + +class DataframeFilterParams(Entity): + filter_by: Optional[str] + filter: str + + +class DataframesProcessSteps(str, Enum): + CHART_CELL_DATAFRAME = "chart_cell_dataframe" + AGGREGATION = "aggregation" + PAGINATION = "pagination" + SORTING = "sorting" + GET_COLUMNS = "get_columns" + DEFAULT = "get_columns" + + +Columns = List[Dict[str, Any]] + +index_key = "_dr_df_index" + + +def _register_exception( + e: Exception, + step: str, +) -> Dict[str, Any]: + exc_info = sys.exc_info() + traceback_msg = traceback.format_exception(*exc_info) + + return { + "step": step, + "message": str(e), + "traceback": traceback_msg, + } + + +def _set_index(df: DataFrame) -> DataFrame: + if isinstance(df.index, DatetimeIndex): + df.index = df.index.strftime("%Y-%m-%dT%H:%M:%S.%f") + + df.index = df.index.rename(index_key) + return df + + +def _validate_columns(data: DataFrame) -> None: + """To prevent failing some DataFrame process steps like columns extraction + and converting to json we need ensure that columns dtypes can be converted + + Args: + data (DataFrame): in-memory DataFrame + + Returns: + None + """ + convertable_types = [ + "int64", + "float64", + "bool", + "category", + "object", + "datetime64[ns]", + "timedelta[ns]", + ] + for column in data.columns: + dtype = data[column].dtype + if dtype not in convertable_types: + # Try to keep datetime dtype, remove the timezone information + # but converting to UTC, so yielding naive UTC time + if hasattr(data[column], "dt") and hasattr(data[column].dt, "tz_convert"): + data[column] = data[column].dt.tz_convert(None) + else: + # Otherwise, keep going working with dataframe but set pandas column type to str + data[column] = data[column].astype(str) + + +def _get_dataframe_columns(df: DataFrame) -> Columns: + schema = io.json.build_table_schema(_set_index(df)) + columns = cast(Columns, schema["fields"]) + return columns + + +# DataFrame pagination if pagination attrs exist +def _paginate_dataframe(df: DataFrame, pagination: DataframePaginationAttributes) -> DataFrame: + start_row = pagination.offset + end_row = start_row + pagination.limit + return df[start_row:end_row] + + +def _sort_dataframe(df: DataFrame, sort_by: str) -> DataFrame: + sorting_list = sort_by.split(",") + sort_by_list = [] + ascending_list = [] + for sort_key in sorting_list: + sort_by_list.append(sort_key.lstrip("-")) + ascending_list.append(not sort_key.startswith("-")) + return df.sort_values(by=sort_by_list, ascending=ascending_list, ignore_index=False) + + +def _aggregate_dataframe( + df: DataFrame, aggregation_params: DataframeAggregationParams +) -> DataFrame: + aggregated = df.groupby(aggregation_params.group_by).aggregate( + {f"{aggregation_params.aggregate_by}": aggregation_params.aggregation_func} + ) + 
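+    # reset_index() turns the group-by keys back into a regular column so the
+    # aggregated result serializes like any other DataFrame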
return aggregated.reset_index() + + +def _transform_to_json(data: DataFrame) -> Any: + if isinstance(data, list): + return data + + return json.loads(data.to_json(orient="table", index=True, default_handler=str))["data"] + + +def _prepare_df_for_chart_cell(val: DataFrame, columns: List[str]) -> Union[DataFrame, List[str]]: + if len(columns) == 0: + data = [] + elif len(columns) == 1: + # Return counts if only one column was selected or selected count of records + dataframe = ( + val.groupby(columns)[columns[0]].count().reset_index(name="count").set_index("count") + ) + data = _set_index(dataframe) + else: + # Return only selected columns + data = val[columns] + + return data + + +# This formatter can operate with a data that we are received as a DataFrame +def formatter( # noqa: C901,PLR0912 + val: "DataFrame", + formatter: Optional[Callable[..., List[str]]] = None, + **formatter_kwargs: Any, +) -> Dict[str, Any]: + error = [] + dataframe_limit = 5000 + dataframe_id = id(val) + pagination = DataframePaginationAttributes(limit=10, offset=0) + data = val + sort_by = "" + selected_columns = [] + _validate_columns(data) + try: + columns = _get_dataframe_columns(data) + except Exception as e: + error.append(_register_exception(e, DataframesProcessSteps.GET_COLUMNS.value)) + + # check if it's a dataframe for ChartCell then return full dataframe + if hasattr(val, "attrs") and "returnAll" in val.attrs and val.attrs["returnAll"]: + # Validate what to return to UI + if hasattr(val, "attrs") and "selected_columns" in val.attrs: + selected_columns = list( + filter(lambda item: item is not index_key, val.attrs["selected_columns"]) + ) + try: + data = _prepare_df_for_chart_cell(val=data, columns=selected_columns) + except Exception as e: + error.append( + _register_exception(e, DataframesProcessSteps.CHART_CELL_DATAFRAME.value) + ) + if len(selected_columns) < 2: + # Reset `returnAll` attribute to prevent returning a whole DF on next formatter call + val.attrs.update({"returnAll": False}) + data = [] if len(error) > 0 else data + + return { + "columns": columns, + "data": _transform_to_json(data), + "referenceId": dataframe_id, + "error": error, + "indexKey": index_key, + } + + aggregation_func = val.attrs.get("aggregation", {}).get("aggregation_func") + if aggregation_func and aggregation_func != "no-aggregation": + aggregation = DataframeAggregationParams( + group_by=val.attrs["aggregation"]["group_by"], + aggregate_by=val.attrs["aggregation"]["aggregate_by"], + aggregation_func=val.attrs["aggregation"]["aggregation_func"], + ) + try: + data = _aggregate_dataframe(data, aggregation) + except Exception as e: + error.append(_register_exception(e, DataframesProcessSteps.AGGREGATION.value)) + + if len(data.index) >= dataframe_limit: + pagination = DataframePaginationAttributes(limit=dataframe_limit, offset=0) + try: + data = _paginate_dataframe(data, pagination) + except Exception as e: + error.append(_register_exception(e, DataframesProcessSteps.PAGINATION.value)) + + # Reset `returnAll` attribute to prevent returning a whole DF on next formatter call + val.attrs.update({"returnAll": False}) + + return { + "columns": columns, + "data": _transform_to_json(data), + "referenceId": dataframe_id, + "error": error, + "indexKey": index_key, + } + + # Sorting step, gets attrs that has been setup in DataframeProcessor + if hasattr(val, "attrs") and "sort_by" in val.attrs: + try: + data = _sort_dataframe(df=data, sort_by=val.attrs["sort_by"]) + sort_by = val.attrs["sort_by"] + except Exception as e: + 
            error.append(_register_exception(e, DataframesProcessSteps.SORTING.value))
+
+    # Pagination step, gets attrs that have been set up in DataframeProcessor
+    if hasattr(val, "attrs") and "pagination" in val.attrs:
+        pagination = DataframePaginationAttributes(
+            limit=val.attrs["pagination"]["limit"], offset=val.attrs["pagination"]["offset"]
+        )
+
+        # If the dataframe length is less than the pagination limit, there is no need to paginate it
+        if len(data.index) > int(pagination.limit):
+            try:
+                data = _paginate_dataframe(data, pagination)
+            except Exception as e:
+                error.append(_register_exception(e, DataframesProcessSteps.PAGINATION.value))
+
+    return {
+        "data": _transform_to_json(data),
+        "columns": columns,
+        "count": len(data.index),
+        "totalCount": len(val.index),
+        "offset": int(pagination.offset),
+        "limit": int(pagination.limit),
+        "referenceId": dataframe_id,
+        "sortedBy": sort_by,
+        "indexKey": index_key,
+        "error": error,
+    }
+
+
+# To add a new data formatter, we need to create a new class based on
+# BaseFormatter from the IPython kernel
+#
+# Ignoring mypy error: Class cannot subclass "BaseFormatter" (has type "Any")
+class DataFrameFormatter(BaseFormatter):  # type: ignore[misc]
+    """A DataFrame formatter. This is basically a copy of the JSONFormatter,
+    so output is returned with a new MIME type: application/vnd.dataframe+json.
+    """
+
+    format_type = Unicode("application/vnd.dataframe+json")
+    _return_type = (list, dict)
+
+    print_method = ObjectName("_repr_json_")
+
+    def _check_return(self, r: Any, obj: Any) -> Any:
+        """Check that a return value is appropriate
+        Return the value if so, None otherwise, warning if invalid.
+        """
+        if r is None:
+            return
+        md = None
+        if isinstance(r, tuple):
+            # unpack data, metadata tuple for type checking on first element
+            r, md = r
+
+        assert not isinstance(r, str), "JSON-as-string has been deprecated since IPython < 3"
+
+        if md is not None:
+            # put the tuple back together
+            r = (r, md)
+        return super(DataFrameFormatter, self)._check_return(r, obj)
+
+
+# Load our extension into the IPython kernel
+def load_ipython_extension(ipython: Magics) -> None:
+    if is_pandas_loaded:
+        ipython.display_formatter.formatters[
+            "application/vnd.dataframe+json"
+        ] = DataFrameFormatter()
+        dataframe_json_formatter = ipython.display_formatter.formatters[
+            "application/vnd.dataframe+json"
+        ]
+        dataframe_json_formatter.for_type(DataFrame, formatter)
+
+        print("Pandas DataFrame MimeType Extension loaded")
+    else:
+        print("Please run `pip install pandas` to use the DataFrame extension")
diff --git a/public_dropin_notebook_environments/python311_notebook/ipython_config.py b/public_dropin_notebook_environments/python311_notebook/ipython_config.py
new file mode 100644
index 000000000..87197bfb8
--- /dev/null
+++ b/public_dropin_notebook_environments/python311_notebook/ipython_config.py
@@ -0,0 +1,10 @@
+# Copyright 2022 DataRobot, Inc. and its affiliates.
+# All rights reserved.
+# DataRobot, Inc. Confidential.
+# This is unpublished proprietary source code of DataRobot, Inc.
+# and its affiliates.
+# The copyright notice above does not evidence any actual or intended
+# publication of such source code.
+ +# This need to load extensions automaticaly when kernel starting +c.InteractiveShellApp.extensions = ["dataframe_formatter"] diff --git a/public_dropin_notebook_environments/python311_notebook/jupyter_kernel_gateway_config.py b/public_dropin_notebook_environments/python311_notebook/jupyter_kernel_gateway_config.py new file mode 100644 index 000000000..1d1126789 --- /dev/null +++ b/public_dropin_notebook_environments/python311_notebook/jupyter_kernel_gateway_config.py @@ -0,0 +1,26 @@ +# Copyright 2022 DataRobot, Inc. and its affiliates. +# All rights reserved. +# DataRobot, Inc. Confidential. +# This is unpublished proprietary source code of DataRobot, Inc. +# and its affiliates. +# The copyright notice above does not evidence any actual or intended +# publication of such source code. +c.KernelGatewayApp.ip = "0.0.0.0" # nosec +c.KernelGatewayApp.prespawn_count = 1 +c.KernelGatewayApp.max_kernels = 100 +c.KernelGatewayApp.default_kernel_name = "python3" +c.JupyterWebsocketPersonality.list_kernels = True +c.KernelRestarter.restart_limit = ( + 3 # if restart happens 3 consecutive times (before kernel is ready) +) + +c.KernelGatewayApp.logging_config = { + "formatters": { + "console": {"class": "ecs_logging.StdlibFormatter"}, + }, + "loggers": { + "KernelGatewayApp": { + "handlers": ["console"], + } + }, +} diff --git a/public_dropin_notebook_environments/python311_notebook/kernel.json b/public_dropin_notebook_environments/python311_notebook/kernel.json new file mode 100644 index 000000000..f23dbc5ea --- /dev/null +++ b/public_dropin_notebook_environments/python311_notebook/kernel.json @@ -0,0 +1,15 @@ +{ + "argv": [ + "python", + "-m", + "ipykernel_launcher", + "-f", + "{connection_file}" + ], + "display_name": "Python 3 (ipykernel)", + "language": "python", + "metadata": { + "debugger": true + }, + "interrupt_mode": "message" +} \ No newline at end of file diff --git a/public_dropin_notebook_environments/python311_notebook/requirements.txt b/public_dropin_notebook_environments/python311_notebook/requirements.txt new file mode 100644 index 000000000..5a630c30a --- /dev/null +++ b/public_dropin_notebook_environments/python311_notebook/requirements.txt @@ -0,0 +1,40 @@ +altair==4.2.0 +cloudpickle==2.2.1 +dask==2022.9.2 +datarobot-bp-workshop==0.2.6 +datarobot-drum==1.10.19 +datarobot-mlops-connected-client<10.0.0 +datarobot-model-metrics==0.6.9 +datarobot-predict==1.5.1 +datarobot==3.4.0 +dummyPy==0.3 +eli5==0.13.0 +gensim==4.3.2 +graphviz==0.20.1 +imblearn +joblib==1.2.0 +latexify-py==0.4.2 +lime==0.2.0.1 +matplotlib==3.6.1 +mistune==2.0.4 +numpy==1.26.2 +opencv-python-headless==4.8.1.78 +openpyxl==3.0.10 +pandas==1.5.1 +plotly==5.18.0 +pydantic==2.5.2 +scikit-learn==1.3.2 +scipy==1.10.1 +seaborn==0.12.0 +shap==0.41.0 +snowflake-connector-python[secure-local-storage,pandas]==3.7.1 +snowflake-snowpark-python==1.11.1 +spacy==3.7.2 +statsmodels==0.14.1 +tqdm==4.64.1 +umap-learn==0.5.3 +wordcloud==1.9.3 +xgboost==1.6.2 +yellowbrick==1.5 +boto3==1.34.90 +sagemaker==2.216.1 diff --git a/public_dropin_notebook_environments/python311_notebook/setup-prompt.sh b/public_dropin_notebook_environments/python311_notebook/setup-prompt.sh new file mode 100644 index 000000000..704cdb1ce --- /dev/null +++ b/public_dropin_notebook_environments/python311_notebook/setup-prompt.sh @@ -0,0 +1,6 @@ +#!/bin/bash + +PS1='[\[\033[38;5;172m\]\u\[$(tput sgr0)\]@kernel \[$(tput sgr0)\]\[\033[38;5;39m\]\w\[$(tput sgr0)\]]\$ \[$(tput sgr0)\]' + +# shellcheck disable=SC1091 +source /etc/system/kernel/setup-venv.sh diff 
--git a/public_dropin_notebook_environments/python311_notebook/setup-ssh.sh b/public_dropin_notebook_environments/python311_notebook/setup-ssh.sh new file mode 100644 index 000000000..da4607cbe --- /dev/null +++ b/public_dropin_notebook_environments/python311_notebook/setup-ssh.sh @@ -0,0 +1,19 @@ +#!/bin/bash + +echo "Persisting container environment variables for sshd..." +{ + echo "#!/bin/bash" + echo "# This file is auto-populated with kernel env vars on container creation" + echo "# to ensure that they are exposed in ssh sessions" + echo "# Ref: https://github.com/jenkinsci/docker-ssh-agent/issues/33#issuecomment-597367846" + echo "set -a" + env | grep -E -v "^(PWD=|HOME=|TERM=|SHLVL=|LD_PRELOAD=|PS1=|_=|KUBERNETES_)" | while read -r line; do + NAME=$(echo "$line" | cut -d'=' -f1) + VALUE=$(echo "$line" | cut -d'=' -f2-) + # Use eval to handle complex cases like export commands with spaces + echo "$NAME='$VALUE'" + done + echo "set +a" + # setup the working directory for terminal sessions + echo "cd $WORKING_DIR" +} > /etc/profile.d/notebooks-load-env.sh diff --git a/public_dropin_notebook_environments/python311_notebook/setup-venv.sh b/public_dropin_notebook_environments/python311_notebook/setup-venv.sh new file mode 100644 index 000000000..dab112a73 --- /dev/null +++ b/public_dropin_notebook_environments/python311_notebook/setup-venv.sh @@ -0,0 +1,41 @@ +#!/bin/bash + +# we don't want it output anything in the terminal session setup +VERBOSE_MODE=${1:-false} + +IS_CODESPACE=$([[ "${WORKING_DIR}" == *"/storage"* ]] && echo true || echo false) +IS_PYTHON_KERNEL=$([[ "${NOTEBOOKS_KERNEL}" == "python" ]] && echo true || echo false) + +if [[ $IS_CODESPACE == true && $IS_PYTHON_KERNEL == true && -z "${NOTEBOOKS_NO_PERSISTENT_DEPENDENCIES}" ]]; then + export POETRY_VIRTUALENVS_CREATE=false + export XDG_CACHE_HOME="${WORKING_DIR%/}/.cache" + # Persistent HF artifact installation + export HF_HOME="${WORKING_DIR%/}/.cache" + export HF_HUB_CACHE="${WORKING_DIR%/}/.cache" + export HF_DATASETS_CACHE="${WORKING_DIR%/}/.datasets" + export TRANSFORMERS_CACHE="${WORKING_DIR%/}/.models" + export SENTENCE_TRANSFORMERS_HOME="${WORKING_DIR%/}/.models" + + USR_VENV="${WORKING_DIR%/}/.venv" + [[ $VERBOSE_MODE == true ]] && echo "Setting up a user venv ($USR_VENV)..." + + # we need to make sure both kernel & user venv's site-packages are in PYTHONPATH because: + # - when the user venv is activated (e.g. terminal sessions), it ignores the kernel venv + # - when Jupyter kernel is running (e.g. notebook cells) it uses the kernel venv ignoring the user venv + + # shellcheck disable=SC1091 + source "$VENV_PATH/bin/activate" + KERNEL_PACKAGES=$(python -c "import site; print(site.getsitepackages()[0])") + deactivate + + python3 -m venv "${USR_VENV}" + # shellcheck disable=SC1091 + source "${USR_VENV}/bin/activate" + USER_PACKAGES=$(python -c "import site; print(site.getsitepackages()[0])") + + export PYTHONPATH="$USER_PACKAGES:$KERNEL_PACKAGES:$PYTHONPATH" +else + [[ $VERBOSE_MODE == true ]] && echo "Skipping user venv setup..." 
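+  # No persistent per-user venv is needed here; just activate the shared kernel venv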
+ # shellcheck disable=SC1091 + source "$VENV_PATH/bin/activate" +fi \ No newline at end of file diff --git a/public_dropin_notebook_environments/python311_notebook/sshd_config b/public_dropin_notebook_environments/python311_notebook/sshd_config new file mode 100644 index 000000000..06f1ed69e --- /dev/null +++ b/public_dropin_notebook_environments/python311_notebook/sshd_config @@ -0,0 +1,124 @@ +# $OpenBSD: sshd_config,v 1.103 2018/04/09 20:41:22 tj Exp $ + +# This is the sshd server system-wide configuration file. See +# sshd_config(5) for more information. + +# This sshd was compiled with PATH=/bin:/usr/bin:/sbin:/usr/sbin + +# The strategy used for options in the default sshd_config shipped with +# OpenSSH is to specify options with their default value where +# possible, but leave them commented. Uncommented options override the +# default value. + +#Port 22 +#AddressFamily any +#ListenAddress 0.0.0.0 +#ListenAddress :: + +#HostKey /etc/ssh/ssh_host_rsa_key +#HostKey /etc/ssh/ssh_host_ecdsa_key +#HostKey /etc/ssh/ssh_host_ed25519_key + +# Ciphers and keying +#RekeyLimit default none + +# Logging +#SyslogFacility AUTH +#LogLevel INFO + +# Authentication: + +#LoginGraceTime 2m +#PermitRootLogin prohibit-password +#StrictModes yes +#MaxAuthTries 6 +#MaxSessions 10 + +#PubkeyAuthentication yes + +# The default is to check both .ssh/authorized_keys and .ssh/authorized_keys2 +# but this is overridden so installations will only check .ssh/authorized_keys +AuthorizedKeysFile .ssh/authorized_keys /etc/authorized_keys/%u + +#AuthorizedPrincipalsFile none + +#AuthorizedKeysCommand none +#AuthorizedKeysCommandUser nobody + +# For this to work you will also need host keys in /etc/ssh/ssh_known_hosts +#HostbasedAuthentication no +# Change to yes if you don't trust ~/.ssh/known_hosts for +# HostbasedAuthentication +#IgnoreUserKnownHosts no +# Don't read the user's ~/.rhosts and ~/.shosts files +#IgnoreRhosts yes + +# To disable tunneled clear text passwords, change to no here! +#PasswordAuthentication yes +#PermitEmptyPasswords no + +# Change to no to disable s/key passwords +#ChallengeResponseAuthentication yes + +# Kerberos options +#KerberosAuthentication no +#KerberosOrLocalPasswd yes +#KerberosTicketCleanup yes +#KerberosGetAFSToken no + +# GSSAPI options +#GSSAPIAuthentication no +#GSSAPICleanupCredentials yes + +# Set this to 'yes' to enable PAM authentication, account processing, +# and session processing. If this is enabled, PAM authentication will +# be allowed through the ChallengeResponseAuthentication and +# PasswordAuthentication. Depending on your PAM configuration, +# PAM authentication via ChallengeResponseAuthentication may bypass +# the setting of "PermitRootLogin without-password". +# If you just want the PAM account and session checks to run without +# PAM authentication, then enable this but set PasswordAuthentication +# and ChallengeResponseAuthentication to 'no'. +#UsePAM no + +#AllowAgentForwarding yes +# Feel free to re-enable these if your use case requires them. 
+AllowTcpForwarding no +GatewayPorts no +X11Forwarding no +#X11DisplayOffset 10 +#X11UseLocalhost yes +#PermitTTY yes +#PrintMotd yes +#PrintLastLog yes +#TCPKeepAlive yes +#PermitUserEnvironment no +#Compression delayed +#ClientAliveInterval 0 +#ClientAliveCountMax 3 +#UseDNS no +#PidFile /run/sshd.pid +#MaxStartups 10:30:100 +#PermitTunnel no +#ChrootDirectory none +#VersionAddendum none + +# no default banner path +#Banner none + +# override default of no subsystems +Subsystem sftp /usr/lib/ssh/sftp-server + +# Example of overriding settings on a per-user basis +#Match User anoncvs +# X11Forwarding no +# AllowTcpForwarding no +# PermitTTY no +# ForceCommand cvs server +Port 8022 + +HostKey /etc/ssh/keys/ssh_host_rsa_key +HostKey /etc/ssh/keys/ssh_host_dsa_key +HostKey /etc/ssh/keys/ssh_host_ecdsa_key +HostKey /etc/ssh/keys/ssh_host_ed25519_key +PasswordAuthentication no diff --git a/public_dropin_notebook_environments/python311_notebook/start_server.sh b/public_dropin_notebook_environments/python311_notebook/start_server.sh new file mode 100644 index 000000000..49781e7b3 --- /dev/null +++ b/public_dropin_notebook_environments/python311_notebook/start_server.sh @@ -0,0 +1,42 @@ +#!/bin/bash + +# setup the working directory for the kernel +if [ -z "$1" ]; then + # Set default working directory if no argument is provided + WORKING_DIR="/home/notebooks" +else + # Use the provided working directory + WORKING_DIR="$1" +fi + +export WORKING_DIR + +VERBOSE_MODE=true +# shellcheck disable=SC1091 +source /etc/system/kernel/setup-venv.sh $VERBOSE_MODE + +cd /etc/system/kernel/agent || exit +nohup uvicorn agent:app --host 0.0.0.0 --port 8889 & + +# shellcheck disable=SC1091 +source /etc/system/kernel/common-user-limits.sh + +# shellcheck disable=SC1091 +source /etc/system/kernel/setup-ssh.sh +cp -L /var/run/notebooks/ssh/authorized_keys/notebooks /etc/authorized_keys/ && chmod 600 /etc/authorized_keys/notebooks +mkdir /etc/ssh/keys && cp -L /var/run/notebooks/ssh/keys/ssh_host_* /etc/ssh/keys/ && chmod 600 /etc/ssh/keys/ssh_host_* +nohup /usr/sbin/sshd -D & + +# no trailing slash in the working dir path +git config --global --add safe.directory "${WORKING_DIR%/}" + +# setup the working directory for the kernel +cd "$WORKING_DIR" || exit + +# setup ipython extensions +cp -r /etc/ipython/ /home/notebooks/.ipython/ + +# clear out kubernetes_specific env vars before starting kernel gateway as it will inherit them +prefix="KUBERNETES_"; for var in $(printenv | cut -d= -f1); do [[ "$var" == "$prefix"* ]] && unset "$var"; done + +exec jupyter kernelgateway --config=/etc/system/kernel/jupyter_kernel_gateway_config.py --debug
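+
+# exec replaces this shell with the kernel gateway process, so the gateway
+# receives termination signals sent to the startup script directly.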