Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Handle installing libraries multiple times in PipResolver #3024

Merged
merged 11 commits into from
Oct 21, 2024
23 changes: 17 additions & 6 deletions src/databricks/labs/ucx/source_code/python_libraries.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@


class PythonLibraryResolver(LibraryResolver):
# TODO: https://github.com/databrickslabs/ucx/issues/1640
"""Resolve python libraries by registering and installing Python libraries."""

def __init__(
self,
Expand Down Expand Up @@ -61,13 +61,22 @@ def _install_library(self, path_lookup: PathLookup, *libraries: str) -> list[Dep
return self._install_egg(*resolved_libraries)
return self._install_pip(*resolved_libraries)

@staticmethod
def _resolve_libraries(path_lookup: PathLookup, *libraries: str) -> list[str]:
# Resolve relative pip installs from notebooks: %pip install ../../foo.whl
def _resolve_libraries(self, path_lookup: PathLookup, *libraries: str) -> list[str]:
"""Resolve pip installs as library (pip install databricks-labs-ucx) or as path (pip install ../../ucx.whl).

A library is defined as library, i.e. *not* as path, when the library:
1. Is not found in the path lookup.
2. Is not located in the temporary virtual environment where this resolver install libraries. If this is the
case, it signals the library is installed for a second time.

Otherwise, we consider the library to be a path.

Note: The above works given our design choice to *only* build dependency graphs when all dependencies are found.
"""
libs = []
for library in libraries:
maybe_library = path_lookup.resolve(Path(library))
if maybe_library is None:
if maybe_library is None or maybe_library.is_relative_to(self._temporary_virtual_environment):
libs.append(library)
else:
libs.append(maybe_library.as_posix())
Expand All @@ -81,9 +90,11 @@ def _install_pip(self, *libraries: str) -> list[DependencyProblem]:
*libraries,
"-t",
str(self._temporary_virtual_environment),
"--upgrade", # Upgrades when library already installed
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@nfx and @ericvergnaud : Could it ever happen that we have two library version within one dependency graph (or within one PathLookup)? Currently, this is not handled

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

is not the case. also, --upgrade might give a false update

]
return_code, stdout, stderr = self._runner(args)
logger.debug(f"pip output:\n{stdout}\n{stderr}")
if stdout or stderr:
logger.debug(f"pip output:\n{stdout}\n{stderr}")
if return_code != 0:
command = " ".join(args)
problem = DependencyProblem("library-install-failed", f"'{command}' failed with '{stderr}'")
Expand Down
31 changes: 31 additions & 0 deletions tests/integration/source_code/test_libraries.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,8 @@
because it uses the context and the time it takes to run the test.
"""

import logging
import re
from pathlib import Path

import pytest
Expand Down Expand Up @@ -56,3 +58,32 @@ def test_build_notebook_dependency_graphs_fails_installing_when_spaces(simple_ct

assert not maybe.problems
assert maybe.graph.all_relative_names() == {f"{notebook}.py", "thingy/__init__.py"}


def test_build_notebook_dependency_graphs_when_installing_pytest_twice(caplog, simple_ctx) -> None:
pip_already_exists_warning = re.compile(
r".*WARNING: Target directory .+ already exists\. Specify --upgrade to force replacement.*"
)
ctx = simple_ctx.replace(path_lookup=MockPathLookup())
with caplog.at_level(logging.DEBUG, logger="databricks.labs.ucx.source_code.python_libraries"):
maybe = ctx.dependency_resolver.build_notebook_dependency_graph(
Path("pip_install_pytest_twice"), CurrentSessionState()
)
assert not maybe.problems
assert maybe.graph.path_lookup.resolve(Path("pytest"))
assert not pip_already_exists_warning.match(caplog.text.replace("\n", " ")), "Pip already exists warning detected"


@pytest.mark.parametrize(
"notebook",
(
"pip_install_demo_wheel",
"pip_install_multiple_packages",
"pip_install_pytest_with_index_url",
),
)
def test_build_notebook_dependency_graphs_when_installing_notebooks_twice(caplog, simple_ctx, notebook) -> None:
ctx = simple_ctx.replace(path_lookup=MockPathLookup())
for _ in range(2):
maybe = ctx.dependency_resolver.build_notebook_dependency_graph(Path(notebook), CurrentSessionState())
assert not maybe.problems
13 changes: 13 additions & 0 deletions tests/unit/source_code/samples/pip_install_pytest_twice.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
# Databricks notebook source

# COMMAND ----------

# MAGIC %pip install pytest --quiet

# COMMAND ----------

# MAGIC %pip install pytest --quiet

# COMMAND ----------

import pytest