Skip to content

Commit

Permalink
feat(cli): introduce remote config for quickstart (datahub-project#7424)
Browse files Browse the repository at this point in the history
Co-authored-by: Harshal Sheth <[email protected]>
  • Loading branch information
szalai1 and hsheth2 authored Mar 7, 2023
1 parent 406b11a commit 1d33392
Show file tree
Hide file tree
Showing 7 changed files with 308 additions and 83 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,8 @@ services:
volumes:
- ${HOME}/.datahub/plugins:/etc/datahub/plugins
datahub-upgrade:
labels:
datahub_setup_job: true
command:
- -u
- SystemUpdate
Expand Down Expand Up @@ -141,6 +143,8 @@ services:
volumes:
- esdata:/usr/share/elasticsearch/data
elasticsearch-setup:
labels:
datahub_setup_job: true
container_name: elasticsearch-setup
depends_on:
- elasticsearch
Expand All @@ -153,6 +157,8 @@ services:
labels:
datahub_setup_job: true
kafka-setup:
labels:
datahub_setup_job: true
container_name: kafka-setup
depends_on:
- broker
Expand Down Expand Up @@ -181,6 +187,8 @@ services:
- ../mysql/init.sql:/docker-entrypoint-initdb.d/init.sql
- mysqldata:/var/lib/mysql
mysql-setup:
labels:
datahub_setup_job: true
container_name: mysql-setup
depends_on:
- mysql
Expand Down
2 changes: 2 additions & 0 deletions docker/quickstart/docker-compose-without-neo4j.quickstart.yml
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,8 @@ services:
- ../mysql/init.sql:/docker-entrypoint-initdb.d/init.sql
- mysqldata:/var/lib/mysql
mysql-setup:
labels:
datahub_setup_job: true
container_name: mysql-setup
depends_on:
- mysql
Expand Down
18 changes: 18 additions & 0 deletions docker/quickstart/quickstart_version_mapping.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
# if --version is specified during CLI run
# the CLI will try to use the same quickstart and docker tags
# quickstart_mapping maps broken releases to fixed ones
quickstart_version_map:
# default key is mandatory and is used if no version is specified
# in case of a broken release or broken master branch
# specify a working version here
default:
composefile_git_ref: master
docker_tag: head
# v0.9.6 images contain security vulnerabilities
v0.9.6:
composefile_git_ref: v0.9.6.1
docker_tag: v0.9.6.1
# if stable is not defined the latest released version will be used
# stable:
# composefile_git_ref: master
# docker_tag: head
72 changes: 24 additions & 48 deletions metadata-ingestion/src/datahub/cli/docker_check.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,43 +7,10 @@
import docker
import docker.errors
import docker.models.containers
import yaml

from datahub.configuration.common import ExceptionWithProps

REQUIRED_CONTAINERS = [
"elasticsearch",
"datahub-gms",
"datahub-frontend-react",
"broker",
]

# We expect these containers to exit 0, while all other containers
# are expected to be running and healthy.
ENSURE_EXIT_SUCCESS = [
"kafka-setup",
"elasticsearch-setup",
"mysql-setup",
"datahub-upgrade",
]

# If present, we check that the container is ok. If it exists
# in ENSURE_EXIT_SUCCESS, we check that it exited 0. Otherwise,
# we check that it is running and healthy.
CONTAINERS_TO_CHECK_IF_PRESENT = [
"mysql",
"mysql-setup",
"cassandra",
"cassandra-setup",
"neo4j",
"elasticsearch-setup",
"schema-registry",
"zookeeper",
"datahub-upgrade",
"kafka-setup",
# "datahub-mce-consumer",
# "datahub-mae-consumer",
]

# Docker seems to under-report memory allocated, so we also need a bit of buffer to account for it.
MIN_MEMORY_NEEDED = 3.8 # GB

Expand Down Expand Up @@ -189,28 +156,38 @@ def to_exception(

def check_docker_quickstart() -> QuickstartStatus:
container_statuses: List[DockerContainerStatus] = []

with get_docker_client() as client:
containers = client.containers.list(
all=True,
filters=DATAHUB_COMPOSE_PROJECT_FILTER,
)

if len(containers) == 0:
return QuickstartStatus([])

# load the expected containers from the docker-compose file
config_files = (
containers[0]
.labels.get("com.docker.compose.project.config_files")
.split(",")
)
all_containers = set()
for config_file in config_files:
with open(config_file, "r") as config_file:
all_containers.update(
yaml.safe_load(config_file).get("services", {}).keys()
)

existing_containers = set()
# Check that the containers are running and healthy.
container: docker.models.containers.Container
for container in containers:
name = container.name
name = container.labels.get("com.docker.compose.service", container.name)
existing_containers.add(name)
status = ContainerStatus.OK

if container.name not in (
REQUIRED_CONTAINERS + CONTAINERS_TO_CHECK_IF_PRESENT
):
# Ignores things like "datahub-frontend" which are no longer used.
# This way, we only check required containers like "datahub-frontend-react"
# even if there are some old containers lying around.
if name not in all_containers:
# Ignores containers that are not part of the datahub docker-compose
continue

if container.name in ENSURE_EXIT_SUCCESS:
if container.labels.get("datahub_setup_job", False):
if container.status != "exited":
status = ContainerStatus.STILL_RUNNING
elif container.attrs["State"]["ExitCode"] != 0:
Expand All @@ -227,8 +204,7 @@ def check_docker_quickstart() -> QuickstartStatus:
container_statuses.append(DockerContainerStatus(name, status))

# Check for missing containers.
existing_containers = {container.name for container in containers}
missing_containers = set(REQUIRED_CONTAINERS) - existing_containers
missing_containers = set(all_containers) - existing_containers
for missing in missing_containers:
container_statuses.append(
DockerContainerStatus(missing, ContainerStatus.MISSING)
Expand Down
76 changes: 41 additions & 35 deletions metadata-ingestion/src/datahub/cli/docker_cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,14 +28,11 @@
get_docker_client,
run_quickstart_preflight_checks,
)
from datahub.cli.quickstart_versioning import QuickstartVersionMappingConfig
from datahub.ingestion.run.pipeline import Pipeline
from datahub.telemetry import telemetry
from datahub.upgrade import upgrade
from datahub.utilities.sample_data import (
BOOTSTRAP_MCES_FILE,
DOCKER_COMPOSE_BASE,
download_sample_data,
)
from datahub.utilities.sample_data import BOOTSTRAP_MCES_FILE, download_sample_data

logger = logging.getLogger(__name__)

Expand All @@ -60,18 +57,6 @@
KAFKA_SETUP_QUICKSTART_COMPOSE_FILE = (
"docker/quickstart/docker-compose.kafka-setup.quickstart.yml"
)
NEO4J_AND_ELASTIC_QUICKSTART_COMPOSE_URL = (
f"{DOCKER_COMPOSE_BASE}/{NEO4J_AND_ELASTIC_QUICKSTART_COMPOSE_FILE}"
)
ELASTIC_QUICKSTART_COMPOSE_URL = (
f"{DOCKER_COMPOSE_BASE}/{ELASTIC_QUICKSTART_COMPOSE_FILE}"
)
NEO4J_AND_ELASTIC_M1_QUICKSTART_COMPOSE_URL = (
f"{DOCKER_COMPOSE_BASE}/{NEO4J_AND_ELASTIC_M1_QUICKSTART_COMPOSE_FILE}"
)
ELASTIC_M1_QUICKSTART_COMPOSE_URL = (
f"{DOCKER_COMPOSE_BASE}/{ELASTIC_M1_QUICKSTART_COMPOSE_FILE}"
)


class Architectures(Enum):
Expand Down Expand Up @@ -119,7 +104,6 @@ def _print_issue_list_and_exit(
@telemetry.with_telemetry()
def check() -> None:
"""Check that the Docker containers are healthy"""

status = check_docker_quickstart()
if status.is_ok():
click.secho("✔ No issues detected", fg="green")
Expand Down Expand Up @@ -450,8 +434,8 @@ def detect_quickstart_arch(arch: Optional[str]) -> Architectures:
@click.option(
"--version",
type=str,
default=None,
help="Datahub version to be deployed. If not set, deploy using the defaults from the quickstart compose",
default="default",
help="Datahub version to be deployed. If not set, deploy using the defaults from the quickstart compose. Use 'stable' to start the latest stable version.",
)
@click.option(
"--build-locally",
Expand Down Expand Up @@ -658,6 +642,11 @@ def quickstart(
return

quickstart_arch = detect_quickstart_arch(arch)
quickstart_versioning = QuickstartVersionMappingConfig.fetch_quickstart_config()
quickstart_execution_plan = quickstart_versioning.get_quickstart_execution_plan(
version
)
click.echo(quickstart_execution_plan)

# Run pre-flight checks.
with get_docker_client() as client:
Expand All @@ -683,11 +672,12 @@ def quickstart(
kafka_setup,
quickstart_arch,
standalone_consumers,
quickstart_execution_plan.composefile_git_ref,
)

# set version
_set_environment_variables(
version=version,
version=quickstart_execution_plan.docker_tag,
mysql_port=mysql_port,
zk_port=zk_port,
kafka_broker_port=kafka_broker_port,
Expand Down Expand Up @@ -803,28 +793,43 @@ def quickstart(
)


def get_docker_compose_base_url(version_tag: str) -> str:
if os.environ.get("DOCKER_COMPOSE_BASE"):
return os.environ["DOCKER_COMPOSE_BASE"]

return f"https://raw.githubusercontent.com/datahub-project/datahub/{version_tag}"


def get_github_file_url(neo4j: bool, is_m1: bool, release_version_tag: str) -> str:
base_url = get_docker_compose_base_url(release_version_tag)
if neo4j:
github_file = (
f"{base_url}/{NEO4J_AND_ELASTIC_QUICKSTART_COMPOSE_FILE}"
if not is_m1
else f"{base_url}/{NEO4J_AND_ELASTIC_M1_QUICKSTART_COMPOSE_FILE}"
)
else:
github_file = (
f"{base_url}/{ELASTIC_QUICKSTART_COMPOSE_FILE}"
if not is_m1
else f"{base_url}/{ELASTIC_M1_QUICKSTART_COMPOSE_FILE}"
)
return github_file


def download_compose_files(
quickstart_compose_file_name,
quickstart_compose_file_list,
graph_service_impl,
kafka_setup,
quickstart_arch,
standalone_consumers,
compose_git_ref,
):
# download appropriate quickstart file
should_use_neo4j = should_use_neo4j_for_graph_service(graph_service_impl)
if should_use_neo4j:
github_file = (
NEO4J_AND_ELASTIC_QUICKSTART_COMPOSE_URL
if not is_arch_m1(quickstart_arch)
else NEO4J_AND_ELASTIC_M1_QUICKSTART_COMPOSE_URL
)
else:
github_file = (
ELASTIC_QUICKSTART_COMPOSE_URL
if not is_arch_m1(quickstart_arch)
else ELASTIC_M1_QUICKSTART_COMPOSE_URL
)
is_m1 = is_arch_m1(quickstart_arch)
github_file = get_github_file_url(should_use_neo4j, is_m1, compose_git_ref)
# also allow local files
request_session = requests.Session()
request_session.mount("file://", FileAdapter())
Expand All @@ -842,10 +847,11 @@ def download_compose_files(
tmp_file.write(quickstart_download_response.content)
logger.debug(f"Copied to {path}")
if standalone_consumers:
base_url = get_docker_compose_base_url(compose_git_ref)
consumer_github_file = (
f"{DOCKER_COMPOSE_BASE}/{CONSUMERS_QUICKSTART_COMPOSE_FILE}"
f"{base_url}/{CONSUMERS_QUICKSTART_COMPOSE_FILE}"
if should_use_neo4j
else f"{DOCKER_COMPOSE_BASE}/{ELASTIC_CONSUMERS_QUICKSTART_COMPOSE_FILE}"
else f"{base_url}/{ELASTIC_CONSUMERS_QUICKSTART_COMPOSE_FILE}"
)

default_consumer_compose_file = (
Expand Down
Loading

0 comments on commit 1d33392

Please sign in to comment.