Skip to content

Commit

Permalink
Multiple issues related to the runpod backend #1133
Browse files Browse the repository at this point in the history
  • Loading branch information
peterschmidt85 authored and Sergey Mezentsev committed Apr 15, 2024
1 parent 140438d commit d9b2870
Show file tree
Hide file tree
Showing 2 changed files with 17 additions and 36 deletions.
18 changes: 8 additions & 10 deletions src/dstack/_internal/core/backends/base/compute.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,18 +148,14 @@ def get_gateway_user_data(authorized_key: str) -> str:
)


def get_docker_commands(authorized_keys: List[str]) -> List[str]:
def get_docker_commands(
authorized_keys: List[str], fix_path_in_dot_profile: bool = True
) -> List[str]:
authorized_keys_content = "\n".join(authorized_keys).strip()
commands = [
# note: &> redirection doesn't work in /bin/sh
# check if sshd is installed; install it if not
(
"if ! command -v sshd >/dev/null 2>&1; then { "
"apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y openssh-server; "
"} || { "
"yum -y install openssh-server; "
"}; fi"
),
"if ! command -v sshd >/dev/null 2>&1; then apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y openssh-server || yum install -y openssh-server; fi",
# prohibit password authentication
'sed -i "s/.*PasswordAuthentication.*/PasswordAuthentication no/g" /etc/ssh/sshd_config',
# create ssh dirs and add public key
Expand All @@ -169,7 +165,9 @@ def get_docker_commands(authorized_keys: List[str]) -> List[str]:
"chmod 600 ~/.ssh/authorized_keys",
# preserve environment variables for SSH clients
"env >> ~/.ssh/environment",
"sed -ie '1s@^@export PATH=\"'\"$PATH\"':$PATH\"\\n\\n@' ~/.profile",
"sed -ie '1s@^@export PATH=\"'\"$PATH\"':$PATH\"\\n\\n@' ~/.profile"
if fix_path_in_dot_profile
else ":",
# regenerate host keys
"rm -rf /etc/ssh/ssh_host_*",
"ssh-keygen -A > /dev/null",
Expand All @@ -187,7 +185,7 @@ def get_docker_commands(authorized_keys: List[str]) -> List[str]:
url = f"https://{bucket}.s3.eu-west-1.amazonaws.com/{build}/binaries/dstack-runner-linux-amd64"

commands += [
f'curl --connect-timeout 60 --max-time 240 --retry 1 --output {runner} "{url}"',
f"curl --connect-timeout 60 --max-time 240 --retry 1 --output {runner} {url}",
f"chmod +x {runner}",
f"{runner} --log-level 6 start --http-port 10999 --temp-dir /tmp/runner --home-dir /root --working-dir /workflow",
]
Expand Down
35 changes: 9 additions & 26 deletions src/dstack/_internal/core/backends/runpod/compute.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,8 @@
from typing import List, Optional

from dstack._internal import settings
from dstack._internal.core.backends.base import Compute
from dstack._internal.core.backends.base.compute import (
get_dstack_runner_version,
get_docker_commands,
get_instance_name,
)
from dstack._internal.core.backends.base.offers import get_catalog_offers
Expand Down Expand Up @@ -76,7 +75,7 @@ def run_job(
min_memory_in_gb=memory_size,
support_public_ip=True,
docker_args=get_docker_args(authorized_keys),
ports="22/tcp",
ports="10022/tcp",
)

instance_id = resp["id"]
Expand All @@ -86,7 +85,7 @@ def run_job(
raise ComputeError(f"Wait instance {instance_id} timeout")

for port in pod["runtime"]["ports"]:
if port["privatePort"] == 22:
if port["privatePort"] == 10022:
ip = port["ip"]
publicPort = port["publicPort"]
break
Expand Down Expand Up @@ -115,25 +114,9 @@ def terminate_instance(


# Build the single `bash -c '...'` command string that `run_job` passes as the
# `docker_args` argument when it creates the instance.
#
# authorized_keys: iterable of public-key strings to install for SSH access.
# Returns: one shell command line wrapped in `bash -c '...'`.
#
# NOTE(review): this span looks like a rendered diff with indentation stripped.
# Two alternative bodies follow the `def` line; presumably the first (hand-written)
# is the removed version and the second (delegating) is the added one -- confirm
# against the hunk header before treating either as the live code.
def get_docker_args(authorized_keys):
# --- Variant 1: every shell step is assembled by hand in this function ---
# Keys are joined with a literal backslash-n (two characters), not a real newline.
authorized_keys_content = "\\n".join(authorized_keys).strip()
# Install openssh-server via apt, create ~/.ssh, and append the keys to
# authorized_keys; `cd $_` relies on the shell's "last argument" variable to
# enter the directory just created.
update_and_setup_ssh = f'apt update; DEBIAN_FRONTEND=noninteractive apt-get install openssh-server -y;mkdir -p ~/.ssh;cd $_;chmod 700 ~/.ssh;echo \\"{authorized_keys_content}\\" >> authorized_keys;chmod 700 authorized_keys'
# Append the container's environment to ~/.ssh/environment.
env_cmd = "env >> ~/.ssh/environment"
# Prepend an `export PATH=...` line to the first line of ~/.profile.
sed_cmd = r"sed -ie \"1s@^@export PATH=\\\"''$PATH'':\\$PATH\\\"\\n\\n@\" ~/.profile"
# Drop any existing SSH host keys and generate fresh ones.
rm_rf = "rm -rf /etc/ssh/ssh_host_*"
ssh_key_gen = "ssh-keygen -A > /dev/null"
runner = "/usr/local/bin/dstack-runner"

# Pick the download bucket: the "-stgn" bucket is used unless
# settings.DSTACK_VERSION is set.
build = get_dstack_runner_version()
bucket = "dstack-runner-downloads-stgn"
if settings.DSTACK_VERSION is not None:
bucket = "dstack-runner-downloads"
url = f"https://{bucket}.s3.eu-west-1.amazonaws.com/{build}/binaries/dstack-runner-linux-amd64"

# Download the runner binary, make it executable, then start it.
runner_commands = [
f'curl --connect-timeout 60 --max-time 240 --retry 1 --output {runner} \\"{url}\\"',
f"chmod +x {runner}",
f"{runner} --log-level 6 start --http-port 10999 --temp-dir /tmp/runner --home-dir /root --working-dir /workflow",
]
runner_commands = " && ".join(runner_commands)

# All steps are chained with `&&`; `sleep infinity` follows a `;`, so it runs
# whether or not the chain succeeded.
return f"bash -c '{update_and_setup_ssh} && {env_cmd} && {sed_cmd} && {rm_rf} && {ssh_key_gen} && service ssh start && {runner_commands}; sleep infinity'"
# --- Variant 2: reuse the shared command list from get_docker_commands ---
# The positional `False` is `fix_path_in_dot_profile`, which swaps the
# ~/.profile sed step for a no-op `:`.
commands = get_docker_commands(authorized_keys, False)
command = " && ".join(commands)
# Escape the joined command so it can sit inside the single-quoted wrapper:
# double quotes get a backslash, single quotes are turned into escaped double
# quotes, and real newlines become a literal backslash-n.
# NOTE(review): rewriting ' as \" changes shell quoting semantics for any
# command that relied on single quotes -- verify none of the commands do.
command_escaped = command.replace('"', '\\"')
command_escaped = command_escaped.replace("'", '\\"')
command_escaped = command_escaped.replace("\n", "\\n")
return f"bash -c '{command_escaped}'"

0 comments on commit d9b2870

Please sign in to comment.