Skip to content

Commit

Permalink
Set backends requests timeouts (#1793)
Browse files: browse the repository at this point in the history.
* Add runpod requests timeout

* Add azure requests timeout

* Add lambda requests timeout
  • Loading branch information
r4victor authored Oct 7, 2024
1 parent 6501d48 commit d53fb5e
Show file tree
Hide file tree
Showing 3 changed files with 29 additions and 18 deletions.
15 changes: 13 additions & 2 deletions src/dstack/_internal/core/backends/azure/compute.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@
get_user_data,
)
from dstack._internal.core.backends.base.offers import get_catalog_offers
from dstack._internal.core.errors import NoCapacityError
from dstack._internal.core.errors import ComputeError, NoCapacityError
from dstack._internal.core.models.backends.base import BackendType
from dstack._internal.core.models.gateways import (
GatewayComputeConfiguration,
Expand Down Expand Up @@ -460,7 +460,18 @@ def _launch_instance(
message = e.error.message if e.error.message is not None else ""
raise NoCapacityError(message)
raise e
vm = poller.result()
vm = poller.result(timeout=600)
if not poller.done():
logger.error(
f"Timed out waiting for instance {instance_name} launch. "
"The instance will be terminated."
)
_terminate_instance(
compute_client=compute_client,
resource_group=resource_group,
instance_name=instance_name,
)
raise ComputeError(f"Timed out waiting for instance {instance_name} launch")
return vm


Expand Down
28 changes: 14 additions & 14 deletions src/dstack/_internal/core/backends/lambdalabs/api_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,40 +47,40 @@ def launch_instances(
"name": name,
}
resp = self._make_request("POST", "/instance-operations/launch", data)
if resp.ok:
return resp.json()["data"]["instance_ids"]
resp.raise_for_status()
if not resp.ok:
resp.raise_for_status()
return resp.json()["data"]["instance_ids"]

def terminate_instances(self, instance_ids: List[str]) -> List[str]:
data = {"instance_ids": instance_ids}
resp = self._make_request("POST", "/instance-operations/terminate", data)
if resp.ok:
return resp.json()["data"]
resp.raise_for_status()
if not resp.ok:
resp.raise_for_status()
return resp.json()["data"]

def list_ssh_keys(self) -> List[Dict]:
resp = self._make_request("GET", "/ssh-keys")
if resp.ok:
return resp.json()["data"]
resp.raise_for_status()
if not resp.ok:
resp.raise_for_status()
return resp.json()["data"]

def add_ssh_key(self, name: str, public_key: str) -> List[Dict]:
data = {
"name": name,
"public_key": public_key,
}
resp = self._make_request("POST", "/ssh-keys", data)
if resp.ok:
return resp.json()["data"]
resp.raise_for_status()
if not resp.ok:
resp.raise_for_status()
return resp.json()["data"]

def _make_request(self, method: str, path: str, data: Any = None):
# TODO: fix S113 by setting an adequate timeout here or in every method
return requests.request( # noqa: S113
return requests.request(
method=method,
url=API_URL + path,
json=data,
headers={"Authorization": f"Bearer {self.api_key}"},
timeout=120,
)

def _url(self, path: str) -> str:
Expand Down
4 changes: 2 additions & 2 deletions src/dstack/_internal/core/backends/runpod/api_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -230,11 +230,11 @@ def delete_network_volume(self, volume_id: str):

def _make_request(self, data: Any = None) -> Response:
try:
# TODO: fix S113 by setting an adequate timeout here or in every method
response = requests.request( # noqa: S113
response = requests.request(
method="POST",
url=f"{API_URL}?api_key={self.api_key}",
json=data,
timeout=120,
)
response.raise_for_status()
if "errors" in response.json():
Expand Down

0 comments on commit d53fb5e

Please sign in to comment.