Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Core] Make sure dashboard agent will exit if grpc server fails #44899

Merged
merged 1 commit into from
Apr 22, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 19 additions & 18 deletions dashboard/agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@ def __init__(

def _init_non_minimal(self):
from ray._private.gcs_pubsub import GcsAioPublisher
from ray.dashboard.http_server_agent import HttpServerAgent

self.aio_publisher = GcsAioPublisher(address=self.gcs_address)

Expand Down Expand Up @@ -137,12 +138,12 @@ def _init_non_minimal(self):
else:
logger.info("Dashboard agent grpc address: %s:%s", grpc_ip, self.grpc_port)

async def _configure_http_server(self, modules):
from ray.dashboard.http_server_agent import HttpServerAgent

http_server = HttpServerAgent(self.ip, self.listen_port)
await http_server.start(modules)
return http_server
# If the agent is not minimal it should start the http server
# to communicate with the dashboard in a head node.
# Http server is not started in the minimal version because
# it requires additional dependencies that are not
# included in the minimal ray package.
self.http_server = HttpServerAgent(self.ip, self.listen_port)

def _load_modules(self):
"""Load dashboard agent modules."""
Expand Down Expand Up @@ -183,15 +184,9 @@ async def run(self):

modules = self._load_modules()

# Setup http server if necessary.
if not self.minimal:
# If the agent is not minimal it should start the http server
# to communicate with the dashboard in a head node.
# Http server is not started in the minimal version because
# it requires additional dependencies that are not
# included in the minimal ray package.
if self.http_server:
try:
self.http_server = await self._configure_http_server(modules)
await self.http_server.start(modules)
except Exception:
# TODO(SongGuyang): Catch the exception here because there is
# port conflict issue which brought from static port. We should
Expand All @@ -214,6 +209,7 @@ async def run(self):
)

tasks = [m.run(self.server) for m in modules]

if sys.platform not in ["win32", "cygwin"]:

def callback(msg):
Expand All @@ -225,13 +221,18 @@ def callback(msg):
self.log_dir, self.gcs_address, callback, loop
)
tasks.append(check_parent_task)
await asyncio.gather(*tasks)

if self.server:
await self.server.wait_for_termination()
tasks.append(self.server.wait_for_termination())
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I manually test it with a broken grpcio version but I don't know how to write a test for it since I don't know how to let wait_for_termination() throw an exception with a good grpcio version.

else:
while True:
await asyncio.sleep(3600) # waits forever

async def wait_forever(self):
while True:
await asyncio.sleep(3600)

tasks.append(wait_forever())

await asyncio.gather(*tasks)

if self.http_server:
await self.http_server.cleanup()
Expand Down
2 changes: 1 addition & 1 deletion dashboard/modules/healthz/healthz_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,4 +50,4 @@ async def run(self, server):

@staticmethod
def is_minimal_module():
return True
return False
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this is not a minimal module since it depends on aio http server. cc @rkooo567 can you confirm?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

yeah I think you are right

Loading