-
Notifications
You must be signed in to change notification settings - Fork 5.8k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[dashboard agent] Catch agent port conflict #23024
Changes from 5 commits
fa51a76
15c286b
d2f6229
1c7ecad
b490483
dc9126d
32ce0e2
2d2539e
abb8c49
c3f2f53
355d5b2
6dfcec3
33bfb72
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -84,17 +84,30 @@ def __init__( | |
self.ppid = int(os.environ["RAY_RAYLET_PID"]) | ||
assert self.ppid > 0 | ||
logger.info("Parent pid is %s", self.ppid) | ||
self.server = aiogrpc.server(options=(("grpc.so_reuseport", 0),)) | ||
grpc_ip = "127.0.0.1" if self.ip == "127.0.0.1" else "0.0.0.0" | ||
self.grpc_port = ray._private.tls_utils.add_port_to_grpc_server( | ||
self.server, f"{grpc_ip}:{self.dashboard_agent_port}" | ||
) | ||
logger.info("Dashboard agent grpc address: %s:%s", grpc_ip, self.grpc_port) | ||
|
||
# Setup raylet channel | ||
options = (("grpc.enable_http_proxy", 0),) | ||
self.aiogrpc_raylet_channel = ray._private.utils.init_grpc_channel( | ||
f"{self.ip}:{self.node_manager_port}", options, asynchronous=True | ||
) | ||
|
||
# Setup grpc server | ||
self.server = aiogrpc.server(options=(("grpc.so_reuseport", 0),)) | ||
grpc_ip = "127.0.0.1" if self.ip == "127.0.0.1" else "0.0.0.0" | ||
try: | ||
self.grpc_port = ray._private.tls_utils.add_port_to_grpc_server( | ||
self.server, f"{grpc_ip}:{self.dashboard_agent_port}" | ||
) | ||
except Exception: | ||
logger.exception( | ||
"Failed to add port to grpc server. Agent will stay alive but " | ||
"disable the grpc service." | ||
) | ||
self.server = None | ||
self.grpc_port = None | ||
else: | ||
logger.info("Dashboard agent grpc address: %s:%s", grpc_ip, self.grpc_port) | ||
|
||
# If the agent is started as non-minimal version, http server should | ||
# be configured to communicate with the dashboard in a head node. | ||
self.http_server = None | ||
|
@@ -147,7 +160,8 @@ async def _check_parent(): | |
check_parent_task = create_task(_check_parent()) | ||
|
||
# Start a grpc asyncio server. | ||
await self.server.start() | ||
if self.server: | ||
await self.server.start() | ||
|
||
self.gcs_client = GcsClient(address=self.gcs_address) | ||
modules = self._load_modules() | ||
|
@@ -159,7 +173,13 @@ async def _check_parent(): | |
# Http server is not started in the minimal version because | ||
# it requires additional dependencies that are not | ||
# included in the minimal ray package. | ||
self.http_server = await self._configure_http_server(modules) | ||
try: | ||
self.http_server = await self._configure_http_server(modules) | ||
except Exception: | ||
logger.exception( | ||
"Failed to start http server. Agent will stay alive but " | ||
"disable the http service." | ||
) | ||
|
||
# Write the dashboard agent port to kv. | ||
# TODO: Use async version if performance is an issue | ||
|
@@ -361,36 +381,27 @@ async def _check_parent(): | |
except Exception as e: | ||
# All these env vars should be available because | ||
# they are provided by the parent raylet. | ||
restart_count = os.environ["RESTART_COUNT"] | ||
max_restart_count = os.environ["MAX_RESTART_COUNT"] | ||
raylet_pid = os.environ["RAY_RAYLET_PID"] | ||
node_ip = args.node_ip_address | ||
if restart_count >= max_restart_count: | ||
# Agent is failed to be started many times. | ||
# Push an error to all drivers, so that users can know the | ||
# impact of the issue. | ||
gcs_publisher = GcsPublisher(args.gcs_address) | ||
traceback_str = ray._private.utils.format_error_message( | ||
traceback.format_exc() | ||
) | ||
message = ( | ||
f"(ip={node_ip}) " | ||
f"The agent on node {platform.uname()[1]} failed to " | ||
f"be restarted {max_restart_count} " | ||
"times. There are 3 possible problems if you see this error." | ||
"\n 1. The dashboard might not display correct " | ||
"information on this node." | ||
"\n 2. Metrics on this node won't be reported." | ||
"\n 3. runtime_env APIs won't work." | ||
"\nCheck out the `dashboard_agent.log` to see the " | ||
"detailed failure messages." | ||
) | ||
ray._private.utils.publish_error_to_driver( | ||
ray_constants.DASHBOARD_AGENT_DIED_ERROR, | ||
message, | ||
redis_client=None, | ||
gcs_publisher=gcs_publisher, | ||
) | ||
logger.error(message) | ||
# Agent is failed to be started many times. | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This comment seems wrong? |
||
# Push an error to all drivers, so that users can know the | ||
# impact of the issue. | ||
redis_client = None | ||
gcs_publisher = GcsPublisher(address=args.gcs_address) | ||
|
||
traceback_str = ray._private.utils.format_error_message(traceback.format_exc()) | ||
message = ( | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Do we need it? In this case, raylet will just fail, right? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Yeah, if we enable fate sharing by default, we don't need this. I can remove this. |
||
f"(ip={node_ip}) " | ||
f"The agent on node {platform.uname()[1]} failed to " | ||
"be started. Check out the `dashboard_agent.log` to see the" | ||
"detailed failure messages." | ||
) | ||
ray._private.utils.publish_error_to_driver( | ||
ray_constants.DASHBOARD_AGENT_DIED_ERROR, | ||
message, | ||
redis_client=None, | ||
gcs_publisher=gcs_publisher, | ||
) | ||
logger.error(message) | ||
logger.exception(e) | ||
exit(1) |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -603,7 +603,8 @@ async def _perform_iteration(self, publisher): | |
await asyncio.sleep(reporter_consts.REPORTER_UPDATE_INTERVAL_MS / 1000) | ||
|
||
async def run(self, server): | ||
reporter_pb2_grpc.add_ReporterServiceServicer_to_server(self, server) | ||
if server: | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Any way to just not run this function instead of adding this logic? Seems a bit ugly. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I didn't find a better way. The |
||
reporter_pb2_grpc.add_ReporterServiceServicer_to_server(self, server) | ||
|
||
gcs_addr = self._dashboard_agent.gcs_address | ||
assert gcs_addr is not None | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Can you add a TODO here to remove this in the future (and do better port resolution)?