-
-
Notifications
You must be signed in to change notification settings - Fork 637
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
[pantsd] Robustify client connection logic. #5952
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -27,11 +27,14 @@ class RemotePantsRunner(object): | |
class Fallback(Exception): | ||
"""Raised when fallback to an alternate execution mode is requested.""" | ||
|
||
class PortNotFound(Exception): | ||
"""Raised when the pailgun port can't be found.""" | ||
class Terminated(Exception): | ||
"""Raised when an active run is terminated mid-flight.""" | ||
|
||
PANTS_COMMAND = 'pants' | ||
RECOVERABLE_EXCEPTIONS = (PortNotFound, NailgunClient.NailgunConnectionError) | ||
RECOVERABLE_EXCEPTIONS = ( | ||
NailgunClient.NailgunConnectionError, | ||
NailgunClient.NailgunExecutionError | ||
) | ||
|
||
def __init__(self, exiter, args, env, bootstrap_options, stdin=None, stdout=None, stderr=None): | ||
""" | ||
|
@@ -86,6 +89,44 @@ def _setup_logging(self): | |
root.setLevel(log_level) | ||
root.addHandler(handler) | ||
|
||
@staticmethod | ||
def _backoff(attempt): | ||
"""Minimal backoff strategy for daemon restarts.""" | ||
time.sleep(attempt + (attempt - 1)) | ||
|
||
def _run_pants_with_retry(self, port, retries=3): | ||
"""Runs pants remotely with retry and recovery for nascent executions.""" | ||
attempt = 1 | ||
while 1: | ||
logger.debug( | ||
'connecting to pantsd on port {} (attempt {}/{})'.format(port, attempt, retries) | ||
) | ||
try: | ||
return self._connect_and_execute(port) | ||
except self.RECOVERABLE_EXCEPTIONS as e: | ||
if attempt > retries: | ||
raise self.Fallback(e) | ||
|
||
self._backoff(attempt) | ||
logger.warn( | ||
'pantsd was unresponsive on port {}, retrying ({}/{})' | ||
.format(port, attempt, retries) | ||
) | ||
|
||
# One possible cause of the daemon being non-responsive during an attempt might be if | ||
# another lifecycle operation is happening concurrently (incl teardown). To account for | ||
# this, we won't begin attempting restarts until at least 1 second has passed (1 attempt). | ||
if attempt > 1: | ||
port = self._restart_pantsd() | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Is it possible there are dangling pantsd processes that need to be killed before restarting one? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Ah ok. Thank you! |
||
attempt += 1 | ||
except NailgunClient.NailgunError as e: | ||
# Ensure a newline. | ||
logger.fatal('') | ||
logger.fatal('lost active connection to pantsd!') | ||
raise self.Terminated, ( | ||
'abruptly lost active connection to pantsd runner: {!r}'.format(e) | ||
), e.traceback | ||
|
||
def _connect_and_execute(self, port): | ||
# Merge the nailgun TTY capability environment variables with the passed environment dict. | ||
ng_env = NailgunProtocol.isatty_to_env(self._stdin, self._stdout, self._stderr) | ||
|
@@ -99,7 +140,8 @@ def _connect_and_execute(self, port): | |
ins=self._stdin, | ||
out=self._stdout, | ||
err=self._stderr, | ||
exit_on_broken_pipe=True) | ||
exit_on_broken_pipe=True, | ||
expects_pid=True) | ||
|
||
with self._trapped_signals(client), STTYSettings.preserved(): | ||
# Execute the command on the pailgun. | ||
|
@@ -108,15 +150,13 @@ def _connect_and_execute(self, port): | |
# Exit. | ||
self._exiter.exit(result) | ||
|
||
def _restart_pantsd(self): | ||
return PantsDaemon.Factory.restart(bootstrap_options=self._bootstrap_options) | ||
|
||
def _maybe_launch_pantsd(self): | ||
return PantsDaemon.Factory.maybe_launch(bootstrap_options=self._bootstrap_options) | ||
|
||
def run(self, args=None): | ||
self._setup_logging() | ||
port = self._maybe_launch_pantsd() | ||
|
||
logger.debug('connecting to pailgun on port {}'.format(port)) | ||
try: | ||
self._connect_and_execute(port) | ||
except self.RECOVERABLE_EXCEPTIONS as e: | ||
raise self.Fallback(e) | ||
self._run_pants_with_retry(port) |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Just wondering if there is any particular reason to use `attempt + (attempt - 1)`?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Just trying to keep the total retry overhead under 10 seconds; this staggers nicely as 1, 3, and 5 seconds for 3 retries. The backoff curve here needn't be as steep as in, e.g., a distributed system, where problems like thundering herd are a concern.