Skip to content

Commit

Permalink
Improve error handling for nailgun client connection attempts
Browse files Browse the repository at this point in the history
Fixes #2238

- Use 127.0.0.1 vs localhost for nailgun default host. this prevents a situation where a broken /etc/hosts can affect pants ability to connect to the nailgun (currently raises an unhandled and generic socket.gaierror).
- Add a specialized NailgunClient.NailgunConnectionError exception type.
- Convert NailgunClient.try_connect() from socket.connect_ex() to socket.connect() with added catching of socket.gaierror which is not covered by socket.connect_ex().
- Make NailgunClient.try_connect() raise a helpful NailgunConnectionError exception on failure vs returning None.
- Propagate the helpful NailgunConnectionError in both NailgunClient.__call__() and NailgunExecutor.ensure_connectable().
- Improve debug logging.

Testing Done:
https://travis-ci.org/pantsbuild/pants/builds/81720419 + manual testing.

Before/After for a broken /etc/hosts:

    Exception message: [Errno 8] nodename nor servname provided, or not known
vs
    Exception message: Problem connecting to nailgun server at localhost:55261: gaierror(8, 'nodename nor servname provided, or not known')

Before (with resident ng)/Before (without resident ng)/After for an unconnectable nailgun:

    Exception message: Failed to connect to ng server.
vs
    Exception message: 'NoneType' object has no attribute 'close'
vs
    Exception message: Problem connecting to nailgun server at 127.0.0.1:9999: error(61, 'Connection refused')

Bugs closed: 2238, 2246

Reviewed at https://rbcommons.com/s/twitter/r/2869/
  • Loading branch information
kwlzn committed Sep 23, 2015
1 parent 791b7af commit 35ba6bf
Show file tree
Hide file tree
Showing 2 changed files with 41 additions and 23 deletions.
45 changes: 33 additions & 12 deletions src/python/pants/java/nailgun_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from __future__ import (absolute_import, division, generators, nested_scopes, print_function,
unicode_literals, with_statement)

import logging
import os
import select
import socket
Expand All @@ -14,6 +15,9 @@
from functools import partial


logger = logging.getLogger(__name__)


class NailgunSession(object):
"""Handles a single nailgun command session."""

Expand Down Expand Up @@ -148,9 +152,12 @@ class NailgunClient(object):
"""

class NailgunError(Exception):
"""Indicates an error connecting to or interacting with a nailgun server."""
"""Indicates an error interacting with a nailgun server."""

DEFAULT_NG_HOST = 'localhost'
class NailgunConnectionError(NailgunError):
"""Indicates an error upon initial connect to the nailgun server."""

DEFAULT_NG_HOST = '127.0.0.1'
DEFAULT_NG_PORT = 2113

# For backwards compatibility with nails expecting the ng c client special env vars.
Expand All @@ -168,7 +175,7 @@ def __init__(self,
workdir=None):
"""Creates a nailgun client that can be used to issue zero or more nailgun commands.
:param string host: the nailgun server to contact (defaults to localhost)
:param string host: the nailgun server to contact (defaults to '127.0.0.1')
:param int port: the port the nailgun server is listening on (defaults to the default nailgun
port: 2113)
:param file ins: a file to read command standard input from (defaults to stdin) - can be None
Expand All @@ -187,8 +194,21 @@ def __init__(self,
self.execute = self.__call__

def try_connect(self):
"""Creates a socket, connects it to the nailgun and returns the connected socket.
:returns: a connected `socket.socket`.
:raises: `NailgunClient.NailgunConnectionError` on failure to connect.
"""
sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
return sock if sock.connect_ex((self._host, self._port)) == 0 else None
try:
sock.connect((self._host, self._port))
except (socket.error, socket.gaierror) as e:
logger.debug('Encountered socket exception {!r} when attempting connect to nailgun'.format(e))
sock.close()
raise self.NailgunConnectionError(
'Problem connecting to nailgun server at {}:{}: {!r}'.format(self._host, self._port, e))
else:
return sock

def __call__(self, main_class, cwd=None, *args, **environment):
"""Executes the given main_class with any supplied args in the given environment.
Expand All @@ -203,22 +223,23 @@ def __call__(self, main_class, cwd=None, *args, **environment):
"""
environment = dict(self.ENV_DEFAULTS.items() + environment.items())
cwd = cwd or self._workdir

# N.B. This can throw NailgunConnectionError.
sock = self.try_connect()
if not sock:
raise self.NailgunError('Problem connecting to nailgun server'
' {}:{}'.format(self._host, self._port))

session = NailgunSession(sock, self._ins, self._out, self._err)
try:
return session.execute(cwd, main_class, *args, **environment)
except socket.error as e:
raise self.NailgunError('Problem contacting nailgun server {}:{}:'
' {}'.format(self._host, self._port, e))
raise self.NailgunError('Problem communicating with nailgun server at {}:{}: {!r}'
.format(self._host, self._port, e))
except session.ProtocolError as e:
raise self.NailgunError('Problem executing the nailgun protocol with nailgun server {}:{}:'
' {}'.format(self._host, self._port, e))
raise self.NailgunError('Problem in nailgun protocol with nailgun server at {}:{}: {!r}'
.format(self._host, self._port, e))
finally:
sock.close()

def __repr__(self):
return 'NailgunClient(host={!r}, port={!r}, workdir={!r})'.format(self._host, self._port, self._workdir)
return 'NailgunClient(host={!r}, port={!r}, workdir={!r})'.format(self._host,
self._port,
self._workdir)
19 changes: 8 additions & 11 deletions src/python/pants/java/nailgun_executor.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
import select
import threading
import time
from contextlib import closing

from six import string_types
from twitter.common.collections import maybe_list
Expand Down Expand Up @@ -210,20 +211,16 @@ def _create_ngclient(self, port, stdout, stderr):

def ensure_connectable(self, nailgun):
"""Ensures that a nailgun client is connectable or raises NailgunError."""
attempt_count = 0
attempt_count = 1
while 1:
if attempt_count > self._connect_attempts:
logger.debug('Failed to connect to ng after {count} attempts'
.format(count=self._connect_attempts))
raise NailgunClient.NailgunError('Failed to connect to ng server.')

try:
sock = nailgun.try_connect()
if sock:
logger.debug('Connected to ng server {server!r}'.format(server=self))
with closing(nailgun.try_connect()) as sock:
logger.debug('Verified new ng server is connectable at {}'.format(sock.getpeername()))
return
finally:
sock.close()
except nailgun.NailgunConnectionError:
if attempt_count >= self._connect_attempts:
logger.debug('Failed to connect to ng after {} attempts'.format(self._connect_attempts))
raise # Re-raise the NailgunConnectionError which provides more context to the user.

attempt_count += 1
time.sleep(self.WAIT_INTERVAL_SEC)
Expand Down

0 comments on commit 35ba6bf

Please sign in to comment.