-
Notifications
You must be signed in to change notification settings - Fork 503
/
command_runner.py
378 lines (341 loc) · 14.3 KB
/
command_runner.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
"""Runner for commands to be executed on the cluster."""
import getpass
import enum
import hashlib
import os
import pathlib
import shlex
import time
from typing import List, Optional, Tuple, Union
from sky import sky_logging
from sky.utils import common_utils, subprocess_utils
from sky.skylet import log_lib
logger = sky_logging.init_logger(__name__)
# The git exclude file to support.
GIT_EXCLUDE = '.git/info/exclude'
# Rsync options.
# -P: per-file progress; -a: archive mode (recursive, preserves permissions,
# times, symlinks); -v: verbose; -z: compress during transfer.
RSYNC_DISPLAY_OPTION = '-Pavz'
# Legend
#   dir-merge: ignore file can appear in any subdir, applies to that
#   subdir downwards
# Note that "-" is mandatory for rsync and means all patterns in the ignore
# files are treated as *exclude* patterns. Non-exclude patterns, e.g., "!
# do_not_exclude" doesn't work, even though git allows it.
RSYNC_FILTER_OPTION = '--filter=\'dir-merge,- .gitignore\''
RSYNC_EXCLUDE_OPTION = '--exclude-from={}'
# Length to which the md5 of the ssh control name is truncated (see
# SSHCommandRunner.__init__), keeping ssh ControlPath socket paths short.
_HASH_MAX_LENGTH = 10
def _ssh_control_path(ssh_control_filename: Optional[str]) -> Optional[str]:
    """Returns a per-user temp directory for the ssh control socket.

    The directory is created if it does not already exist. Returns None when
    no control filename is given (multiplexing disabled).
    """
    if ssh_control_filename is None:
        return None
    user = getpass.getuser()
    control_dir = f'/tmp/skypilot_ssh_{user}/{ssh_control_filename}'
    os.makedirs(control_dir, exist_ok=True)
    return control_dir
def ssh_options_list(ssh_private_key: Optional[str],
                     ssh_control_name: Optional[str],
                     *,
                     ssh_proxy_command: Optional[str] = None,
                     timeout: int = 30) -> List[str]:
    """Returns a list of sane options for 'ssh'.

    Forked from Ray SSHOptions:
    https://github.com/ray-project/ray/blob/master/python/ray/autoscaler/_private/command_runner.py

    Args:
        ssh_private_key: Path to the identity file, or None to omit '-i'.
        ssh_control_name: Name used to derive the ControlPath directory for
            connection multiplexing; None disables multiplexing.
        ssh_proxy_command: Optional value for '-o ProxyCommand'.
        timeout: Connection timeout in seconds.

    Returns:
        A flat argv fragment: ['-i', key, '-o', 'Key=Value', ...].
    """
    options = {
        # Skip the interactive fingerprint confirmation on first connect.
        'StrictHostKeyChecking': 'no',
        # Do not record host keys. This avoids a 'REMOTE HOST IDENTIFICATION
        # HAS CHANGED' warning when a newly launched node reuses the IP of a
        # previously deleted node, since their fingerprints would not match.
        'UserKnownHostsFile': os.devnull,
        # Offer only the explicitly configured key(s).
        'IdentitiesOnly': 'yes',
        # Fail fast if port forwarding cannot be set up, rather than only
        # printing a warning to stderr.
        'ExitOnForwardFailure': 'yes',
        # Tear down dead connections quickly (3 missed keepalives at 5s
        # intervals) instead of hanging/blocking.
        'ServerAliveInterval': 5,
        'ServerAliveCountMax': 3,
        # Give up on unreachable hosts after `timeout` seconds.
        'ConnectTimeout': f'{timeout}s',
        # Forward the ssh agent, e.g. for git over ssh.
        'ForwardAgent': 'yes',
    }
    if ssh_control_name is not None:
        # Multiplex connections through a control socket: an important
        # optimization, as one sky.launch() performs multiple ssh calls.
        options['ControlMaster'] = 'auto'
        options['ControlPath'] = f'{_ssh_control_path(ssh_control_name)}/%C'
        options['ControlPersist'] = '300s'
    if ssh_proxy_command is not None:
        logger.debug(f'--- Proxy: {ssh_proxy_command} ---')
        # log_lib.run_with_log() executes with shell=True, so this value
        # must be quoted.
        options['ProxyCommand'] = shlex.quote(ssh_proxy_command)
    args = [] if ssh_private_key is None else ['-i', ssh_private_key]
    for option, value in options.items():
        if value is not None:
            args.extend(['-o', f'{option}={value}'])
    return args
class SshMode(enum.Enum):
    """Enum for SSH mode."""
    # Do not allocate a pseudo-tty, to avoid user input corrupting outputs.
    NON_INTERACTIVE = 0
    # Allocate a pseudo-tty, quit the ssh session after the cmd finishes.
    # Be careful of this mode, as ctrl-c will be passed to the remote process.
    INTERACTIVE = 1
    # Allocate a pseudo-tty and log into the ssh session.
    LOGIN = 2
class SSHCommandRunner:
    """Runner for SSH commands on a remote node."""

    def __init__(
        self,
        ip: str,
        ssh_user: str,
        ssh_private_key: str,
        ssh_control_name: Optional[str] = '__default__',
        ssh_proxy_command: Optional[str] = None,
    ):
        """Initialize SSHCommandRunner.

        Example Usage:
            runner = SSHCommandRunner(ip, ssh_user, ssh_private_key)
            runner.run('ls -l', ssh_mode=SshMode.NON_INTERACTIVE)
            runner.rsync(source, target, up=True)

        Args:
            ip: The IP address of the remote machine.
            ssh_private_key: The path to the private key to use for ssh.
            ssh_user: The user to use for ssh.
            ssh_control_name: The file name of the ssh_control to use. This is
                used to avoid conflicts between clusters when creating ssh
                control files. It can simply be the cluster_name or any name
                that can distinguish between clusters.
            ssh_proxy_command: Optional, the value to pass to '-o
                ProxyCommand'. Useful for communicating with clusters without
                public IPs using a "jump server".
        """
        self.ip = ip
        self.ssh_user = ssh_user
        self.ssh_private_key = ssh_private_key
        # Hash and truncate the control name so the derived ControlPath
        # stays a short, fixed length (see _HASH_MAX_LENGTH).
        self.ssh_control_name = (
            None if ssh_control_name is None else hashlib.md5(
                ssh_control_name.encode()).hexdigest()[:_HASH_MAX_LENGTH])
        self._ssh_proxy_command = ssh_proxy_command

    @staticmethod
    def make_runner_list(
        ip_list: List[str],
        ssh_user: str,
        ssh_private_key: str,
        ssh_control_name: Optional[str] = None,
        ssh_proxy_command: Optional[str] = None,
    ) -> List['SSHCommandRunner']:
        """Helper function for creating runners with the same ssh credentials"""
        return [
            SSHCommandRunner(ip, ssh_user, ssh_private_key, ssh_control_name,
                             ssh_proxy_command) for ip in ip_list
        ]

    def _ssh_base_command(self, *, ssh_mode: SshMode,
                          port_forward: Optional[List[int]]) -> List[str]:
        """Builds the leading `ssh ...` argv, up to and including user@ip.

        Args:
            ssh_mode: Controls pseudo-tty allocation; see SshMode.
            port_forward: Local ports to forward to the same port number on
                the remote host, or None for no forwarding.
        """
        ssh = ['ssh']
        if ssh_mode == SshMode.NON_INTERACTIVE:
            # Disable pseudo-terminal allocation. Otherwise, the output of
            # ssh will be corrupted by the user's input.
            ssh += ['-T']
        else:
            # Force pseudo-terminal allocation for interactive/login mode.
            ssh += ['-tt']
        if port_forward is not None:
            for port in port_forward:
                # Local and remote port numbers are always identical here.
                local = remote = port
                logger.info(
                    f'Forwarding port {local} to port {remote} on localhost.')
                ssh += ['-L', f'{remote}:localhost:{local}']
        return ssh + ssh_options_list(
            self.ssh_private_key,
            self.ssh_control_name,
            ssh_proxy_command=self._ssh_proxy_command,
        ) + [f'{self.ssh_user}@{self.ip}']

    def run(
        self,
        cmd: Union[str, List[str]],
        *,
        require_outputs: bool = False,
        port_forward: Optional[List[int]] = None,
        # Advanced options.
        log_path: str = os.devnull,
        # If False, do not redirect stdout/stderr to optimize performance.
        process_stream: bool = True,
        stream_logs: bool = True,
        ssh_mode: SshMode = SshMode.NON_INTERACTIVE,
        separate_stderr: bool = False,
        **kwargs) -> Union[int, Tuple[int, str, str]]:
        """Uses 'ssh' to run 'cmd' on the node this runner was built with.

        Args:
            cmd: The command to run. A list is joined with spaces into one
                shell command, except in LOGIN mode where it is passed as
                argv directly to ssh.
            require_outputs: Whether to return the stdout/stderr of the
                command in addition to the returncode.
            port_forward: A list of ports to forward from the localhost to the
                remote host.
            log_path: Redirect stdout/stderr to the log_path.
            process_stream: If False, do not post-process the output stream
                in Python; bash login-shell warnings are stripped with `tail`
                on the remote side instead (NON_INTERACTIVE mode only).
            stream_logs: Stream logs to the stdout/stderr.
            ssh_mode: The mode to use for ssh.
                See SshMode for more details.
            separate_stderr: Whether to separate stderr from stdout.
            **kwargs: Forwarded to log_lib.run_with_log().

        Returns:
            returncode
            or
            A tuple of (returncode, stdout, stderr).
        """
        base_ssh_command = self._ssh_base_command(ssh_mode=ssh_mode,
                                                  port_forward=port_forward)
        if ssh_mode == SshMode.LOGIN:
            assert isinstance(cmd, list), 'cmd must be a list for login mode.'
            command = base_ssh_command + cmd
            proc = subprocess_utils.run(command, shell=False, check=False)
            # NOTE(review): LOGIN mode always returns a 3-tuple, even when
            # require_outputs is False — unlike the other modes.
            return proc.returncode, '', ''
        if isinstance(cmd, list):
            cmd = ' '.join(cmd)
        # Make sure the log directory exists before redirecting into it.
        log_dir = os.path.expanduser(os.path.dirname(log_path))
        os.makedirs(log_dir, exist_ok=True)
        # We need this to correctly run the cmd, and get the output.
        command = [
            'bash',
            '--login',
            '-c',
            # Need this `-i` option to make sure `source ~/.bashrc` work.
            '-i',
        ]
        # Wrap the user command so the remote login shell environment
        # (~/.bashrc, OMP_NUM_THREADS, PYTHONWARNINGS) is applied first.
        command += [
            shlex.quote(f'true && source ~/.bashrc && export OMP_NUM_THREADS=1 '
                        f'PYTHONWARNINGS=ignore && ({cmd})'),
        ]
        if not separate_stderr:
            command.append('2>&1')
        if not process_stream and ssh_mode == SshMode.NON_INTERACTIVE:
            command += [
                # A hack to remove the following bash warnings (twice):
                #   bash: cannot set terminal process group
                #   bash: no job control in this shell
                '| stdbuf -o0 tail -n +5',
                # This is required to make sure the executor of command can get
                # correct returncode, since linux pipe is used.
                '; exit ${PIPESTATUS[0]}'
            ]
        # Quote the whole remote command once more: it becomes a single
        # argument to ssh.
        command_str = ' '.join(command)
        command = base_ssh_command + [shlex.quote(command_str)]
        executable = None
        if not process_stream:
            if stream_logs:
                command += [
                    f'| tee {log_path}',
                    # This also requires the executor to be '/bin/bash' instead
                    # of the default '/bin/sh'.
                    '; exit ${PIPESTATUS[0]}'
                ]
            else:
                command += [f'> {log_path}']
            executable = '/bin/bash'
        return log_lib.run_with_log(' '.join(command),
                                    log_path,
                                    require_outputs=require_outputs,
                                    stream_logs=stream_logs,
                                    process_stream=process_stream,
                                    shell=True,
                                    executable=executable,
                                    **kwargs)

    def rsync(
        self,
        source: str,
        target: str,
        *,
        up: bool,
        # Advanced options.
        log_path: str = os.devnull,
        stream_logs: bool = True,
        max_retry: int = 1,
    ) -> None:
        """Uses 'rsync' to sync 'source' to 'target'.

        Args:
            source: The source path.
            target: The target path.
            up: The direction of the sync, True for local to cluster, False
                for cluster to local.
            log_path: Redirect stdout/stderr to the log_path.
            stream_logs: Stream logs to the stdout/stderr.
            max_retry: The maximum number of retries for the rsync command.
                This value should be non-negative.

        Raises:
            exceptions.CommandError: rsync command failed.
        """
        # Build command.
        # TODO(zhwu): This will print a per-file progress bar (with -P),
        # shooting a lot of messages to the output. --info=progress2 is used
        # to get a total progress bar, but it requires rsync>=3.1.0 and Mac
        # OS has a default rsync==2.6.9 (16 years old).
        rsync_command = ['rsync', RSYNC_DISPLAY_OPTION]
        # --filter: honor .gitignore files found in any synced subdirectory.
        rsync_command.append(RSYNC_FILTER_OPTION)
        if up:
            # The source is a local path, so we need to resolve it.
            # --exclude-from
            resolved_source = pathlib.Path(source).expanduser().resolve()
            if (resolved_source / GIT_EXCLUDE).exists():
                # Ensure file exists; otherwise, rsync will error out.
                rsync_command.append(
                    RSYNC_EXCLUDE_OPTION.format(
                        str(resolved_source / GIT_EXCLUDE)))
        # Reuse the same ssh options (keys, control socket, proxy) as run().
        ssh_options = ' '.join(
            ssh_options_list(
                self.ssh_private_key,
                self.ssh_control_name,
                ssh_proxy_command=self._ssh_proxy_command,
            ))
        rsync_command.append(f'-e "ssh {ssh_options}"')
        # To support spaces in the path, we need to quote source and target.
        # rsync doesn't support '~' in a quoted local path, but it is ok to
        # have '~' in a quoted remote path.
        if up:
            full_source_str = str(resolved_source)
            if resolved_source.is_dir():
                # Append a trailing separator so rsync copies the directory's
                # contents rather than the directory itself.
                full_source_str = os.path.join(full_source_str, '')
            rsync_command.extend([
                f'{full_source_str!r}',
                f'{self.ssh_user}@{self.ip}:{target!r}',
            ])
        else:
            rsync_command.extend([
                f'{self.ssh_user}@{self.ip}:{source!r}',
                f'{os.path.expanduser(target)!r}',
            ])
        command = ' '.join(rsync_command)
        backoff = common_utils.Backoff(initial_backoff=5, max_backoff_factor=5)
        # Retry up to max_retry times (max_retry + 1 attempts in total).
        while max_retry >= 0:
            returncode, _, stderr = log_lib.run_with_log(
                command,
                log_path=log_path,
                stream_logs=stream_logs,
                shell=True,
                require_outputs=True)
            if returncode == 0:
                break
            max_retry -= 1
            # NOTE(review): this also sleeps once after the final failed
            # attempt, before falling through to error handling.
            time.sleep(backoff.current_backoff())
        direction = 'up' if up else 'down'
        error_msg = (f'Failed to rsync {direction}: {source} -> {target}. '
                     'Ensure that the network is stable, then retry.')
        # Raises on non-zero returncode; no-op if the loop broke on success.
        subprocess_utils.handle_returncode(returncode,
                                           command,
                                           error_msg,
                                           stderr=stderr,
                                           stream_logs=stream_logs)