Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

avoid doing it if only one exp #7

Merged
merged 4 commits into from
Apr 21, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions sshlab/sshlab.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@
DEFAULT_CONFIG_FILE = os.getenv("HOME") + "/.sshlab_config.yml"



def cleanup(user, server, process):
# Get the PID of the remote Jupyter process
utils.kill_remote_jupyter(user, server)
Expand Down Expand Up @@ -56,7 +55,7 @@ def main():
ip = config['SSH'].get('ip', '127.0.0.1')

# Check if the specified port is available, and find an available port if it's not
port = specified_port if utils.is_port_available(specified_port) else find_available_port(specified_port)
port = specified_port if utils.is_port_available(specified_port) else utils.find_available_port(specified_port)

# Build the SSH command string
ssh_cmd = f'ssh -L {port}:{ip}:{port} {user}@{server}'
Expand Down
15 changes: 11 additions & 4 deletions sshlab/sshlab_kill.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,15 +2,22 @@
import os
import yaml
from pathlib import Path
from . utils import get_remote_jupyter_pid, kill_remote_jupyter
from . utils import kill_remote_process

DEFAULT_CONFIG_FILE = os.getenv("HOME") + "/.sshlab_config.yml"

def build_argparser():
# Parse command-line arguments
parser = argparse.ArgumentParser()
parser.add_argument('--config', '-c', default=None, help='User configuration name to use (defined in the config file)')
parser.add_argument('--file', '-f', default=DEFAULT_CONFIG_FILE, type=Path, help=f'YAML configuration file, default: {DEFAULT_CONFIG_FILE}')
parser.add_argument('--config', '-c',
default=None,
help='User configuration name to use (defined in the config file)')
parser.add_argument('--file', '-f',
default=DEFAULT_CONFIG_FILE,
type=Path,
help=f'YAML configuration file, default: {DEFAULT_CONFIG_FILE}')
parser.add_argument('--process', '-p', default='jupyter', type=str,
help=f'Process name to kill, default: jupyter')
args = parser.parse_args()
return args

Expand All @@ -35,7 +42,7 @@ def main():
server = config['SSH']['server']

# Kill the remote Jupyter process
kill_remote_jupyter(user, server)
kill_remote_process(user, server, args.process)

if __name__ == '__main__':
main()
113 changes: 113 additions & 0 deletions sshlab/sshtensorboard.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,113 @@
import subprocess
import yaml
import argparse
from pathlib import Path
import os
import signal
import atexit
import tempfile
import shutil

import utils

DEFAULT_CONFIG_FILE = os.getenv("HOME") + "/.sshlab_config.yml"


def cleanup(user, server, process, tmpdir):
    """Tear down the remote TensorBoard session and the local SSH process.

    Args:
        user: Remote SSH user name.
        server: Remote host name.
        process: The local ``subprocess.Popen`` object running the SSH command.
        tmpdir: Remote directory of symlinks created for this session, or
            ``None`` when TensorBoard was pointed directly at an existing
            experiment directory (in which case nothing is deleted).
    """
    # Only delete a directory that this script created itself. When a single
    # experiment is shown, --logdir is the user's real log directory and must
    # never be removed.
    if tmpdir:
        utils.delete_remote_dir(user, server, tmpdir)

    # Stop the remote TensorBoard process
    utils.kill_remote_tensorboard(user, server)

    # Check if the local SSH process is still running
    if process.poll() is None:
        # Terminate the SSH connection
        os.kill(process.pid, signal.SIGTERM)
        print("Closed the SSH connection.")


def build_argparser():
    """Parse the command-line arguments and return the resulting namespace."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--config', '-c', default=None,
                        help='User configuration name to use (defined in the config file)')
    parser.add_argument('--file', '-f', default=DEFAULT_CONFIG_FILE, type=Path,
                        help=f'YAML configuration file, default: {DEFAULT_CONFIG_FILE}')
    parser.add_argument('exp_names', metavar='EXP_NAMES', type=str,
                        help='Comma-separated list of experiment names')
    args = parser.parse_args()
    return args


def main():
    """Start TensorBoard on the remote server through an SSH port forward.

    Reads the selected configuration from the YAML file, builds the remote
    command line, runs it through ``ssh`` and cleans up the remote side when
    the script exits or is interrupted with CTRL+C.
    """
    args = build_argparser()

    with open(args.file, 'r') as f:
        configs = yaml.safe_load(f)

    # Without an explicit --config, fall back to the first entry of the file
    if args.config is None:
        config_name, config = next(iter(configs.items()))
    else:
        config_name = args.config
        config = configs[args.config]

    print(f'Using configuration: {config_name}\n')

    user = config['SSH']['user']
    server = config['SSH']['server']
    specified_port = int(config['SSH'].get('port', '6006'))
    ip = config['SSH'].get('ip', '127.0.0.1')

    # Check if the specified port is available, and find an available port if it's not
    port = specified_port if utils.is_port_available(specified_port) else utils.find_available_port(specified_port)

    # Build the SSH command string
    ssh_cmd = f'ssh -L {port}:{ip}:{port} {user}@{server}'

    # Get the environment settings from the config file
    env_cmd = config['Environment']['cmd']
    env_target = config['Environment']['target']
    env_cmd_options = config['Environment'].get('options', '')

    # Build the tensorboard command string
    tensorboard_cmd = config['Tensorboard']['cmd']
    logdir = config['Tensorboard']['logdir']

    # Tolerate "a, b" and trailing commas; an empty name would otherwise
    # resolve to the whole log directory.
    exp_names = [name.strip() for name in args.exp_names.split(',') if name.strip()]
    if not exp_names:
        raise SystemExit('No experiment names given.')

    exp_dirs = [os.path.join(logdir, exp_name) for exp_name in exp_names]

    if len(exp_dirs) == 1:
        # A single experiment can be served directly: no symlink directory is
        # created, so there is nothing to delete afterwards.
        tb_logdir = exp_dirs[0]
        remote_tmp_dir = None
        symlink_cmd = ""
    else:
        # Passing several "name:path" entries to --logdir did not work, so the
        # experiments are symlinked into one temporary remote directory.
        # Only a unique name is needed locally; the local directory is removed
        # again as soon as the ``with`` block ends.
        with tempfile.TemporaryDirectory() as local_tmp:
            unique_name = os.path.basename(local_tmp)
        remote_tmp_dir = os.path.join(logdir, unique_name)
        tb_logdir = remote_tmp_dir
        symlink_cmd = f'mkdir -p {remote_tmp_dir};'
        symlink_cmd += ";".join(
            f'ln -s {exp_dir} {os.path.join(remote_tmp_dir, os.path.basename(exp_dir))}'
            for exp_dir in exp_dirs
        )
        symlink_cmd += ";"

    remote_cmd = f'{symlink_cmd} {env_cmd} {env_cmd_options} {env_target} {tensorboard_cmd} --logdir={tb_logdir} --bind_all --port {port} '

    # Combine the SSH and remote command strings
    cmd = f'{ssh_cmd} "{remote_cmd}"'

    print(f"\n** Open tensorboard in your browser at: http://{ip}:{port} **\n\n")

    # Launch the command using subprocess.Popen
    process = subprocess.Popen(cmd, shell=True)

    # Register the cleanup function to be called upon script exit
    atexit.register(cleanup, user, server, process, remote_tmp_dir)

    try:
        # Wait for the process to complete
        process.communicate()
    except KeyboardInterrupt:
        print("\nCTRL+C detected. Terminating TensorBoard process...")
        cleanup(user, server, process, remote_tmp_dir)
        # The explicit cleanup succeeded; do not run it a second time at exit
        atexit.unregister(cleanup)


if __name__ == '__main__':
    main()
54 changes: 40 additions & 14 deletions sshlab/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,40 +17,66 @@ def find_available_port(start_port=8888):
return port


# Function to get the PID of the remote Jupyter process
def get_remote_jupyter_pid(user, server):
jupyter_pid_cmd = f"ssh {user}@{server} 'pgrep -f jupyter -u {user}'"
def get_remote_cmd_pid(user, server, cmd):
pid_cmd = f"ssh {user}@{server} 'pgrep -f \"{cmd}\" -u {user}'"
try:
output = subprocess.check_output(jupyter_pid_cmd, shell=True).decode().strip().split('\n')[0].strip()
output = subprocess.check_output(pid_cmd, shell=True).decode().strip().split('\n')[0].strip()
pid = int(output)
except (subprocess.CalledProcessError, ValueError):
pid = None

return pid


def kill_remote_jupyter(user, server):
pid = get_remote_jupyter_pid(user, server)
def get_remote_jupyter_pid(user, server):
    """Look up a remote PID using the pattern ``jupyter``.

    Thin wrapper that forwards to ``get_remote_cmd_pid`` and returns its
    result unchanged.
    """
    return get_remote_cmd_pid(user, server, 'jupyter')


def get_remote_tensorboard_pid(user, server):
    """Look up a remote PID using the pattern ``tensorboard``.

    Thin wrapper that forwards to ``get_remote_cmd_pid`` and returns its
    result unchanged.
    """
    return get_remote_cmd_pid(user, server, 'tensorboard')


def kill_remote_process(user, server, cmd):
pid = get_remote_cmd_pid(user, server, cmd)
print(f"PID {pid}")

if pid:
print(f"Jupyter server PID on remote machine: {pid}")
print(f"Process PID on remote machine: {pid}")
kill_cmd = f"ssh {user}@{server} 'kill -TERM {pid} &> /dev/null'"
subprocess.run(kill_cmd, shell=True)
print(f"Sent termination signal to Jupyter server {pid} on the remote machine.")
print(f"Sent termination signal to process {pid} on the remote machine.")

# Wait for the Jupyter process to terminate
# Wait for the process to terminate
max_retries = 5
retry_count = 0
while retry_count < max_retries:
time.sleep(1)
current_pid = get_remote_jupyter_pid(user, server)
current_pid = get_remote_cmd_pid(user, server, cmd)
if not current_pid:
print(f"Terminated the Jupyter server {pid} on the remote machine.")
print(f"Terminated the process {pid} on the remote machine.")
break
else:
print(f"Jupyter server {pid} still running. Retry count: {retry_count}")
print(f"Process {pid} still running. Retry count: {retry_count}")
retry_count += 1
else:
print(f"Unable to terminate the Jupyter server {pid} on the remote machine.")
print(f"Unable to terminate the process {pid} on the remote machine.")
else:
print("No Jupyter server found on the remote machine.")
print("No process found on the remote machine.")


def kill_remote_jupyter(user, server):
    """Ask ``kill_remote_process`` to stop the process matching ``jupyter``."""
    kill_remote_process(user, server, 'jupyter')


def kill_remote_tensorboard(user, server):
    """Ask ``kill_remote_process`` to stop the process matching ``tensorboard``."""
    kill_remote_process(user, server, 'tensorboard')


def delete_remote_dir(user, server, dir_path):
    """Recursively delete ``dir_path`` on the remote machine over SSH.

    Args:
        user: Remote SSH user name.
        server: Remote host name.
        dir_path: Path of the remote directory to remove. When it is ``None``
            or empty, nothing is executed.
    """
    # A missing path would turn the remote command into ``rm -rf None`` or a
    # bare ``rm -rf``; there is nothing sensible to delete in that case.
    if not dir_path:
        print("No temporary directory to delete on the remote machine.")
        return

    print(f"Deleting temporary directory {dir_path} on the remote machine.")
    # Pass argv as a list so no local shell re-parses the command. The remote
    # shell still interprets the command string, exactly as before.
    subprocess.run(["ssh", f"{user}@{server}", f"rm -rf {dir_path}"])