From d8a51f34fd15dc4fbbe4e8416cd3270f23818c09 Mon Sep 17 00:00:00 2001 From: bmaltais Date: Fri, 26 Apr 2024 07:11:07 -0400 Subject: [PATCH 1/4] Set `max_train_steps` to 0 if not specified in older `.json` config files --- .release | 2 +- README.md | 5 +++++ kohya_gui/common_gui.py | 2 +- test/config/dreambooth-AdamW8bit.json | 2 +- 4 files changed, 8 insertions(+), 3 deletions(-) diff --git a/.release b/.release index 8603b5d5d..e7088bda7 100644 --- a/.release +++ b/.release @@ -1 +1 @@ -v24.0.7 \ No newline at end of file +v24.0.8 \ No newline at end of file diff --git a/README.md b/README.md index 0880392ac..528be4bc2 100644 --- a/README.md +++ b/README.md @@ -47,6 +47,7 @@ The GUI allows you to set the training parameters and generate and run the requi - [SDXL training](#sdxl-training) - [Masked loss](#masked-loss) - [Change History](#change-history) + - [2024/04/26 (v24.0.8)](#20240426-v2408) - [2024/04/25 (v24.0.7)](#20240425-v2407) - [2024/04/22 (v24.0.6)](#20240422-v2406) - [2024/04/19 (v24.0.5)](#20240419-v2405) @@ -453,6 +454,10 @@ ControlNet dataset is used to specify the mask. The mask images should be the RG ## Change History +### 2024/04/26 (v24.0.8) + +- Set `max_train_steps` to 0 if not specified in older `.json` config files. + ### 2024/04/25 (v24.0.7) - Prevent crash if tkinter is not installed diff --git a/kohya_gui/common_gui.py b/kohya_gui/common_gui.py index 2876ab7ae..12947b0a8 100644 --- a/kohya_gui/common_gui.py +++ b/kohya_gui/common_gui.py @@ -364,7 +364,7 @@ def update_my_data(my_data): my_data[key] = int(value) except ValueError: # Handle the case where the string is not a valid float - my_data[key] = int(1600) + my_data[key] = int(0) # Convert values to int if they are strings for key in ["max_token_length"]: diff --git a/test/config/dreambooth-AdamW8bit.json b/test/config/dreambooth-AdamW8bit.json index 48794323e..51f0962d1 100644 --- a/test/config/dreambooth-AdamW8bit.json +++ b/test/config/dreambooth-AdamW8bit.json @@ -53,7 +53,7 @@ "max_timestep": 1000, "max_token_length": 75, "max_train_epochs": 0, - "max_train_steps": 0, + "max_train_steps": "", "mem_eff_attn": false, "min_bucket_reso": 256, "min_snr_gamma": 0, From 0c2c2d4e063e778ee135754ca989bc0da9958e23 Mon Sep 17 00:00:00 2001 From: bmaltais Date: Fri, 26 Apr 2024 10:03:25 -0400 Subject: [PATCH 2/4] Add "Open tensorboard" button for docker containers without tensorflow installed --- kohya_gui/class_tensorboard.py | 142 +++++++++++++++++++++++++++++++++ 1 file changed, 142 insertions(+) diff --git a/kohya_gui/class_tensorboard.py b/kohya_gui/class_tensorboard.py index f28e01f7e..e0594a8ee 100644 --- a/kohya_gui/class_tensorboard.py +++ b/kohya_gui/class_tensorboard.py @@ -130,3 +130,145 @@ def gradio_interface(self): outputs=[button_start_tensorboard, button_stop_tensorboard], show_progress=False, ) +import os +import gradio as gr +import subprocess +import time +import webbrowser + +try: + os.environ["TF_ENABLE_ONEDNN_OPTS"] = "0" + import tensorflow # Attempt to import tensorflow to check if it is installed + + visibility = True +except ImportError: + visibility = False + +from easygui import msgbox +from threading import Thread, Event +from .custom_logging import setup_logging + + +class TensorboardManager: + DEFAULT_TENSORBOARD_PORT = 6006 + + def __init__(self, logging_dir, headless: bool = False, wait_time=5): + self.logging_dir = logging_dir + self.headless = headless + self.wait_time = wait_time + self.tensorboard_proc = None + self.tensorboard_port = os.environ.get( + "TENSORBOARD_PORT", self.DEFAULT_TENSORBOARD_PORT + ) + self.log = setup_logging() + self.thread = None + self.stop_event = Event() + + self.gradio_interface() + + def get_button_states(self, started=False): + return gr.Button( + visible=visibility and (not started or self.headless) + ), gr.Button(visible=visibility and (started or self.headless)) + + def open_tensorboard_url(self): + tensorboard_url = f"http://localhost:{self.tensorboard_port}" + self.log.info(f"Opening TensorBoard URL in browser: {tensorboard_url}") + webbrowser.open(tensorboard_url) + + def start_tensorboard(self, logging_dir=None): + if self.tensorboard_proc is not None: + self.log.info( + "Tensorboard is already running. Terminating existing process before starting new one..." + ) + self.stop_tensorboard() + + if not os.path.exists(logging_dir) or not os.listdir(logging_dir): + self.log.error( + "Error: logging folder does not exist or does not contain logs." + ) + msgbox(msg="Error: logging folder does not exist or does not contain logs.") + return self.get_button_states(started=False) + + run_cmd = [ + "tensorboard", + "--logdir", + logging_dir, + "--host", + "0.0.0.0", + "--port", + str(self.tensorboard_port), + ] + + self.log.info(run_cmd) + + self.log.info("Starting TensorBoard on port {}".format(self.tensorboard_port)) + try: + env = os.environ.copy() + env["TF_ENABLE_ONEDNN_OPTS"] = "0" + self.tensorboard_proc = subprocess.Popen(run_cmd, env=env) + except Exception as e: + self.log.error("Failed to start Tensorboard:", e) + return self.get_button_states(started=False) + + if not self.headless: + self.stop_event.clear() + + time.sleep(self.wait_time) + if not self.stop_event.is_set(): + self.thread = Thread(target=self.open_tensorboard_url) + self.thread.start() + + return self.get_button_states(started=True) + + def stop_tensorboard(self): + if self.tensorboard_proc is not None: + self.log.info("Stopping tensorboard process...") + try: + self.tensorboard_proc.terminate() + self.tensorboard_proc = None + self.log.info("...process stopped") + except Exception as e: + self.log.error("Failed to stop Tensorboard:", e) + + if self.thread is not None: + self.stop_event.set() + self.thread.join() # Wait for the thread to finish + self.thread = None + self.log.info("Thread terminated successfully.") + + return self.get_button_states(started=False) + + def gradio_interface(self): + + with gr.Row(): + button_start_tensorboard = gr.Button( + value="Start tensorboard", + elem_id="myTensorButton", + visible=visibility, + ) + button_stop_tensorboard = gr.Button( + value="Stop tensorboard", + visible=visibility and self.headless, + elem_id="myTensorButtonStop", + ) + button_open_tensorboard = gr.Button( + value="Open tensorboard", + elem_id="myTensorButton", + visible=not visibility, + ) + button_start_tensorboard.click( + self.start_tensorboard, + inputs=[self.logging_dir], + outputs=[button_start_tensorboard, button_stop_tensorboard], + show_progress=False, + ) + button_stop_tensorboard.click( + self.stop_tensorboard, + outputs=[button_start_tensorboard, button_stop_tensorboard], + show_progress=False, + ) + button_open_tensorboard.click( + self.open_tensorboard_url, + show_progress=False, + ) From f2c3b6234c796e2bc26cf6f7903750023dcbbec4 Mon Sep 17 00:00:00 2001 From: bmaltais Date: Fri, 26 Apr 2024 10:25:03 -0400 Subject: [PATCH 3/4] Update README --- README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 528be4bc2..00f977eb2 100644 --- a/README.md +++ b/README.md @@ -106,7 +106,7 @@ To set up the project, follow these steps: 2. Clone the repository by running the following command: ```shell - git clone https://github.com/bmaltais/kohya_ss.git + git clone --recursive https://github.com/bmaltais/kohya_ss.git ``` 3. Change into the `kohya_ss` directory: @@ -154,7 +154,7 @@ To set up the project on Linux or macOS, perform the following steps: 2. Clone the repository by running the following command: ```shell - git clone https://github.com/bmaltais/kohya_ss.git + git clone --recursive https://github.com/bmaltais/kohya_ss.git ``` 3. Change into the `kohya_ss` directory: @@ -199,7 +199,7 @@ To install the necessary components for Runpod and run kohya_ss, follow these st ```shell cd /workspace - git clone https://github.com/bmaltais/kohya_ss.git + git clone --recursive https://github.com/bmaltais/kohya_ss.git ``` 4. Run the setup script: @@ -267,7 +267,7 @@ Install the NVIDIA Container Toolkit with this guide. #### Use the pre-built Docker image ```bash -git clone https://github.com/bmaltais/kohya_ss.git +git clone --recursive https://github.com/bmaltais/kohya_ss.git cd kohya_ss docker compose up -d ``` From 074de82dc5fdd94d26c45c7365c46c8ac58c4d1c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E9=99=B3=E9=88=9E?= Date: Sat, 27 Apr 2024 07:43:37 +0800 Subject: [PATCH 4/4] chore(docker): Configure TensorBoard port through .env file (#2397) * chore(docker): Configure TensorBoard port through .env file - Added a new `.env` file to specify the TensorBoard port - Updated the `docker-compose.yaml` file to import the TensorBoard port from the `.env` file - Adjusted the tensorboard service in `docker-compose.yaml` to make the port configurable via an environment variable - Added a comment in `docker-compose.yaml` to encourage changing the port in the `.env` file instead of the docker-compose file itself * fix: the `Open tensorboard` button is not working in headless environment Use the gradio builtin feature instead. - In `class_tensorboard.py`, the "Open tensorboard" button now directly links to the tensorboard URL instead of calling the `open_tensorboard_url` function when clicked. --- .env | 1 + docker-compose.yaml | 4 +- kohya_gui/class_tensorboard.py | 137 +-------------------------------- 3 files changed, 5 insertions(+), 137 deletions(-) create mode 100644 .env diff --git a/.env b/.env new file mode 100644 index 000000000..52d0322b2 --- /dev/null +++ b/.env @@ -0,0 +1 @@ +TENSORBOARD_PORT=6006 diff --git a/docker-compose.yaml b/docker-compose.yaml index ddd20f227..4932bcee2 100644 --- a/docker-compose.yaml +++ b/docker-compose.yaml @@ -15,6 +15,7 @@ services: - 7860:7860 environment: SAFETENSORS_FAST_GPU: 1 + TENSORBOARD_PORT: ${TENSORBOARD_PORT:-6006} tmpfs: - /tmp volumes: @@ -42,7 +43,8 @@ services: container_name: tensorboard image: tensorflow/tensorflow:latest-gpu ports: - - 6006:6006 + # !Please change the port in .env file + - ${TENSORBOARD_PORT:-6006}:6006 volumes: - ./dataset/logs:/app/logs command: tensorboard --logdir=/app/logs --bind_all diff --git a/kohya_gui/class_tensorboard.py b/kohya_gui/class_tensorboard.py index e0594a8ee..5e95ebf53 100644 --- a/kohya_gui/class_tensorboard.py +++ b/kohya_gui/class_tensorboard.py @@ -17,138 +17,6 @@ from .custom_logging import setup_logging -class TensorboardManager: - DEFAULT_TENSORBOARD_PORT = 6006 - - def __init__(self, logging_dir, headless: bool = False, wait_time=5): - self.logging_dir = logging_dir - self.headless = headless - self.wait_time = wait_time - self.tensorboard_proc = None - self.tensorboard_port = os.environ.get( - "TENSORBOARD_PORT", self.DEFAULT_TENSORBOARD_PORT - ) - self.log = setup_logging() - self.thread = None - self.stop_event = Event() - - self.gradio_interface() - - def get_button_states(self, started=False): - return gr.Button( - visible=visibility and (not started or self.headless) - ), gr.Button(visible=visibility and (started or self.headless)) - - def start_tensorboard(self, logging_dir=None): - if self.tensorboard_proc is not None: - self.log.info( - "Tensorboard is already running. Terminating existing process before starting new one..." - ) - self.stop_tensorboard() - - if not os.path.exists(logging_dir) or not os.listdir(logging_dir): - self.log.error( - "Error: logging folder does not exist or does not contain logs." - ) - msgbox(msg="Error: logging folder does not exist or does not contain logs.") - return self.get_button_states(started=False) - - run_cmd = [ - "tensorboard", - "--logdir", - logging_dir, - "--host", - "0.0.0.0", - "--port", - str(self.tensorboard_port), - ] - - self.log.info(run_cmd) - - self.log.info("Starting TensorBoard on port {}".format(self.tensorboard_port)) - try: - env = os.environ.copy() - env["TF_ENABLE_ONEDNN_OPTS"] = "0" - self.tensorboard_proc = subprocess.Popen(run_cmd, env=env) - except Exception as e: - self.log.error("Failed to start Tensorboard:", e) - return self.get_button_states(started=False) - - def open_tensorboard_url(): - time.sleep(self.wait_time) - if not self.stop_event.is_set(): - tensorboard_url = f"http://localhost:{self.tensorboard_port}" - self.log.info(f"Opening TensorBoard URL in browser: {tensorboard_url}") - webbrowser.open(tensorboard_url) - - if not self.headless: - self.stop_event.clear() - self.thread = Thread(target=open_tensorboard_url) - self.thread.start() - - return self.get_button_states(started=True) - - def stop_tensorboard(self): - if self.tensorboard_proc is not None: - self.log.info("Stopping tensorboard process...") - try: - self.tensorboard_proc.terminate() - self.tensorboard_proc = None - self.log.info("...process stopped") - except Exception as e: - self.log.error("Failed to stop Tensorboard:", e) - - if self.thread is not None: - self.stop_event.set() - self.thread.join() # Wait for the thread to finish - self.thread = None - self.log.info("Thread terminated successfully.") - - return self.get_button_states(started=False) - - def gradio_interface(self): - - with gr.Row(): - button_start_tensorboard = gr.Button( - value="Start tensorboard", - elem_id="myTensorButton", - visible=visibility, - ) - button_stop_tensorboard = gr.Button( - value="Stop tensorboard", - visible=visibility and self.headless, - elem_id="myTensorButtonStop", - ) - button_start_tensorboard.click( - self.start_tensorboard, - inputs=[self.logging_dir], - outputs=[button_start_tensorboard, button_stop_tensorboard], - show_progress=False, - ) - button_stop_tensorboard.click( - self.stop_tensorboard, - outputs=[button_start_tensorboard, button_stop_tensorboard], - show_progress=False, - ) -import os -import gradio as gr -import subprocess -import time -import webbrowser - -try: - os.environ["TF_ENABLE_ONEDNN_OPTS"] = "0" - import tensorflow # Attempt to import tensorflow to check if it is installed - - visibility = True -except ImportError: - visibility = False - -from easygui import msgbox -from threading import Thread, Event -from .custom_logging import setup_logging - - class TensorboardManager: DEFAULT_TENSORBOARD_PORT = 6006 @@ -256,6 +124,7 @@ def gradio_interface(self): value="Open tensorboard", elem_id="myTensorButton", visible=not visibility, + link=f"http://localhost:{self.tensorboard_port}", ) button_start_tensorboard.click( self.start_tensorboard, @@ -268,7 +137,3 @@ def gradio_interface(self): outputs=[button_start_tensorboard, button_stop_tensorboard], show_progress=False, ) - button_open_tensorboard.click( - self.open_tensorboard_url, - show_progress=False, - )