Merge pull request #2227 from bmaltais/dev

v23.1.2
bmaltais · Apr 8, 2024 · aa9fcf3 · aa9fcf3
2 parents c839fad + ff70a1e
commit aa9fcf3
Show file tree

Hide file tree

Showing 7 changed files with 181 additions and 102 deletions.
diff --git a/.release b/.release
@@ -1 +1 @@
-v23.1.1
+v23.1.2
diff --git a/README.md b/README.md
@@ -42,6 +42,7 @@ The GUI allows you to set the training parameters and generate and run the requi
   - [SDXL training](#sdxl-training)
   - [Masked loss](#masked-loss)
   - [Change History](#change-history)
+    - [2024/04/08 (v23.1.2)](#20240408-v2312)
     - [2024/04/07 (v23.1.1)](#20240407-v2311)
     - [2024/04/07 (v23.1.0)](#20240407-v2310)
     - [2024/03/21 (v23.0.15)](#20240321-v23015)
@@ -404,6 +405,10 @@ ControlNet dataset is used to specify the mask. The mask images should be the RG
 
 ## Change History
 
+### 2024/04/08 (v23.1.2)
+
+- Added config.toml support for wd14_caption.
+
 ### 2024/04/07 (v23.1.1)
 
 - Added support for Huber loss under the Parameters / Advanced tab.

diff --git a/config example.toml b/config example.toml
@@ -135,3 +135,27 @@ sample_sampler = "euler_a" # Sampler to use for image sampling
 [sdxl]
 sdxl_cache_text_encoder_outputs = false # Cache text encoder outputs
 sdxl_no_half_vae = true                 # No half VAE
+
+[wd14_caption]
+always_first_tags = ""                        # comma-separated list of tags to always put at the beginning, e.g. 1girl,1boy
+append_tags = false                           # Append tags to existing caption files instead of overwriting them
+batch_size = 8                                # Batch size
+caption_extension = ".txt"                    # Extension for caption file (e.g., .caption, .txt)
+caption_separator = ", "                      # Caption Separator
+character_tag_expand = false                  # Expand tag tail parenthesis to another tag for character tags. `chara_name_(series)` becomes `chara_name, series`
+character_threshold = 0.35                    # Character threshold
+debug = false                                 # Debug mode
+force_download = false                        # Force model re-download when switching to onnx
+frequency_tags = false                        # Show frequency of tags for images
+general_threshold = 0.35                      # General threshold
+max_data_loader_n_workers = 2                 # Max dataloader workers
+onnx = true                                   # ONNX
+recursive = false                             # Recursive
+remove_underscore = false                     # Remove underscore
+repo_id = "SmilingWolf/wd-convnext-tagger-v3" # Repo id for wd14 tagger on Hugging Face
+tag_replacement = ""                          # Tag replacement in the format of `source1,target1;source2,target2; ...`. Escape `,` and `;` with `\`. e.g. `tag1,tag2;tag3,tag4`
+thresh = 0.36                                 # Threshold
+train_data_dir = ""                           # Image folder to caption (containing the images to caption)
+undesired_tags = ""                           # comma-separated list of tags to remove, e.g. 1girl,1boy
+use_rating_tags = false                       # Use rating tags
+use_rating_tags_as_last_tag = false           # Add rating tags as the last tag
diff --git a/kohya_gui.py b/kohya_gui.py
@@ -64,6 +64,7 @@ def UI(**kwargs):
                 logging_dir_input=logging_dir_input,
                 enable_copy_info_button=True,
                 headless=headless,
+                config=config,
             )
             with gr.Tab("LoRA"):
                 _ = LoRATools(headless=headless)

diff --git a/kohya_gui/blip2_caption_gui.py b/kohya_gui/blip2_caption_gui.py
@@ -70,6 +70,7 @@ def generate_caption(
     max_new_tokens=40,
     min_new_tokens=20,
     do_sample=True,
+    temperature=1.0,
     top_p=0.0,
 ):
     """
@@ -108,6 +109,7 @@ def generate_caption(
                 top_p=top_p,
                 max_new_tokens=max_new_tokens,
                 min_new_tokens=min_new_tokens,
+                temperature=temperature,
             )
 
         generated_text = processor.batch_decode(
@@ -154,7 +156,7 @@ def caption_images_beam_search(
         model=model,
         device=device,
         num_beams=int(num_beams),
-        repetition_penalty=repetition_penalty,
+        repetition_penalty=float(repetition_penalty),
         length_penalty=length_penalty,
         min_new_tokens=int(min_new_tokens),
         max_new_tokens=int(max_new_tokens),
@@ -165,6 +167,7 @@ def caption_images_beam_search(
 def caption_images_nucleus(
     directory_path,
     do_sample,
+    temperature,
     top_p,
     min_new_tokens,
     max_new_tokens,
@@ -190,6 +193,7 @@ def caption_images_nucleus(
         model=model,
         device=device,
         do_sample=do_sample,
+        temperature=temperature,
         top_p=top_p,
         min_new_tokens=int(min_new_tokens),
         max_new_tokens=int(max_new_tokens),
@@ -278,16 +282,6 @@ def list_train_dirs(path):
                         label="Number of beams",
                     )
 
-                    temperature = gr.Slider(
-                        minimum=0.5,
-                        maximum=1.0,
-                        value=1.0,
-                        step=0.1,
-                        interactive=True,
-                        label="Temperature",
-                        info="used with nucleus sampling",
-                    )
-
                     len_penalty = gr.Slider(
                         minimum=-1.0,
                         maximum=2.0,
@@ -326,6 +320,16 @@ def list_train_dirs(path):
             with gr.Tab("Nucleus sampling"):
                 with gr.Row():
                     do_sample = gr.Checkbox(label="Sample", value=True)
+
+                    temperature = gr.Slider(
+                        minimum=0.5,
+                        maximum=1.0,
+                        value=1.0,
+                        step=0.1,
+                        interactive=True,
+                        label="Temperature",
+                        info="used with nucleus sampling",
+                    )
 
                     top_p = gr.Slider(
                         minimum=-0,
@@ -344,6 +348,7 @@ def list_train_dirs(path):
                     inputs=[
                         directory_path_dir,
                         do_sample,
+                        temperature,
                         top_p,
                         min_new_tokens,
                         max_new_tokens,

diff --git a/kohya_gui/utilities.py b/kohya_gui/utilities.py
@@ -18,13 +18,14 @@ def utilities_tab(
     enable_copy_info_button=bool(False),
     enable_dreambooth_tab=True,
     headless=False,
+    config: dict = {},
 ):
     with gr.Tab("Captioning"):
         gradio_basic_caption_gui_tab(headless=headless)
         gradio_blip_caption_gui_tab(headless=headless)
         gradio_blip2_caption_gui_tab(headless=headless)
         gradio_git_caption_gui_tab(headless=headless)
-        gradio_wd14_caption_gui_tab(headless=headless)
+        gradio_wd14_caption_gui_tab(headless=headless, config=config)
         gradio_manual_caption_gui_tab(headless=headless)
     gradio_convert_model_tab(headless=headless)
     gradio_group_images_gui_tab(headless=headless)