Merge pull request #1815 from bmaltais/dev
v22.4.0
bmaltais committed Dec 28, 2023
2 parents 41009ae + a2c18df commit 89cfc46
Showing 28 changed files with 599 additions and 356 deletions.
2 changes: 1 addition & 1 deletion .release
@@ -1 +1 @@
v22.3.1
v22.4.0
15 changes: 15 additions & 0 deletions README.md
@@ -651,6 +651,21 @@ masterpiece, best quality, 1boy, in business suit, standing at street, looking b


## Change History
* 2023/12/28 (v22.4.0)
- Fixed `tools/convert_diffusers20_original_sd.py` so that it works again. Thanks to Disty0! PR [#1016](https://github.com/kohya-ss/sd-scripts/pull/1016)
- Fixed issues in multi-GPU training. Thanks to Isotr0py! PR [#989](https://github.com/kohya-ss/sd-scripts/pull/989) and [#1000](https://github.com/kohya-ss/sd-scripts/pull/1000)
  - `--ddp_gradient_as_bucket_view` and `--ddp_bucket_view` options are added to `sdxl_train.py`. Specify these options for multi-GPU training.
- IPEX support is updated. Thanks to Disty0!
- Fixed a bug where the bucket size could become smaller than `min_bucket_reso`. Thanks to Cauldrath! PR [#1008](https://github.com/kohya-ss/sd-scripts/pull/1008)
- `--sample_at_first` option is added to each training script. This option is useful for generating sample images before training starts. Thanks to shirayu! PR [#907](https://github.com/kohya-ss/sd-scripts/pull/907)
- `--ss` option is added to the sampling prompt in training. You can specify the scheduler for sampling, e.g. `--ss euler_a`. Thanks to shirayu! PR [#906](https://github.com/kohya-ss/sd-scripts/pull/906)
- `keep_tokens_separator` is added to the dataset config. This option is useful for keeping tokens in the captions from being shuffled (see the sketch after this list). See [#975](https://github.com/kohya-ss/sd-scripts/pull/975) for details. Thanks to Linaqruf!
  - You can specify the separator with an option like `--keep_tokens_separator "|||"` or with `keep_tokens_separator: "|||"` in `.toml`. The tokens before `|||` are not shuffled.
- Attention processor hook is added. See [#961](https://github.com/kohya-ss/sd-scripts/pull/961) for details. Thanks to rockerBOO!
- The optimizer `PagedAdamW` is added. Thanks to xzuyn! PR [#955](https://github.com/kohya-ss/sd-scripts/pull/955)
- NaN replacement in the SDXL VAE is sped up. Thanks to liubo0902! PR [#1009](https://github.com/kohya-ss/sd-scripts/pull/1009)
- Fixed a path error in `finetune/make_captions.py`. Thanks to CjangCjengh! PR [#986](https://github.com/kohya-ss/sd-scripts/pull/986)
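
The intended `keep_tokens_separator` behavior can be pictured with a small sketch. This is only an illustrative approximation of the described semantics (tags before `|||` stay in place, the rest may be shuffled), not the actual implementation in the training scripts; the helper name is made up here.

```python
import random

def shuffle_caption_keeping_prefix(caption: str, separator: str = "|||") -> str:
    # Tags before the separator stay in place; only the remainder is shuffled.
    if separator in caption:
        kept, rest = caption.split(separator, 1)
    else:
        kept, rest = "", caption
    kept_tags = [t.strip() for t in kept.split(",") if t.strip()]
    rest_tags = [t.strip() for t in rest.split(",") if t.strip()]
    random.shuffle(rest_tags)
    return ", ".join(kept_tags + rest_tags)

# "1girl, solo" always leads; the remaining tags change order between epochs.
print(shuffle_caption_keeping_prefix("1girl, solo ||| red hair, smile, outdoors"))
```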

* 2023/12/20 (v22.3.1)
- Add goto button to manual caption utility
- Add missing options for various LyCORIS training algorithms

8 changes: 4 additions & 4 deletions README_中文教程.md
@@ -51,11 +51,11 @@ Enter your choice: 2

When related articles are available in this repository or on note.com, please refer to them. (They may all be moved here in the future.)

* [Training, common guide](./docs/train_README-ja.md): data preparation, options, etc.
* [Training, common guide](./docs/train_README-zh.md): data preparation, options, etc.
* [Dataset configuration](./docs/config_README-ja.md)
* [DreamBooth training guide](./docs/train_db_README-ja.md)
* [DreamBooth training guide](./docs/train_db_README-zh.md)
* [Fine-tuning guide](./docs/fine_tune_README_ja.md)
* [LoRA training guide](./docs/train_network_README-ja.md)
* [LoRA training guide](./docs/train_network_README-zh.md)
* [Textual Inversion training guide](./docs/train_ti_README-ja.md)
* [Image generation script](./docs/gen_img_README-ja.md)
* note.com [Model conversion script](https://note.com/kohya_ss/n/n374f316fe4ad)
@@ -185,4 +185,4 @@ The LoRA implementation is based on [cloneofsimo's repository](https://github.com/cloneofsimo/lora).

[bitsandbytes](https://github.com/TimDettmers/bitsandbytes): MIT

[BLIP](https://github.com/salesforce/BLIP): BSD-3-Clause
[BLIP](https://github.com/salesforce/BLIP): BSD-3-Clause
4 changes: 4 additions & 0 deletions docs/train_README-ja.md
@@ -374,6 +374,10 @@ When there is one class and multiple targets, the regularization image folder...

Specifies the number of steps or epochs between sample outputs. Samples are generated at each such interval. If both are specified, the epoch count takes precedence.

- `--sample_at_first`

Generates sample output before training starts, allowing comparison with the state before training.

- `--sample_prompts`

Specifies the file containing prompts for sample output.
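
As a concrete illustration of the options above, a one-line prompt file combined with the new `--ss` scheduler option might look like the sketch below. The other per-prompt options (`--w`, `--h`, `--s`, `--l`) are assumptions based on the usual prompt-file syntax, not something introduced in this commit.

```python
# Hypothetical sketch: write a one-line prompt file for --sample_prompts.
# --ss is the scheduler option from this release; --w/--h/--s/--l (size, steps,
# CFG scale) are assumed from the existing prompt-file syntax and may differ.
prompt_line = "masterpiece, best quality, 1girl, standing --w 512 --h 512 --s 20 --l 7 --ss euler_a\n"
with open("sample_prompts.txt", "w", encoding="utf-8") as f:
    f.write(prompt_line)

# The file is then passed to a training script, e.g.
#   --sample_prompts sample_prompts.txt --sample_every_n_epochs 1 --sample_at_first
```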
6 changes: 3 additions & 3 deletions fine_tune.py
@@ -253,9 +253,6 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module):
else:
unet, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(unet, optimizer, train_dataloader, lr_scheduler)

# transform DDP after prepare
text_encoder, unet = train_util.transform_if_model_is_DDP(text_encoder, unet)

# Experimental feature: perform fp16 training including gradients; patch PyTorch to enable grad scaling in fp16
if args.full_fp16:
train_util.patch_accelerator_for_fp16_training(accelerator)
@@ -298,6 +295,9 @@ def fn_recursive_set_mem_eff(module: torch.nn.Module):
init_kwargs = toml.load(args.log_tracker_config)
accelerator.init_trackers("finetuning" if args.log_tracker_name is None else args.log_tracker_name, init_kwargs=init_kwargs)

# For --sample_at_first
train_util.sample_images(accelerator, args, 0, global_step, accelerator.device, vae, tokenizer, text_encoder, unet)

loss_recorder = train_util.LossRecorder()
for epoch in range(num_train_epochs):
accelerator.print(f"\nepoch {epoch+1}/{num_train_epochs}")
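For context on the multi-GPU changes touched here and the `--ddp_gradient_as_bucket_view`-style options noted in the changelog, such flags are typically forwarded to torch's DistributedDataParallel through Accelerate's kwargs handler. The sketch below shows that general pattern under that assumption; it is not the code added in this commit.

```python
# Hedged sketch: forwarding a DDP-related flag to DistributedDataParallel
# via Accelerate's kwargs handler (the usual mechanism for such options).
from accelerate import Accelerator
from accelerate.utils import DistributedDataParallelKwargs

ddp_kwargs = DistributedDataParallelKwargs(gradient_as_bucket_view=True)
accelerator = Accelerator(kwargs_handlers=[ddp_kwargs])
# Models passed through accelerator.prepare(...) are then wrapped in DDP with
# gradient_as_bucket_view=True when running on multiple GPUs.
```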
2 changes: 2 additions & 0 deletions finetune/make_captions.py
@@ -76,6 +76,8 @@ def main(args):
cwd = os.getcwd()
print("Current Working Directory is: ", cwd)
os.chdir("finetune")
if not is_url(args.caption_weights) and not os.path.isfile(args.caption_weights):
args.caption_weights = os.path.join("..", args.caption_weights)

print(f"load images from {args.train_data_dir}")
train_data_dir_path = Path(args.train_data_dir)
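The two added lines handle the case where the script has already changed into the `finetune` directory but was given a weights path relative to the repository root. A standalone illustration of that resolution logic follows; the helper name and the `is_url` callable are hypothetical stand-ins for what the script uses internally.

```python
import os

def resolve_caption_weights(path: str, is_url) -> str:
    # Mirrors the added check: after the cwd becomes "finetune", a relative
    # weights path that no longer resolves is re-based one directory up.
    if not is_url(path) and not os.path.isfile(path):
        return os.path.join("..", path)
    return path

# Example: a root-relative path is rewritten to "../models/blip.pth".
print(resolve_caption_weights("models/blip.pth", is_url=lambda p: p.startswith("http")))
```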
3 changes: 3 additions & 0 deletions library/config_util.py
@@ -53,6 +53,7 @@ class BaseSubsetParams:
shuffle_caption: bool = False
caption_separator: str = ','
keep_tokens: int = 0
keep_tokens_separator: Optional[str] = None
color_aug: bool = False
flip_aug: bool = False
face_crop_aug_range: Optional[Tuple[float, float]] = None
@@ -160,6 +161,7 @@ def __validate_and_convert_scalar_or_twodim(klass, value: Union[float, Sequence]
"random_crop": bool,
"shuffle_caption": bool,
"keep_tokens": int,
"keep_tokens_separator": str,
"token_warmup_min": int,
"token_warmup_step": Any(float,int),
"caption_prefix": str,
@@ -461,6 +463,7 @@ def generate_dataset_group_by_blueprint(dataset_group_blueprint: DatasetGroupBlu
num_repeats: {subset.num_repeats}
shuffle_caption: {subset.shuffle_caption}
keep_tokens: {subset.keep_tokens}
keep_tokens_separator: {subset.keep_tokens_separator}
caption_dropout_rate: {subset.caption_dropout_rate}
caption_dropout_every_n_epochs: {subset.caption_dropout_every_n_epochs}
caption_tag_dropout_rate: {subset.caption_tag_dropout_rate}
34 changes: 14 additions & 20 deletions library/ipex/__init__.py
@@ -4,13 +4,12 @@
import torch
import intel_extension_for_pytorch as ipex # pylint: disable=import-error, unused-import
from .hijacks import ipex_hijacks
from .attention import attention_init

# pylint: disable=protected-access, missing-function-docstring, line-too-long

def ipex_init(): # pylint: disable=too-many-statements
try:
#Replace cuda with xpu:
# Replace cuda with xpu:
torch.cuda.current_device = torch.xpu.current_device
torch.cuda.current_stream = torch.xpu.current_stream
torch.cuda.device = torch.xpu.device
@@ -30,6 +29,7 @@ def ipex_init(): # pylint: disable=too-many-statements
torch.cuda.FloatTensor = torch.xpu.FloatTensor
torch.Tensor.cuda = torch.Tensor.xpu
torch.Tensor.is_cuda = torch.Tensor.is_xpu
torch.UntypedStorage.cuda = torch.UntypedStorage.xpu
torch.cuda._initialization_lock = torch.xpu.lazy_init._initialization_lock
torch.cuda._initialized = torch.xpu.lazy_init._initialized
torch.cuda._lazy_seed_tracker = torch.xpu.lazy_init._lazy_seed_tracker
@@ -90,9 +90,9 @@ def ipex_init(): # pylint: disable=too-many-statements
torch.cuda.CharStorage = torch.xpu.CharStorage
torch.cuda.__file__ = torch.xpu.__file__
torch.cuda._is_in_bad_fork = torch.xpu.lazy_init._is_in_bad_fork
#torch.cuda.is_current_stream_capturing = torch.xpu.is_current_stream_capturing
# torch.cuda.is_current_stream_capturing = torch.xpu.is_current_stream_capturing

#Memory:
# Memory:
torch.cuda.memory = torch.xpu.memory
if 'linux' in sys.platform and "WSL2" in os.popen("uname -a").read():
torch.xpu.empty_cache = lambda: None
@@ -112,7 +112,7 @@ def ipex_init(): # pylint: disable=too-many-statements
torch.cuda.memory_stats_as_nested_dict = torch.xpu.memory_stats_as_nested_dict
torch.cuda.reset_accumulated_memory_stats = torch.xpu.reset_accumulated_memory_stats

#RNG:
# RNG:
torch.cuda.get_rng_state = torch.xpu.get_rng_state
torch.cuda.get_rng_state_all = torch.xpu.get_rng_state_all
torch.cuda.set_rng_state = torch.xpu.set_rng_state
@@ -123,7 +123,7 @@ def ipex_init(): # pylint: disable=too-many-statements
torch.cuda.seed_all = torch.xpu.seed_all
torch.cuda.initial_seed = torch.xpu.initial_seed

#AMP:
# AMP:
torch.cuda.amp = torch.xpu.amp
if not hasattr(torch.cuda.amp, "common"):
torch.cuda.amp.common = contextlib.nullcontext()
@@ -138,12 +138,12 @@ def ipex_init(): # pylint: disable=too-many-statements
except Exception: # pylint: disable=broad-exception-caught
torch.cuda.amp.GradScaler = ipex.cpu.autocast._grad_scaler.GradScaler

#C
# C
torch._C._cuda_getCurrentRawStream = ipex._C._getCurrentStream
ipex._C._DeviceProperties.major = 2023
ipex._C._DeviceProperties.minor = 2

#Fix functions with ipex:
# Fix functions with ipex:
torch.cuda.mem_get_info = lambda device=None: [(torch.xpu.get_device_properties(device).total_memory - torch.xpu.memory_reserved(device)), torch.xpu.get_device_properties(device).total_memory]
torch._utils._get_available_device_type = lambda: "xpu"
torch.has_cuda = True
@@ -156,20 +156,14 @@ def ipex_init(): # pylint: disable=too-many-statements
torch.cuda.get_device_properties.minor = 7
torch.cuda.ipc_collect = lambda *args, **kwargs: None
torch.cuda.utilization = lambda *args, **kwargs: 0
if hasattr(torch.xpu, 'getDeviceIdListForCard'):
torch.cuda.getDeviceIdListForCard = torch.xpu.getDeviceIdListForCard
torch.cuda.get_device_id_list_per_card = torch.xpu.getDeviceIdListForCard
else:
torch.cuda.getDeviceIdListForCard = torch.xpu.get_device_id_list_per_card
torch.cuda.get_device_id_list_per_card = torch.xpu.get_device_id_list_per_card

ipex_hijacks()
attention_init()
try:
from .diffusers import ipex_diffusers
ipex_diffusers()
except Exception: # pylint: disable=broad-exception-caught
pass
if not torch.xpu.has_fp64_dtype():
try:
from .diffusers import ipex_diffusers
ipex_diffusers()
except Exception: # pylint: disable=broad-exception-caught
pass
except Exception as e:
return False, e
return True, None
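
A minimal usage sketch for the initializer above, assuming Intel Extension for PyTorch is installed and the module is imported from this repository's `library/ipex` package:

```python
import torch
from library.ipex import ipex_init  # package path as laid out in this repository

ok, err = ipex_init()  # returns (True, None) on success, (False, exception) on failure
if not ok:
    print(f"IPEX initialization failed: {err}")
elif torch.xpu.is_available():
    # After a successful init, CUDA-style calls are aliased to their XPU equivalents.
    x = torch.randn(2, 2, device="xpu")
```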