Merge pull request #1995 from bmaltais/dev

v22.6.2
bmaltais · Feb 24, 2024 · 5d77bf4 · 5d77bf4
2 parents f8d2673 + 822d94c
commit 5d77bf4
Show file tree

Hide file tree

Showing 81 changed files with 6,855 additions and 1,791 deletions.
diff --git a/.dockerignore b/.dockerignore
@@ -5,3 +5,11 @@ bitsandbytes_windows_deprecated/
 dataset/
 __pycache__/
 venv/
+**/.hadolint.yml
+**/*.log
+**/.git
+**/.gitignore
+**/.env
+**/.github
+**/.vscode
+**/*.ps1
diff --git a/.hadolint.yml b/.hadolint.yml
@@ -0,0 +1,6 @@
+ignored:
+  - DL3042 # Avoid use of cache directory with pip. Use `pip install --no-cache-dir <package>`
+  - DL3013 # Pin versions in pip. Instead of `pip install <package>` use `pip install <package>==<version>`
+  - DL3008 # Pin versions in apt get install. Instead of `apt-get install <package>` use `apt-get install <package>=<version>`
+  - DL4006 # Set the SHELL option -o pipefail before RUN with a pipe in it
+  - SC2015 # Note that A && B || C is not if-then-else. C may run when A is true.
diff --git a/Dockerfile b/Dockerfile
@@ -1,54 +1,118 @@
-FROM nvcr.io/nvidia/pytorch:23.04-py3 as base
-ENV DEBIAN_FRONTEND=noninteractive
-ENV TZ=Europe/London
-
-RUN apt update && apt-get install -y software-properties-common
-RUN --mount=type=cache,target=/var/cache/apt,sharing=locked \
-    --mount=type=cache,target=/var/lib/apt,sharing=locked \
-    add-apt-repository ppa:deadsnakes/ppa && \
-    apt update && \
-    apt-get install -y git curl libgl1 libglib2.0-0 libgoogle-perftools-dev \
-    python3.10-dev python3.10-tk python3-html5lib python3-apt python3-pip python3.10-distutils && \
-    rm -rf /var/lib/apt/lists/*
+# syntax=docker/dockerfile:1
+ARG UID=1000
+ARG VERSION=EDGE
+ARG RELEASE=0
 
-# Set python 3.10 and cuda 11.8 as default
-RUN update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.10 3 && \
-    update-alternatives --set python3 /usr/bin/python3.10 && \
-    update-alternatives --set cuda /usr/local/cuda-11.8
+FROM python:3.10-slim as build
 
-RUN curl -sS https://bootstrap.pypa.io/get-pip.py | python3
+# RUN mount cache for multi-arch: https://github.com/docker/buildx/issues/549#issuecomment-1788297892
+ARG TARGETARCH
+ARG TARGETVARIANT
 
 WORKDIR /app
-RUN --mount=type=cache,target=/root/.cache/pip python3 -m pip install wheel
 
-# Todo: Install torch 2.1.0 for cu121 support (only available as nightly as of writing)
-## RUN --mount=type=cache,target=/root/.cache/pip python3 -m pip install --pre torch ninja setuptools --extra-index-url https://download.pytorch.org/whl/nightly/cu121
+# Install under /root/.local
+ENV PIP_USER="true"
+ARG PIP_NO_WARN_SCRIPT_LOCATION=0
+ARG PIP_ROOT_USER_ACTION="ignore"
+
+# Install build dependencies
+RUN apt-get update && apt-get upgrade -y && \
+    apt-get install -y --no-install-recommends python3-launchpadlib git curl && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
 
-# Todo: Install xformers nightly for Torch 2.1.0 support
-## RUN --mount=type=cache,target=/root/.cache/pip python3 -m pip install -v -U git+https://github.com/facebookresearch/xformers.git@main#egg=xformers
+# Install PyTorch and TensorFlow
+# The versions must align and be in sync with the requirements_linux_docker.txt
+# hadolint ignore=SC2102
+RUN --mount=type=cache,id=pip-$TARGETARCH$TARGETVARIANT,sharing=locked,target=/root/.cache/pip \
+    pip install -U --extra-index-url https://download.pytorch.org/whl/cu121 --extra-index-url https://pypi.nvidia.com \
+    torch==2.1.2 torchvision==0.16.2 \
+    xformers==0.0.23.post1 \
+    # Why [and-cuda]: https://github.com/tensorflow/tensorflow/issues/61468#issuecomment-1759462485
+    tensorflow[and-cuda]==2.14.0 \
+    ninja \
+    pip setuptools wheel
 
 # Install requirements
-COPY ./requirements.txt ./requirements_linux_docker.txt ./
-COPY ./setup/docker_setup.py ./setup.py
-RUN --mount=type=cache,target=/root/.cache/pip python3 -m pip install -r ./requirements_linux_docker.txt
-RUN --mount=type=cache,target=/root/.cache/pip python3 -m pip install -r ./requirements.txt
+RUN --mount=type=cache,id=pip-$TARGETARCH$TARGETVARIANT,sharing=locked,target=/root/.cache/pip \
+    --mount=source=requirements_linux_docker.txt,target=requirements_linux_docker.txt \
+    --mount=source=requirements.txt,target=requirements.txt \
+    --mount=source=setup/docker_setup.py,target=setup.py \
+    pip install -r requirements_linux_docker.txt -r requirements.txt
+
+# Replace pillow with pillow-simd (Only for x86)
+ARG TARGETPLATFORM
+RUN if [ "$TARGETPLATFORM" = "linux/amd64" ]; then \
+    apt-get update && apt-get install -y --no-install-recommends zlib1g-dev libjpeg62-turbo-dev build-essential && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/* && \
+    pip uninstall -y pillow && \
+    CC="cc -mavx2" pip install -U --force-reinstall pillow-simd; \
+    fi
+
+FROM python:3.10-slim as final
 
-# Replace pillow with pillow-simd
-RUN --mount=type=cache,target=/root/.cache/pip python3 -m pip uninstall -y pillow && \
-    CC="cc -mavx2" python3 -m pip install -U --force-reinstall pillow-simd
+ARG UID
+ARG VERSION
+ARG RELEASE
+
+LABEL name="bmaltais/kohya_ss" \
+    vendor="bmaltais" \
+    maintainer="bmaltais" \
+    # Dockerfile source repository
+    url="https://github.com/bmaltais/kohya_ss" \
+    version=${VERSION} \
+    # This should be a number, incremented with each change
+    release=${RELEASE} \
+    io.k8s.display-name="kohya_ss" \
+    summary="Kohya's GUI: This repository provides a Gradio GUI for Kohya's Stable Diffusion trainers(https://github.com/kohya-ss/sd-scripts)." \
+    description="The GUI allows you to set the training parameters and generate and run the required CLI commands to train the model. This is the docker image for Kohya's GUI. For more information about this tool, please visit the following website: https://github.com/bmaltais/kohya_ss."
+
+# Install runtime dependencies
+RUN apt-get update && \
+    apt-get install -y --no-install-recommends libgl1 libglib2.0-0 libjpeg62 libtcl8.6 libtk8.6 libgoogle-perftools-dev dumb-init && \
+    apt-get clean && \
+    rm -rf /var/lib/apt/lists/*
 
 # Fix missing libnvinfer7
-USER root
 RUN ln -s /usr/lib/x86_64-linux-gnu/libnvinfer.so /usr/lib/x86_64-linux-gnu/libnvinfer.so.7 && \
     ln -s /usr/lib/x86_64-linux-gnu/libnvinfer_plugin.so /usr/lib/x86_64-linux-gnu/libnvinfer_plugin.so.7
 
-RUN useradd -m -s /bin/bash appuser && \
-    chown -R appuser: /app
-USER appuser
-COPY --chown=appuser . .
+# Create user
+RUN groupadd -g $UID $UID && \
+    useradd -l -u $UID -g $UID -m -s /bin/sh -N $UID
 
-STOPSIGNAL SIGINT
+# Create directories with correct permissions
+RUN install -d -m 775 -o $UID -g 0 /dataset && \
+    install -d -m 775 -o $UID -g 0 /licenses && \
+    install -d -m 775 -o $UID -g 0 /app
+
+# Copy dist and support arbitrary user ids (OpenShift best practice)
+COPY --chown=$UID:0 --chmod=775 \
+    --from=build /root/.local /home/$UID/.local
+
+WORKDIR /app
+COPY --chown=$UID:0 --chmod=775 . .
+
+# Copy licenses (OpenShift Policy)
+COPY --chmod=775 LICENSE.md /licenses/LICENSE.md
+
+ENV PATH="/home/$UID/.local/bin:$PATH"
+ENV PYTHONPATH="${PYTHONPATH}:/home/$UID/.local/lib/python3.10/site-packages" 
 ENV LD_PRELOAD=libtcmalloc.so
 ENV PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION=python
-ENV PATH="$PATH:/home/appuser/.local/bin"
-CMD python3 "./kohya_gui.py" ${CLI_ARGS} --listen 0.0.0.0 --server_port 7860
+
+VOLUME [ "/dataset" ]
+
+# 7860: Kohya GUI
+# 6006: TensorBoard
+EXPOSE 7860 6006
+
+USER $UID
+
+STOPSIGNAL SIGINT
+
+# Use dumb-init as PID 1 to handle signals properly
+ENTRYPOINT ["dumb-init", "--"]
+CMD ["python3", "kohya_gui.py", "--listen", "0.0.0.0", "--server_port", "7860"]
diff --git a/README.md b/README.md
@@ -505,6 +505,30 @@ masterpiece, best quality, 1boy, in business suit, standing at street, looking b
 ## Change History
 * 2024/02/17 (v22.6.2)
 - Fix issue with Lora Extract GUI
+  - Fix syntax issue where parameter lora_network_weights is actually called network_weights
+- Merge sd-scripts v0.8.4 code update
+  - Fixed a bug that the VRAM usage without Text Encoder training is larger than before in training scripts for LoRA etc (`train_network.py`, `sdxl_train_network.py`).
+    - Text Encoders were not moved to CPU.
+  - Fixed typos. Thanks to akx! [PR #1053](https://github.com/kohya-ss/sd-scripts/pull/1053)
+  - The log output has been improved. PR [#905](https://github.com/kohya-ss/sd-scripts/pull/905) Thanks to shirayu!
+    - The log is formatted by default. The `rich` library is required. Please see [Upgrade](#upgrade) and update the library.
+    - If `rich` is not installed, the log output will be the same as before.
+    - The following options are available in each training script:
+    - `--console_log_simple` option can be used to switch to the previous log output.
+    - `--console_log_level` option can be used to specify the log level. The default is `INFO`.
+    - `--console_log_file` option can be used to output the log to a file. The default is `None` (output to the console).
+  - The sample image generation during multi-GPU training is now done with multiple GPUs. PR [#1061](https://github.com/kohya-ss/sd-scripts/pull/1061) Thanks to DKnight54!
+  - The support for mps devices is improved. PR [#1054](https://github.com/kohya-ss/sd-scripts/pull/1054) Thanks to akx! If mps device exists instead of CUDA, the mps device is used automatically.
+  - The `--new_conv_rank` option to specify the new rank of Conv2d is added to `networks/resize_lora.py`. PR [#1102](https://github.com/kohya-ss/sd-scripts/pull/1102) Thanks to mgz-dev!
+  - An option `--highvram` to disable the optimization for environments with little VRAM is added to the training scripts. If you specify it when there is enough VRAM, the operation will be faster.
+    - Currently, only the cache part of latents is optimized.
+  - The IPEX support is improved. PR [#1086](https://github.com/kohya-ss/sd-scripts/pull/1086) Thanks to Disty0!
+  - Fixed a bug that `svd_merge_lora.py` crashes in some cases. PR [#1087](https://github.com/kohya-ss/sd-scripts/pull/1087) Thanks to mgz-dev!
+  - DyLoRA is fixed to work with SDXL. PR [#1126](https://github.com/kohya-ss/sd-scripts/pull/1126) Thanks to tamlog06!
+  - The common image generation script `gen_img.py` for SD 1/2 and SDXL is added. The basic functions are the same as the scripts for SD 1/2 and SDXL, but some new features are added.
+    - External scripts to generate prompts can be supported. It can be called with `--from_module` option. (The documentation will be added later)
+    - The normalization method after prompt weighting can be specified with `--emb_normalize_mode` option. `original` is the original method, `abs` is the normalization with the average of the absolute values, `none` is no normalization.
+  - Gradual Latent Hires fix is added to each generation script. See [here](./docs/gen_img_README-ja.md#about-gradual-latent) for details.
 
 * 2024/02/15 (v22.6.1)
 - Add support for multi-gpu parameters in the GUI under the "Parameters > Advanced" tab.

diff --git a/XTI_hijack.py b/XTI_hijack.py
@@ -1,7 +1,7 @@
 import torch
-from library.ipex_interop import init_ipex
-
+from library.device_utils import init_ipex
 init_ipex()
+
 from typing import Union, List, Optional, Dict, Any, Tuple
 from diffusers.models.unet_2d_condition import UNet2DConditionOutput
 

diff --git a/docker-compose.yaml b/docker-compose.yaml
@@ -3,10 +3,12 @@ services:
   kohya-ss-gui:
     container_name: kohya-ss-gui
     image: kohya-ss-gui:latest
+    user: 1000:0
     build:
       context: .
+      args:
+        - UID=1000
     ports:
-      - 127.0.0.1:3000:3000
       - 7860:7860
       - 6006:6006
     tty: true
@@ -16,15 +18,15 @@ services:
       SAFETENSORS_FAST_GPU: 1
       DISPLAY: $DISPLAY
     tmpfs:
-      - /tmp      
+      - /tmp
     volumes:
-      - ./dataset:/dataset
-      - ./.cache/user:/home/appuser/.cache
-      - ./.cache/triton:/home/appuser/.triton    
-      - ./.cache/config:/app/appuser/.config
-      - ./.cache/nv:/home/appuser/.nv 
-      - ./.cache/keras:/home/appuser/.keras      
       - /tmp/.X11-unix:/tmp/.X11-unix
+      - ./dataset:/dataset
+      - ./.cache/user:/home/1000/.cache
+      - ./.cache/triton:/home/1000/.triton
+      - ./.cache/nv:/home/1000/.nv
+      - ./.cache/keras:/home/1000/.keras
+      - ./.cache/config:/home/1000/.config
     deploy:
       resources:
         reservations:

diff --git a/docs/LoRA/options.md b/docs/LoRA/options.md
@@ -34,7 +34,7 @@ U-Net is divided into "Down" (left half), "Mid" (bottom) and "Up" (right half).
 
 And it consists of 25 blocks in total: Down12 block, Mid1 block, and Up12 block. The neural net added here is simply called "UNet" in Kohya_ss.
 
-### RoLA Learning Object 2: Text Encoder
+### LoRA Learning Object 2: Text Encoder
 
 This isn't the only time LoRA adds neural nets .
 
@@ -177,15 +177,15 @@ The default value is 0.0001.
 
 ### LR Scheduler:
 
-You can change the learning rate in the middle of learning. A scheduler is a setting for how to change the learning rate.
+You can change the learning rate in the middle of learning. A scheduler is a setting for how to change the learning rate. Possible values include:
 
-adafactor: Select this to set the optimizer (described later) to Adafactor . Learn while automatically adjusting the learning rate according to the situation to save VRAM
-constant: the learning rate does not change from beginning to end
-constant_with_warmup: Start with a learning rate of 0 and gradually increase it toward the set value of Learning rate during warm-up, and use the set value of Learning rate during main learning.
-cosine : Gradually decrease the learning rate toward 0 while drawing a wave (cosine curve)
-cosine _with_restarts: repeat cosine many times (see also description of LR number of cycles)
-linear: Start at the Learning rate setting and decrease linearly towards 0
-polynomial: Same behavior as linear, but a bit more complicated to reduce (see also LR power description)
+- `adafactor`: Select this to set the optimizer (described later) to Adafactor. It learns while automatically adjusting the learning rate according to the situation to save VRAM
+- `constant`: the learning rate does not change from beginning to end
+- `constant_with_warmup`: Start with a learning rate of 0 and gradually increase it toward the set value of Learning rate during warm-up, and use the set value of Learning rate during main learning.
+- `cosine`: Gradually decrease the learning rate toward 0 while drawing a wave (cosine curve)
+- `cosine_with_restarts`: repeat cosine many times (see also description of LR number of cycles)
+- `linear`: Start at the Learning rate setting and decrease linearly towards 0
+- `polynomial`: Same behavior as linear, but a bit more complicated to reduce (see also LR power description)
 Set to constant if you want the learning rate to be fixed at the Learning rate setting.
 
 Default is cosine
@@ -204,13 +204,13 @@ Default is 10.
 
 ### Optimizer
 
-The optimizer is a setting for "how to update the neural net weights during training ". Various methods have been proposed for smart learning, but the most commonly used in LoRA learning is ``AdamW'' (32-bit) or ``AdamW8bit''. AdamW8bit uses less VRAM and has enough accuracy, so if you get lost, use this.
+The optimizer is a setting for "how to update the neural net weights during training". Various methods have been proposed for smart learning, but the most commonly used in LoRA learning is "AdamW" (32-bit) or "AdamW8bit". AdamW8bit uses less VRAM and has enough accuracy, so if you are unsure, use this.
 
 In addition, "Adafactor", which adjusts the learning rate appropriately according to the progress of learning while incorporating Adam's method, is also often used (Learning rate setting is ignored when using Adafactor).
 
-``DAdapt'' is an optimizer that adjusts the learning rate , and ``Lion'' is a relatively new optimizer , but it has not been fully verified yet. There is a report that "SGDNesterov" has good learning accuracy but slows down.
+"DAdapt" is an optimizer that adjusts the learning rate, and "Lion" is a relatively new optimizer, but it has not been fully verified yet. There is a report that "SGDNesterov" has good learning accuracy but slows down.
 
-The default is AdamW8bit. There is no problem basically as it is.
+The default is "AdamW8bit". There is no problem basically as it is.
 
 ### Optimizer extra arguments
 
@@ -785,4 +785,4 @@ Here are some commonly used settings:
 
 Default is blank. When the field is blank, the description example is displayed in faint color, so please refer to it.
 
-
+
diff --git a/docs/gen_img_README-ja.md b/docs/gen_img_README-ja.md
@@ -452,3 +452,36 @@ python gen_img_diffusers.py --ckpt wd-v1-3-full-pruned-half.ckpt
 
 - `--network_show_meta` : 追加ネットワークのメタデータを表示します。
 
+
+--- 
+
+# About Gradual Latent
+
+Gradual Latent is a Hires fix that gradually increases the size of the latent.  `gen_img.py`, `sdxl_gen_img.py`, and `gen_img_diffusers.py` have the following options.
+
+- `--gradual_latent_timesteps`: Specifies the timestep to start increasing the size of the latent. The default is None, which means Gradual Latent is not used. Please try around 750 at first.
+- `--gradual_latent_ratio`: Specifies the initial size of the latent. The default is 0.5, which means it starts with half the default latent size.
+- `--gradual_latent_ratio_step`: Specifies the ratio to increase the size of the latent. The default is 0.125, which means the latent size is gradually increased to 0.625, 0.75, 0.875, 1.0.
+- `--gradual_latent_ratio_every_n_steps`: Specifies the interval to increase the size of the latent. The default is 3, which means the latent size is increased every 3 steps.
+
+Each option can also be specified with prompt options, `--glt`, `--glr`, `--gls`, `--gle`.
+
+__Please specify `euler_a` for the sampler.__ The source code of the sampler has been modified for this feature, so it will not work with other samplers.
+
+It is more effective with SD 1.5. It is quite subtle with SDXL.
+
+# Gradual Latent について
+
+latentのサイズを徐々に大きくしていくHires fixです。`gen_img.py` 、`sdxl_gen_img.py` 、`gen_img_diffusers.py` に以下のオプションが追加されています。
+
+- `--gradual_latent_timesteps` : latentのサイズを大きくし始めるタイムステップを指定します。デフォルトは None で、Gradual Latentを使用しません。750 くらいから始めてみてください。
+- `--gradual_latent_ratio` : latentの初期サイズを指定します。デフォルトは 0.5 で、デフォルトの latent サイズの半分のサイズから始めます。
+- `--gradual_latent_ratio_step`: latentのサイズを大きくする割合を指定します。デフォルトは 0.125 で、latentのサイズを 0.625, 0.75, 0.875, 1.0 と徐々に大きくします。
+- `--gradual_latent_ratio_every_n_steps`: latentのサイズを大きくする間隔を指定します。デフォルトは 3 で、3ステップごとに latent のサイズを大きくします。
+
+それぞれのオプションは、プロンプトオプション、`--glt`、`--glr`、`--gls`、`--gle` でも指定できます。
+
+サンプラーに手を加えているため、__サンプラーに `euler_a` を指定してください。__ 他のサンプラーでは動作しません。
+
+SD 1.5 のほうが効果があります。SDXL ではかなり微妙です。
+