okotaku · okotaku · Dec 25, 2023 · Dec 23, 2023 · Dec 24, 2023 · Dec 25, 2023
diff --git a/Dockerfile b/Dockerfile
@@ -1,4 +1,4 @@
-FROM nvcr.io/nvidia/pytorch:23.11-py3
+FROM nvcr.io/nvidia/pytorch:23.12-py3
 
 RUN apt update -y && apt install -y \
     git tmux

diff --git a/README.md b/README.md
@@ -141,6 +141,7 @@ For detailed user guides and advanced guides, please refer to our [Documentation
 - [Run PixArt-α](https://diffengine.readthedocs.io/en/latest/run_guides/run_pixart_alpha.html)
 - [Run PixArt-α LoRA](https://diffengine.readthedocs.io/en/latest/run_guides/run_pixart_alpha_lora.html)
 - [Run PixArt-α DreamBooth](https://diffengine.readthedocs.io/en/latest/run_guides/run_pixart_alpha_dreambooth.html)
+- [Run Kandinsky 2.2](https://diffengine.readthedocs.io/en/latest/run_guides/run_kandinsky_v22.html)
 - [Inference](https://diffengine.readthedocs.io/en/latest/run_guides/inference.html)
 
 </details>
@@ -248,6 +249,9 @@ For detailed user guides and advanced guides, please refer to our [Documentation
       <td>
         <b>PixArt-α</b>
       </td>
+      <td>
+        <b>Kandinsky</b>
+      </td>
     </tr>
     <tr valign="top">
       <td>
@@ -269,6 +273,11 @@ For detailed user guides and advanced guides, please refer to our [Documentation
             <li><a href="configs/pixart_alpha_dreambooth/README.md">DreamBooth (CVPR'2023)</a></li>
       </ul>
       </td>
+      <td>
+        <ul>
+            <li><a href="configs/kandinsky_v22/README.md">Kandinsky 2.2 (2023)</a></li>
+      </ul>
+      </td>
     </tr>
 </td>
     </tr>

diff --git a/configs/_base_/datasets/pokemon_blip_kandinsky_decoder.py b/configs/_base_/datasets/pokemon_blip_kandinsky_decoder.py
@@ -0,0 +1,30 @@
+train_pipeline = [
+    dict(type="CLIPImageProcessor",
+         pretrained="kandinsky-community/kandinsky-2-2-prior"),
+    dict(type="torchvision/Resize", size=768, interpolation="bicubic"),
+    dict(type="RandomCrop", size=768),
+    dict(type="RandomHorizontalFlip", p=0.5),
+    dict(type="torchvision/ToTensor"),
+    dict(type="torchvision/Normalize", mean=[0.5], std=[0.5]),
+    dict(type="PackInputs", input_keys=["img", "text", "clip_img"]),
+]
+train_dataloader = dict(
+    batch_size=4,
+    num_workers=4,
+    dataset=dict(
+        type="HFDataset",
+        dataset="lambdalabs/pokemon-blip-captions",
+        pipeline=train_pipeline),
+    sampler=dict(type="DefaultSampler", shuffle=True),
+)
+
+val_dataloader = None
+val_evaluator = None
+test_dataloader = val_dataloader
+test_evaluator = val_evaluator
+
+custom_hooks = [
+    dict(type="VisualizationHook", prompt=["yoda pokemon"] * 4,
+         height=768, width=768),
+    dict(type="SDCheckpointHook"),
+]
diff --git a/configs/_base_/datasets/pokemon_blip_kandinsky_prior.py b/configs/_base_/datasets/pokemon_blip_kandinsky_prior.py
@@ -0,0 +1,25 @@
+train_pipeline = [
+    dict(type="CLIPImageProcessor", output_key="img",
+         pretrained="kandinsky-community/kandinsky-2-2-prior"),
+    dict(type="PackInputs"),
+]
+train_dataloader = dict(
+    batch_size=4,
+    num_workers=4,
+    dataset=dict(
+        type="HFDataset",
+        dataset="lambdalabs/pokemon-blip-captions",
+        pipeline=train_pipeline),
+    sampler=dict(type="DefaultSampler", shuffle=True),
+)
+
+val_dataloader = None
+val_evaluator = None
+test_dataloader = val_dataloader
+test_evaluator = val_evaluator
+
+custom_hooks = [
+    dict(type="VisualizationHook", prompt=["yoda pokemon"] * 4,
+         height=512, width=512),
+    dict(type="PriorSaveHook"),
+]
diff --git a/configs/_base_/datasets/pokemon_blip_wuerstchen.py b/configs/_base_/datasets/pokemon_blip_wuerstchen.py
@@ -25,5 +25,5 @@
 custom_hooks = [
     dict(type="VisualizationHook", prompt=["A robot pokemon, 4k photo"] * 4,
          height=768, width=768),
-    dict(type="WuerstchenSaveHook"),
+    dict(type="PriorSaveHook"),
 ]
diff --git a/configs/_base_/models/kandinsky_v22_decoder.py b/configs/_base_/models/kandinsky_v22_decoder.py
@@ -0,0 +1,4 @@
+model = dict(
+    type="KandinskyV22Decoder",
+    decoder_model="kandinsky-community/kandinsky-2-2-decoder",
+    prior_model="kandinsky-community/kandinsky-2-2-prior")
diff --git a/configs/_base_/models/kandinsky_v22_prior.py b/configs/_base_/models/kandinsky_v22_prior.py
@@ -0,0 +1,4 @@
+model = dict(
+    type="KandinskyV22Prior",
+    decoder_model="kandinsky-community/kandinsky-2-2-decoder",
+    prior_model="kandinsky-community/kandinsky-2-2-prior")
diff --git a/configs/kandinsky_v22/README.md b/configs/kandinsky_v22/README.md
@@ -0,0 +1,72 @@
+# Kandinsky 2.2
+
+[Kandinsky 2.2](https://habr.com/ru/companies/sberbank/articles/747446/)
+
+## Abstract
+
+Kandinsky 2.2 brings substantial improvements upon its predecessor, Kandinsky 2.1, by introducing a new, more powerful image encoder - CLIP-ViT-G and the ControlNet support. The switch to CLIP-ViT-G as the image encoder significantly increases the model’s capability to generate more aesthetic pictures and better understand text, thus enhancing the model’s overall performance. The addition of the ControlNet mechanism allows the model to effectively control the process of generating images. This leads to more accurate and visually appealing outputs and opens new possibilities for text-guided image manipulation.
+
+<div align=center>
+<img src="https://github.com/okotaku/diffengine/assets/24734142/b07d82fb-4c2c-4216-a4b1-a64b278cee2a"/>
+</div>
+
+## Citation
+
+```
+```
+
+## Run Training
+
+Start training with `mim train`, passing the config file of the model you want to train:
+
+```
+# single gpu
+$ mim train diffengine ${CONFIG_FILE}
+# multi gpus
+$ mim train diffengine ${CONFIG_FILE} --gpus 2 --launcher pytorch
+
+# Example.
+$ mim train diffengine configs/kandinsky_v22/kandinsky_v22_prior_pokemon_blip.py
+```
+
+## Inference prior with diffusers
+
+Once you have trained a model, specify the path to the saved checkpoint and use it for inference with a `diffusers` pipeline.
+
+```py
+import torch
+from diffusers import AutoPipelineForText2Image, PriorTransformer
+
+prompt = 'yoda pokemon'
+checkpoint = 'work_dirs/kandinsky_v22_prior_pokemon_blip/step10450'
+
+prior = PriorTransformer.from_pretrained(
+    checkpoint, subfolder="prior",
+)
+pipe = AutoPipelineForText2Image.from_pretrained(
+    "kandinsky-community/kandinsky-2-2-decoder",
+    prior_prior=prior,
+    torch_dtype=torch.float32,
+)
+pipe.to('cuda')
+
+image = pipe(
+    prompt,
+    num_inference_steps=50,
+    width=512,
+    height=512,
+).images[0]
+image.save('demo.png')
+```
+
+You can see more details on [`docs/source/run_guides/run_kandinsky_v22.md`](../../docs/source/run_guides/run_kandinsky_v22.md#inference-with-diffusers).
+
+## Results Example
+
+#### kandinsky_v22_prior_pokemon_blip
+
+![example1](https://github.com/okotaku/diffengine/assets/24734142/b709f558-5c03-4235-98d7-fe1c663182b8)
+
+#### kandinsky_v22_decoder_pokemon_blip
+
+![example1](https://github.com/okotaku/diffengine/assets/24734142/6c9cce50-9f31-4637-9933-27697d65c830)
diff --git a/configs/kandinsky_v22/kandinsky_v22_decoder_pokemon_blip.py b/configs/kandinsky_v22/kandinsky_v22_decoder_pokemon_blip.py
@@ -0,0 +1,6 @@
+_base_ = [
+    "../_base_/models/kandinsky_v22_decoder.py",
+    "../_base_/datasets/pokemon_blip_kandinsky_decoder.py",
+    "../_base_/schedules/stable_diffusion_50e.py",
+    "../_base_/default_runtime.py",
+]
diff --git a/configs/kandinsky_v22/kandinsky_v22_prior_pokemon_blip.py b/configs/kandinsky_v22/kandinsky_v22_prior_pokemon_blip.py
@@ -0,0 +1,6 @@
+_base_ = [
+    "../_base_/models/kandinsky_v22_prior.py",
+    "../_base_/datasets/pokemon_blip_kandinsky_prior.py",
+    "../_base_/schedules/stable_diffusion_50e.py",
+    "../_base_/default_runtime.py",
+]
diff --git a/diffengine/datasets/transforms/processing.py b/diffengine/datasets/transforms/processing.py
@@ -409,10 +409,15 @@ class CLIPImageProcessor(BaseTransform):
             results. Defaults to 'clip_img'.
     """
 
-    def __init__(self, key: str = "img", output_key: str = "clip_img") -> None:
+    def __init__(self, key: str = "img", output_key: str = "clip_img",
+                 pretrained: str | None = None) -> None:
         self.key = key
         self.output_key = output_key
-        self.pipeline = HFCLIPImageProcessor()
+        if pretrained is None:
+            self.pipeline = HFCLIPImageProcessor()
+        else:
+            self.pipeline = HFCLIPImageProcessor.from_pretrained(
+                pretrained, subfolder="image_processor")
 
     def transform(self, results: dict) -> dict | tuple[list, list] | None:
         """Transform.

diff --git a/diffengine/engine/hooks/__init__.py b/diffengine/engine/hooks/__init__.py
@@ -5,11 +5,11 @@
 from .lcm_ema_update_hook import LCMEMAUpdateHook
 from .peft_save_hook import PeftSaveHook
 from .pixart_checkpoint_hook import PixArtCheckpointHook
+from .prior_save_hook import PriorSaveHook
 from .sd_checkpoint_hook import SDCheckpointHook
 from .t2i_adapter_save_hook import T2IAdapterSaveHook
 from .unet_ema_hook import UnetEMAHook
 from .visualization_hook import VisualizationHook
-from .wuerstchen_save_hook import WuerstchenSaveHook
 
 __all__ = [
     "VisualizationHook",
@@ -21,7 +21,7 @@
     "T2IAdapterSaveHook",
     "CompileHook",
     "FastNormHook",
-    "WuerstchenSaveHook",
+    "PriorSaveHook",
     "LCMEMAUpdateHook",
     "PixArtCheckpointHook",
 ]
diff --git a/...gine/engine/hooks/wuerstchen_save_hook.py → diffengine/engine/hooks/prior_save_hook.py b/...gine/engine/hooks/wuerstchen_save_hook.py → diffengine/engine/hooks/prior_save_hook.py
@@ -7,11 +7,11 @@
 
 
 @HOOKS.register_module()
-class WuerstchenSaveHook(Hook):
-    """Wuerstchen Save Hook.
+class PriorSaveHook(Hook):
+    """Prior Save Hook.
 
-    Save Wuerstchen weights with diffusers format and pick up Wuerstchen
-    weights from checkpoint.
+    Save Prior weights with diffusers format and pick up Prior weights from
+    checkpoint.
     """
 
     priority = "VERY_LOW"
@@ -30,7 +30,8 @@ def before_save_checkpoint(self, runner, checkpoint: dict) -> None:
             model = model.module
         ckpt_path = osp.join(runner.work_dir, f"step{runner.iter}")
         model.prior.save_pretrained(osp.join(ckpt_path, "prior"))
-        if model.finetune_text_encoder:
+        if hasattr(
+            model, "finetune_text_encoder") and model.finetune_text_encoder:
             model.text_encoder.save_pretrained(
                 osp.join(ckpt_path, "text_encoder"))
 

diff --git a/diffengine/models/editors/__init__.py b/diffengine/models/editors/__init__.py
@@ -3,6 +3,7 @@
 from .esd import *  # noqa: F403
 from .instruct_pix2pix import *  # noqa: F403
 from .ip_adapter import *  # noqa: F403
+from .kandinsky import *  # noqa: F403
 from .lcm import *  # noqa: F403
 from .pixart_alpha import *  # noqa: F403
 from .ssd_1b import *  # noqa: F403

diff --git a/diffengine/models/editors/deepfloyd_if/deepfloyd_if.py b/diffengine/models/editors/deepfloyd_if/deepfloyd_if.py
@@ -46,6 +46,7 @@ class DeepFloydIF(BaseModel):
             training. Choose between 'epsilon' or 'v_prediction' or leave
             `None`. If left to `None` the default prediction type of the
             scheduler: `noise_scheduler.config.prediciton_type` is chosen.
+            Defaults to None.
         data_preprocessor (dict, optional): The pre-process config of
             :class:`SDDataPreprocessor`.
         noise_generator (dict, optional): The noise generator config.

diff --git a/diffengine/models/editors/kandinsky/__init__.py b/diffengine/models/editors/kandinsky/__init__.py
@@ -0,0 +1,6 @@
+from .kandinskyv22_decoder import KandinskyV22Decoder
+from .kandinskyv22_decoder_preprocessor import KandinskyV22DecoderDataPreprocessor
+from .kandinskyv22_prior import KandinskyV22Prior
+
+__all__ = ["KandinskyV22Prior", "KandinskyV22Decoder",
+           "KandinskyV22DecoderDataPreprocessor"]