Commit e24a516

Update SD3 init parameters (replacing height, width with image_shape) (#1951)
* Replace SD3 `height` and `width` with `image_shape`
* Update URI
* Revert comment
* Update SD3 handle
* Replace `height` and `width` with `image_shape`
* Update docstrings
* Fix CI
1 parent 1283e70 commit e24a516

13 files changed: +34 -46 lines
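In user-facing terms, every Stable Diffusion 3 entry point that previously took separate `height` and `width` ints now takes a single `image_shape` tuple of `(height, width, channels)`. A minimal before/after sketch based on the docstring updates in this commit:

```python
import keras_hub

# Before this commit:
# text_to_image = keras_hub.models.StableDiffusion3TextToImage.from_preset(
#     "stable_diffusion_3_medium", height=512, width=512
# )

# After this commit: one (height, width, channels) tuple.
text_to_image = keras_hub.models.StableDiffusion3TextToImage.from_preset(
    "stable_diffusion_3_medium", image_shape=(512, 512, 3)
)
text_to_image.generate(
    "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k"
)
```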

keras_hub/src/models/image_to_image.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -234,7 +234,7 @@ def normalize_images(x):
             input_is_scalar = True
             x = ops.image.resize(
                 x,
-                (self.backbone.height, self.backbone.width),
+                (self.backbone.image_shape[0], self.backbone.image_shape[1]),
                 interpolation="nearest",
                 data_format=data_format,
             )
```
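The same pattern repeats in the preprocessing helpers throughout this commit: the spatial size is read from the first two entries of the backbone's `image_shape` instead of separate `height`/`width` attributes. A standalone sketch of the resize step (the helper name is illustrative; it assumes a channels-last layout and `keras.ops`):

```python
from keras import ops

def resize_to_backbone(x, image_shape, data_format="channels_last"):
    # Resize inputs to the height and width stored in `image_shape`,
    # mirroring the `normalize_images` change above.
    return ops.image.resize(
        x,
        (image_shape[0], image_shape[1]),
        interpolation="nearest",
        data_format=data_format,
    )

x = ops.ones((2, 1024, 1024, 3))
print(resize_to_backbone(x, (512, 512, 3)).shape)  # (2, 512, 512, 3)
```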

keras_hub/src/models/inpaint.py

Lines changed: 6 additions & 6 deletions
```diff
@@ -202,7 +202,7 @@ def normalize(x):
             input_is_scalar = True
             x = ops.image.resize(
                 x,
-                (self.backbone.height, self.backbone.width),
+                (self.backbone.image_shape[0], self.backbone.image_shape[1]),
                 interpolation="nearest",
                 data_format=data_format,
             )
@@ -240,7 +240,7 @@ def normalize(x):
             x = ops.cast(x, "float32")
             x = ops.image.resize(
                 x,
-                (self.backbone.height, self.backbone.width),
+                (self.backbone.image_shape[0], self.backbone.image_shape[1]),
                 interpolation="nearest",
                 data_format=data_format,
             )
@@ -303,7 +303,7 @@ def normalize_images(x):
             input_is_scalar = True
             x = ops.image.resize(
                 x,
-                (self.backbone.height, self.backbone.width),
+                (self.backbone.image_shape[0], self.backbone.image_shape[1]),
                 interpolation="nearest",
                 data_format=data_format,
             )
@@ -323,7 +323,7 @@ def normalize_masks(x):
             x = ops.cast(x, "float32")
             x = ops.image.resize(
                 x,
-                (self.backbone.height, self.backbone.width),
+                (self.backbone.image_shape[0], self.backbone.image_shape[1]),
                 interpolation="nearest",
                 data_format=data_format,
             )
@@ -384,8 +384,8 @@ def generate(
 
         Typically, `inputs` is a dict with `"images"` `"masks"` and `"prompts"`
         keys. `"images"` are reference images within a value range of
-        `[-1.0, 1.0]`, which will be resized to `self.backbone.height` and
-        `self.backbone.width`, then encoded into latent space by the VAE
+        `[-1.0, 1.0]`, which will be resized to height and width from
+        `self.backbone.image_shape`, then encoded into latent space by the VAE
         encoder. `"masks"` are mask images with a boolean dtype, where white
         pixels are repainted while black pixels are preserved. `"prompts"` are
         strings that will be tokenized and encoded by the text encoder.
```
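Following the updated `generate()` docstring, a hedged sketch of an inpainting call (the prompt and mask region are illustrative): the `"images"` entry is resized to the height and width from `self.backbone.image_shape` before VAE encoding, and white mask pixels are repainted:

```python
import numpy as np
import keras_hub

inpaint = keras_hub.models.StableDiffusion3Inpaint.from_preset(
    "stable_diffusion_3_medium", image_shape=(512, 512, 3)
)
# Reference image with values in [-1.0, 1.0].
reference_image = np.random.uniform(
    -1.0, 1.0, size=(1024, 1024, 3)
).astype("float32")
# Boolean mask: True (white) pixels are repainted, False (black) preserved.
reference_mask = np.zeros((1024, 1024), dtype="bool")
reference_mask[256:768, 256:768] = True
inpaint.generate(
    {
        "images": reference_image,
        "masks": reference_mask,
        "prompts": "a red fox sitting in the grass",
    }
)
```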

keras_hub/src/models/stable_diffusion_3/stable_diffusion_3_backbone.py

Lines changed: 12 additions & 16 deletions
```diff
@@ -215,8 +215,8 @@ class StableDiffusion3Backbone(Backbone):
             model. Defaults to `1000`.
         shift: float. The shift value for the timestep schedule. Defaults to
             `3.0`.
-        height: optional int. The output height of the image.
-        width: optional int. The output width of the image.
+        image_shape: tuple. The input shape without the batch size. Defaults to
+            `(1024, 1024, 3)`.
         data_format: `None` or str. If specified, either `"channels_last"` or
             `"channels_first"`. The ordering of the dimensions in the
             inputs. `"channels_last"` corresponds to inputs with shape
@@ -270,23 +270,21 @@ def __init__(
         output_channels=3,
         num_train_timesteps=1000,
         shift=3.0,
-        height=None,
-        width=None,
+        image_shape=(1024, 1024, 3),
         data_format=None,
         dtype=None,
         **kwargs,
     ):
-        height = int(height or 1024)
-        width = int(width or 1024)
-        if height % 8 != 0 or width % 8 != 0:
-            raise ValueError(
-                "`height` and `width` must be divisible by 8. "
-                f"Received: height={height}, width={width}"
-            )
         data_format = standardize_data_format(data_format)
         if data_format != "channels_last":
             raise NotImplementedError
-        image_shape = (height, width, int(vae.input_channels))
+        height = image_shape[0]
+        width = image_shape[1]
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(
+                "height and width in `image_shape` must be divisible by 8. "
+                f"Received: image_shape={image_shape}"
+            )
         latent_shape = (height // 8, width // 8, int(latent_channels))
         context_shape = (None, 4096 if t5 is None else t5.hidden_dim)
         pooled_projection_shape = (clip_l.hidden_dim + clip_g.hidden_dim,)
@@ -452,8 +450,7 @@ def __init__(
         self.output_channels = output_channels
         self.num_train_timesteps = num_train_timesteps
         self.shift = shift
-        self.height = height
-        self.width = width
+        self.image_shape = image_shape
 
     @property
     def latent_shape(self):
@@ -585,8 +582,7 @@ def get_config(self):
                 "output_channels": self.output_channels,
                 "num_train_timesteps": self.num_train_timesteps,
                 "shift": self.shift,
-                "height": self.height,
-                "width": self.width,
+                "image_shape": self.image_shape,
             }
         )
         return config
```
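Height and width now come out of the tuple, and the divisible-by-8 constraint carries over because the latent tensor is downsampled by a factor of 8 on each spatial axis when building `latent_shape`. A standalone sketch of the moved validation (the helper name is illustrative; `latent_channels=16` here assumes SD3's 16-channel latent space):

```python
def check_image_shape(image_shape, latent_channels=16):
    # Mirrors the constructor validation above: spatial dims must be
    # divisible by 8 so the latent grid has an integer size.
    height, width = image_shape[0], image_shape[1]
    if height % 8 != 0 or width % 8 != 0:
        raise ValueError(
            "height and width in `image_shape` must be divisible by 8. "
            f"Received: image_shape={image_shape}"
        )
    return (height // 8, width // 8, latent_channels)

print(check_image_shape((1024, 1024, 3)))  # (128, 128, 16)
# check_image_shape((1020, 1024, 3)) raises ValueError: 1020 % 8 == 4
```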

keras_hub/src/models/stable_diffusion_3/stable_diffusion_3_backbone_test.py

Lines changed: 4 additions & 5 deletions
```diff
@@ -11,7 +11,8 @@
 
 class StableDiffusion3BackboneTest(TestCase):
     def setUp(self):
-        height, width = 64, 64
+        image_shape = (64, 64, 3)
+        height, width = image_shape[0], image_shape[1]
         vae = VAEBackbone(
             [32, 32, 32, 32],
             [1, 1, 1, 1],
@@ -36,8 +37,7 @@ def setUp(self):
             "vae": vae,
             "clip_l": clip_l,
             "clip_g": clip_g,
-            "height": height,
-            "width": width,
+            "image_shape": image_shape,
         }
         self.input_data = {
             "images": ops.ones((2, height, width, 3)),
@@ -82,7 +82,6 @@ def test_all_presets(self):
                 preset=preset,
                 input_data=self.input_data,
                 init_kwargs={
-                    "height": self.init_kwargs["height"],
-                    "width": self.init_kwargs["width"],
+                    "image_shape": self.init_kwargs["image_shape"],
                 },
             )
```

keras_hub/src/models/stable_diffusion_3/stable_diffusion_3_image_to_image.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -27,7 +27,7 @@ class StableDiffusion3ImageToImage(ImageToImage):
     Use `generate()` to do image generation.
     ```python
     image_to_image = keras_hub.models.StableDiffusion3ImageToImage.from_preset(
-        "stable_diffusion_3_medium", height=512, width=512
+        "stable_diffusion_3_medium", image_shape=(512, 512, 3)
     )
     image_to_image.generate(
         {
```

keras_hub/src/models/stable_diffusion_3/stable_diffusion_3_image_to_image_test.py

Lines changed: 1 addition & 2 deletions
```diff
@@ -55,8 +55,7 @@ def setUp(self):
             clip_g=CLIPTextEncoder(
                 20, 128, 128, 2, 2, 256, "gelu", -2, name="clip_g"
             ),
-            height=64,
-            width=64,
+            image_shape=(64, 64, 3),
         )
         self.init_kwargs = {
             "preprocessor": self.preprocessor,
```

keras_hub/src/models/stable_diffusion_3/stable_diffusion_3_inpaint.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -29,7 +29,7 @@ class StableDiffusion3Inpaint(Inpaint):
     reference_image = np.ones((1024, 1024, 3), dtype="float32")
     reference_mask = np.ones((1024, 1024), dtype="float32")
     inpaint = keras_hub.models.StableDiffusion3Inpaint.from_preset(
-        "stable_diffusion_3_medium", height=512, width=512
+        "stable_diffusion_3_medium", image_shape=(512, 512, 3)
     )
     inpaint.generate(
         reference_image,
```

keras_hub/src/models/stable_diffusion_3/stable_diffusion_3_inpaint_test.py

Lines changed: 1 addition & 2 deletions
```diff
@@ -55,8 +55,7 @@ def setUp(self):
             clip_g=CLIPTextEncoder(
                 20, 128, 128, 2, 2, 256, "gelu", -2, name="clip_g"
             ),
-            height=64,
-            width=64,
+            image_shape=(64, 64, 3),
        )
         self.init_kwargs = {
             "preprocessor": self.preprocessor,
```

keras_hub/src/models/stable_diffusion_3/stable_diffusion_3_presets.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -13,6 +13,6 @@
             "path": "stable_diffusion_3",
             "model_card": "https://arxiv.org/abs/2110.00476",
         },
-        "kaggle_handle": "kaggle://keras/stablediffusion3/keras/stable_diffusion_3_medium/2",
+        "kaggle_handle": "kaggle://keras/stablediffusion3/keras/stable_diffusion_3_medium/3",
     }
 }
```

keras_hub/src/models/stable_diffusion_3/stable_diffusion_3_text_to_image.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -27,7 +27,7 @@ class StableDiffusion3TextToImage(TextToImage):
     Use `generate()` to do image generation.
     ```python
     text_to_image = keras_hub.models.StableDiffusion3TextToImage.from_preset(
-        "stable_diffusion_3_medium", height=512, width=512
+        "stable_diffusion_3_medium", image_shape=(512, 512, 3)
     )
     text_to_image.generate(
         "Astronaut in a jungle, cold color palette, muted colors, detailed, 8k"
```

keras_hub/src/models/stable_diffusion_3/stable_diffusion_3_text_to_image_test.py

Lines changed: 1 addition & 2 deletions
```diff
@@ -55,8 +55,7 @@ def setUp(self):
             clip_g=CLIPTextEncoder(
                 20, 128, 128, 2, 2, 256, "gelu", -2, name="clip_g"
             ),
-            height=64,
-            width=64,
+            image_shape=(64, 64, 3),
         )
         self.init_kwargs = {
             "preprocessor": self.preprocessor,
```

keras_hub/src/utils/preset_utils.py

Lines changed: 2 additions & 4 deletions
```diff
@@ -563,10 +563,8 @@ def get_backbone_kwargs(self, **kwargs):
         backbone_kwargs["dtype"] = kwargs.pop("dtype", None)
 
         # Forward `height` and `width` to backbone when using `TextToImage`.
-        if "height" in kwargs:
-            backbone_kwargs["height"] = kwargs.pop("height", None)
-        if "width" in kwargs:
-            backbone_kwargs["width"] = kwargs.pop("width", None)
+        if "image_shape" in kwargs:
+            backbone_kwargs["image_shape"] = kwargs.pop("image_shape", None)
 
         return backbone_kwargs, kwargs
```
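This hook is what lets the task-level `from_preset(..., image_shape=...)` calls in the docstrings above reach the backbone: the kwarg is popped from the task kwargs and forwarded to the backbone constructor. A simplified sketch of the split (the function name is illustrative; the real method also lives on a preset loader and handles more than these two keys):

```python
def split_backbone_kwargs(**kwargs):
    # Simplified view of `get_backbone_kwargs` after this commit.
    backbone_kwargs = {"dtype": kwargs.pop("dtype", None)}
    if "image_shape" in kwargs:
        backbone_kwargs["image_shape"] = kwargs.pop("image_shape", None)
    return backbone_kwargs, kwargs

backbone_kwargs, task_kwargs = split_backbone_kwargs(
    image_shape=(512, 512, 3), dtype="bfloat16"
)
print(backbone_kwargs)  # {'dtype': 'bfloat16', 'image_shape': (512, 512, 3)}
print(task_kwargs)      # {}
```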

tools/checkpoint_conversion/convert_stable_diffusion_3_checkpoints.py

Lines changed: 2 additions & 4 deletions
```diff
@@ -113,8 +113,7 @@ def convert_model(preset, height, width):
         vae,
         clip_l,
         clip_g,
-        height=height,
-        width=width,
+        image_shape=(height, width, 3),
         name="stable_diffusion_3_backbone",
     )
     return backbone
@@ -532,8 +531,7 @@ def main(_):
 
     keras_preprocessor.save_to_preset(preset)
     # Set the image size to 1024, the same as in huggingface/diffusers.
-    keras_model.height = 1024
-    keras_model.width = 1024
+    keras_model.image_shape = (1024, 1024, 3)
     keras_model.save_to_preset(preset)
     print(f"🏁 Preset saved to ./{preset}.")
```
