Commit b46d7ab

Partial Loading PR3: Integrate 1) partial loading, 2) quantized models, 3) model patching (#7500)
## Summary

This PR is the third in a sequence of PRs working towards support for partial loading of models onto the compute device (for low-VRAM operation). This PR updates the LoRA patching code so that the following features can cooperate fully:

- Partial loading of weights onto the GPU
- Quantized layers / weights
- Model patches (e.g. LoRA)

Note that this PR does not yet enable partial loading. It adds support in the model patching code so that partial loading can be enabled in a future PR.

## Technical Design Decisions

The layer patching logic has been integrated into the custom layers (via `CustomModuleMixin`) rather than keeping it in a separate set of wrapper layers, as before. This has the following advantages:

- It makes it easier to calculate the modified weights on the fly and then reuse the normal forward() logic.
- In the future, it makes it possible to pass original parameters that have already been cast to the device down to the LoRA calculation without re-casting (the current implementation hasn't fully taken advantage of this yet).

## Known Limitations

1. I haven't fully solved device management for patch types that require the original layer value to calculate the patch. These aren't very common, and they aren't compatible with some quantized layers, so I'm leaving this for the future if there's demand.
2. There is a small speed regression for models that have CPU bottlenecks. It appears to be caused by slightly slower method resolution on the custom layer sub-classes. The regression does not show up on larger models, like FLUX, that are almost entirely GPU-limited. I think this small regression is tolerable, but if we decide that it's not, the slowdown can easily be reclaimed by optimizing other CPU operations (e.g. if we only sent every 2nd progress image, we'd see a much more significant speedup).

## Related Issues / Discussions

- #7492
- #7494

## QA Instructions

Speed tests:

- Vanilla SD1 speed regression
  - Before: 3.156s (8.78 it/s)
  - After: 3.54s (8.35 it/s)
- Vanilla SDXL speed regression
  - Before: 6.23s (4.46 it/s)
  - After: 6.45s (4.31 it/s)
- Vanilla FLUX speed regression
  - Before: 12.02s (2.27 it/s)
  - After: 11.91s (2.29 it/s)

LoRA tests with default configuration:

- [x] SD1: A handful of LoRA variants
- [x] SDXL: A handful of LoRA variants
- [x] FLUX non-quantized: multiple LoRA variants
- [x] FLUX bnb-quantized: multiple LoRA variants
- [x] FLUX ggml-quantized: multiple LoRA variants
- [x] FLUX non-quantized: FLUX control LoRA
- [x] FLUX bnb-quantized: FLUX control LoRA
- [x] FLUX ggml-quantized: FLUX control LoRA

LoRA tests with sidecar patching forced:

- [x] SD1: A handful of LoRA variants
- [x] SDXL: A handful of LoRA variants
- [x] FLUX non-quantized: multiple LoRA variants
- [x] FLUX bnb-quantized: multiple LoRA variants
- [x] FLUX ggml-quantized: multiple LoRA variants
- [x] FLUX non-quantized: FLUX control LoRA
- [x] FLUX bnb-quantized: FLUX control LoRA
- [x] FLUX ggml-quantized: FLUX control LoRA

Other:

- [x] Smoke testing of IP-Adapter, ControlNet

All tests repeated on:

- [x] cuda
- [x] cpu (only SD1 tested, because larger models are prohibitively slow)
- [x] mps (FLUX tests skipped, because my Mac doesn't have enough memory to run them in a reasonable amount of time)

## Merge Plan

No special instructions.

## Checklist

- [x] _The PR has a short but descriptive title, suitable for a changelog_
- [x] _Tests added / updated (if applicable)_
- [x] _Documentation added / updated (if applicable)_
- [ ] _Updated `What's New` copy (if doing a release after this PR)_
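
For reference, every call site in the diffs below switches from `LayerPatcher.apply_model_patches` / `LayerPatcher.apply_model_sidecar_patches` to the single `LayerPatcher.apply_smart_model_patches` entry point. The following is a minimal sketch of the new call shape using only the argument names visible in the diffs; the toy model and the empty patch iterator are illustrative stand-ins, not part of this PR:

```python
from contextlib import ExitStack

import torch

from invokeai.backend.patches.layer_patcher import LayerPatcher

# Toy stand-in; the real call sites pass a UNet, text encoder, or FLUX transformer.
model = torch.nn.Linear(8, 8)

with ExitStack() as exit_stack:
    exit_stack.enter_context(
        LayerPatcher.apply_smart_model_patches(
            model=model,
            patches=iter([]),  # real callers pass an Iterator[Tuple[ModelPatchRaw, float]]
            prefix="lora_unet_",
            dtype=next(model.parameters()).dtype,
            # Optional in the diffs below: cached_weights=..., and
            # force_sidecar_patching=True to force sidecar layers (used for
            # quantized FLUX transformers in flux_denoise.py).
        )
    )
    # Run inference here while the patches are applied.
```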
2 parents 6bf5b74 + 9a0a226 commit b46d7ab

File tree

50 files changed: +1732 / -1033 lines


invokeai/app/invocations/compel.py

Lines changed: 6 additions & 4 deletions

@@ -20,8 +20,8 @@
 from invokeai.app.services.shared.invocation_context import InvocationContext
 from invokeai.app.util.ti_utils import generate_ti_list
 from invokeai.backend.model_patcher import ModelPatcher
+from invokeai.backend.patches.layer_patcher import LayerPatcher
 from invokeai.backend.patches.model_patch_raw import ModelPatchRaw
-from invokeai.backend.patches.model_patcher import LayerPatcher
 from invokeai.backend.stable_diffusion.diffusion.conditioning_data import (
     BasicConditioningInfo,
     ConditioningFieldData,
@@ -82,10 +82,11 @@ def _lora_loader() -> Iterator[Tuple[ModelPatchRaw, float]]:
             # apply all patches while the model is on the target device
             text_encoder_info.model_on_device() as (cached_weights, text_encoder),
             tokenizer_info as tokenizer,
-            LayerPatcher.apply_model_patches(
+            LayerPatcher.apply_smart_model_patches(
                 model=text_encoder,
                 patches=_lora_loader(),
                 prefix="lora_te_",
+                dtype=text_encoder.dtype,
                 cached_weights=cached_weights,
             ),
             # Apply CLIP Skip after LoRA to prevent LoRA application from failing on skipped layers.
@@ -179,10 +180,11 @@ def _lora_loader() -> Iterator[Tuple[ModelPatchRaw, float]]:
             # apply all patches while the model is on the target device
             text_encoder_info.model_on_device() as (cached_weights, text_encoder),
             tokenizer_info as tokenizer,
-            LayerPatcher.apply_model_patches(
-                text_encoder,
+            LayerPatcher.apply_smart_model_patches(
+                model=text_encoder,
                 patches=_lora_loader(),
                 prefix=lora_prefix,
+                dtype=text_encoder.dtype,
                 cached_weights=cached_weights,
             ),
             # Apply CLIP Skip after LoRA to prevent LoRA application from failing on skipped layers.

invokeai/app/invocations/denoise_latents.py

Lines changed: 3 additions & 2 deletions

@@ -39,8 +39,8 @@
 from invokeai.backend.ip_adapter.ip_adapter import IPAdapter
 from invokeai.backend.model_manager import BaseModelType, ModelVariantType
 from invokeai.backend.model_patcher import ModelPatcher
+from invokeai.backend.patches.layer_patcher import LayerPatcher
 from invokeai.backend.patches.model_patch_raw import ModelPatchRaw
-from invokeai.backend.patches.model_patcher import LayerPatcher
 from invokeai.backend.stable_diffusion import PipelineIntermediateState
 from invokeai.backend.stable_diffusion.denoise_context import DenoiseContext, DenoiseInputs
 from invokeai.backend.stable_diffusion.diffusers_pipeline import (
@@ -1003,10 +1003,11 @@ def _lora_loader() -> Iterator[Tuple[ModelPatchRaw, float]]:
             ModelPatcher.apply_freeu(unet, self.unet.freeu_config),
             SeamlessExt.static_patch_model(unet, self.unet.seamless_axes),  # FIXME
             # Apply the LoRA after unet has been moved to its target device for faster patching.
-            LayerPatcher.apply_model_patches(
+            LayerPatcher.apply_smart_model_patches(
                 model=unet,
                 patches=_lora_loader(),
                 prefix="lora_unet_",
+                dtype=unet.dtype,
                 cached_weights=cached_weights,
             ),
         ):

invokeai/app/invocations/flux_denoise.py

Lines changed: 19 additions & 22 deletions

@@ -48,9 +48,9 @@
 )
 from invokeai.backend.flux.text_conditioning import FluxTextConditioning
 from invokeai.backend.model_manager.config import ModelFormat
+from invokeai.backend.patches.layer_patcher import LayerPatcher
 from invokeai.backend.patches.lora_conversions.flux_lora_constants import FLUX_LORA_TRANSFORMER_PREFIX
 from invokeai.backend.patches.model_patch_raw import ModelPatchRaw
-from invokeai.backend.patches.model_patcher import LayerPatcher
 from invokeai.backend.stable_diffusion.diffusers_pipeline import PipelineIntermediateState
 from invokeai.backend.stable_diffusion.diffusion.conditioning_data import FLUXConditioningInfo
 from invokeai.backend.util.devices import TorchDevice
@@ -304,36 +304,33 @@ def _run_diffusion(
         config = transformer_info.config
         assert config is not None
 
-        # Apply LoRA models to the transformer.
-        # Note: We apply the LoRA after the transformer has been moved to its target device for faster patching.
+        # Determine if the model is quantized.
+        # If the model is quantized, then we need to apply the LoRA weights as sidecar layers. This results in
+        # slower inference than direct patching, but is agnostic to the quantization format.
         if config.format in [ModelFormat.Checkpoint]:
-            # The model is non-quantized, so we can apply the LoRA weights directly into the model.
-            exit_stack.enter_context(
-                LayerPatcher.apply_model_patches(
-                    model=transformer,
-                    patches=self._lora_iterator(context),
-                    prefix=FLUX_LORA_TRANSFORMER_PREFIX,
-                    cached_weights=cached_weights,
-                )
-            )
+            model_is_quantized = False
         elif config.format in [
             ModelFormat.BnbQuantizedLlmInt8b,
             ModelFormat.BnbQuantizednf4b,
             ModelFormat.GGUFQuantized,
         ]:
-            # The model is quantized, so apply the LoRA weights as sidecar layers. This results in slower inference,
-            # than directly patching the weights, but is agnostic to the quantization format.
-            exit_stack.enter_context(
-                LayerPatcher.apply_model_sidecar_patches(
-                    model=transformer,
-                    patches=self._lora_iterator(context),
-                    prefix=FLUX_LORA_TRANSFORMER_PREFIX,
-                    dtype=inference_dtype,
-                )
-            )
+            model_is_quantized = True
         else:
             raise ValueError(f"Unsupported model format: {config.format}")
 
+        # Apply LoRA models to the transformer.
+        # Note: We apply the LoRA after the transformer has been moved to its target device for faster patching.
+        exit_stack.enter_context(
+            LayerPatcher.apply_smart_model_patches(
+                model=transformer,
+                patches=self._lora_iterator(context),
+                prefix=FLUX_LORA_TRANSFORMER_PREFIX,
+                dtype=inference_dtype,
+                cached_weights=cached_weights,
+                force_sidecar_patching=model_is_quantized,
+            )
+        )
+
         # Prepare IP-Adapter extensions.
         pos_ip_adapter_extensions, neg_ip_adapter_extensions = self._prep_ip_adapter_extensions(
             pos_image_prompt_clip_embeds=pos_image_prompt_clip_embeds,

invokeai/app/invocations/flux_text_encoder.py

Lines changed: 3 additions & 2 deletions

@@ -18,9 +18,9 @@
 from invokeai.app.services.shared.invocation_context import InvocationContext
 from invokeai.backend.flux.modules.conditioner import HFEncoder
 from invokeai.backend.model_manager.config import ModelFormat
+from invokeai.backend.patches.layer_patcher import LayerPatcher
 from invokeai.backend.patches.lora_conversions.flux_lora_constants import FLUX_LORA_CLIP_PREFIX
 from invokeai.backend.patches.model_patch_raw import ModelPatchRaw
-from invokeai.backend.patches.model_patcher import LayerPatcher
 from invokeai.backend.stable_diffusion.diffusion.conditioning_data import ConditioningFieldData, FLUXConditioningInfo
 
 
@@ -111,10 +111,11 @@ def _clip_encode(self, context: InvocationContext) -> torch.Tensor:
         if clip_text_encoder_config.format in [ModelFormat.Diffusers]:
             # The model is non-quantized, so we can apply the LoRA weights directly into the model.
             exit_stack.enter_context(
-                LayerPatcher.apply_model_patches(
+                LayerPatcher.apply_smart_model_patches(
                     model=clip_text_encoder,
                     patches=self._clip_lora_iterator(context),
                     prefix=FLUX_LORA_CLIP_PREFIX,
+                    dtype=clip_text_encoder.dtype,
                     cached_weights=cached_weights,
                 )
             )

invokeai/app/invocations/sd3_text_encoder.py

Lines changed: 3 additions & 2 deletions

@@ -17,9 +17,9 @@
 from invokeai.app.invocations.primitives import SD3ConditioningOutput
 from invokeai.app.services.shared.invocation_context import InvocationContext
 from invokeai.backend.model_manager.config import ModelFormat
+from invokeai.backend.patches.layer_patcher import LayerPatcher
 from invokeai.backend.patches.lora_conversions.flux_lora_constants import FLUX_LORA_CLIP_PREFIX
 from invokeai.backend.patches.model_patch_raw import ModelPatchRaw
-from invokeai.backend.patches.model_patcher import LayerPatcher
 from invokeai.backend.stable_diffusion.diffusion.conditioning_data import ConditioningFieldData, SD3ConditioningInfo
 
 # The SD3 T5 Max Sequence Length set based on the default in diffusers.
@@ -150,10 +150,11 @@ def _clip_encode(
         if clip_text_encoder_config.format in [ModelFormat.Diffusers]:
             # The model is non-quantized, so we can apply the LoRA weights directly into the model.
             exit_stack.enter_context(
-                LayerPatcher.apply_model_patches(
+                LayerPatcher.apply_smart_model_patches(
                     model=clip_text_encoder,
                     patches=self._clip_lora_iterator(context, clip_model),
                     prefix=FLUX_LORA_CLIP_PREFIX,
+                    dtype=clip_text_encoder.dtype,
                     cached_weights=cached_weights,
                 )
            )

invokeai/app/invocations/tiled_multi_diffusion_denoise_latents.py

Lines changed: 4 additions & 2 deletions

@@ -22,8 +22,8 @@
 from invokeai.app.invocations.model import UNetField
 from invokeai.app.invocations.primitives import LatentsOutput
 from invokeai.app.services.shared.invocation_context import InvocationContext
+from invokeai.backend.patches.layer_patcher import LayerPatcher
 from invokeai.backend.patches.model_patch_raw import ModelPatchRaw
-from invokeai.backend.patches.model_patcher import LayerPatcher
 from invokeai.backend.stable_diffusion.diffusers_pipeline import ControlNetData, PipelineIntermediateState
 from invokeai.backend.stable_diffusion.multi_diffusion_pipeline import (
     MultiDiffusionPipeline,
@@ -207,7 +207,9 @@ def _lora_loader() -> Iterator[Tuple[ModelPatchRaw, float]]:
         with (
             ExitStack() as exit_stack,
             unet_info as unet,
-            LayerPatcher.apply_model_patches(model=unet, patches=_lora_loader(), prefix="lora_unet_"),
+            LayerPatcher.apply_smart_model_patches(
+                model=unet, patches=_lora_loader(), prefix="lora_unet_", dtype=unet.dtype
+            ),
         ):
             assert isinstance(unet, UNet2DConditionModel)
             latents = latents.to(device=unet.device, dtype=unet.dtype)

invokeai/backend/model_manager/load/model_cache/cached_model/cached_model_with_partial_load.py

Lines changed: 13 additions & 10 deletions

@@ -1,9 +1,7 @@
 import torch
 
-from invokeai.backend.model_manager.load.model_cache.torch_module_autocast.torch_module_autocast import (
-    AUTOCAST_MODULE_TYPE_MAPPING,
-    apply_custom_layers_to_model,
-    remove_custom_layers_from_model,
+from invokeai.backend.model_manager.load.model_cache.torch_module_autocast.custom_modules.custom_module_mixin import (
+    CustomModuleMixin,
 )
 from invokeai.backend.util.calc_tensor_size import calc_tensor_size
 from invokeai.backend.util.logging import InvokeAILogger
@@ -45,10 +43,10 @@ def __init__(self, model: torch.nn.Module, compute_device: torch.device):
 
     def _find_modules_that_support_autocast(self) -> dict[str, torch.nn.Module]:
         """Find all modules that support autocasting."""
-        return {n: m for n, m in self._model.named_modules() if type(m) in AUTOCAST_MODULE_TYPE_MAPPING}
+        return {n: m for n, m in self._model.named_modules() if isinstance(m, CustomModuleMixin)}  # type: ignore
 
     def _find_keys_in_modules_that_do_not_support_autocast(self) -> set[str]:
-        keys_in_modules_that_do_not_support_autocast = set()
+        keys_in_modules_that_do_not_support_autocast: set[str] = set()
         for key in self._cpu_state_dict.keys():
             for module_name in self._modules_that_support_autocast.keys():
                 if key.startswith(module_name):
@@ -70,6 +68,11 @@ def _move_non_persistent_buffers_to_device(self, device: torch.device):
                 if name in module._non_persistent_buffers_set:
                     module._buffers[name] = buffer.to(device, copy=True)
 
+    def _set_autocast_enabled_in_all_modules(self, enabled: bool):
+        """Set autocast_enabled flag in all modules that support device autocasting."""
+        for module in self._modules_that_support_autocast.values():
+            module.set_device_autocasting_enabled(enabled)
+
     @property
     def model(self) -> torch.nn.Module:
         return self._model
@@ -114,7 +117,7 @@ def partial_load_to_vram(self, vram_bytes_to_load: int) -> int:
 
         cur_state_dict = self._model.state_dict()
 
-        # First, process the keys *must* be loaded into VRAM.
+        # First, process the keys that *must* be loaded into VRAM.
        for key in self._keys_in_modules_that_do_not_support_autocast:
            param = cur_state_dict[key]
            if param.device.type == self._compute_device.type:
@@ -157,10 +160,10 @@
         self._cur_vram_bytes += vram_bytes_loaded
 
         if fully_loaded:
-            remove_custom_layers_from_model(self._model)
+            self._set_autocast_enabled_in_all_modules(False)
             # TODO(ryand): Warn if the self.cur_vram_bytes() and self.total_bytes() are out of sync.
         else:
-            apply_custom_layers_to_model(self._model)
+            self._set_autocast_enabled_in_all_modules(True)
 
         # Move all non-persistent buffers to the compute device. These are a weird edge case and do not participate in
         # the vram_bytes_loaded tracking.
@@ -197,5 +200,5 @@ def partial_unload_from_vram(self, vram_bytes_to_free: int) -> int:
 
         # We may have gone from a fully-loaded model to a partially-loaded model, so we need to reapply the custom
         # layers.
-        apply_custom_layers_to_model(self._model)
+        self._set_autocast_enabled_in_all_modules(True)
         return vram_bytes_freed

invokeai/backend/model_manager/load/model_cache/model_cache.py

Lines changed: 7 additions & 0 deletions

@@ -13,6 +13,9 @@
 from invokeai.backend.model_manager.load.memory_snapshot import MemorySnapshot, get_pretty_snapshot_diff
 from invokeai.backend.model_manager.load.model_cache.cache_record import CacheRecord
 from invokeai.backend.model_manager.load.model_cache.cache_stats import CacheStats
+from invokeai.backend.model_manager.load.model_cache.torch_module_autocast.torch_module_autocast import (
+    apply_custom_layers_to_model,
+)
 from invokeai.backend.model_manager.load.model_util import calc_model_size_by_data
 from invokeai.backend.util.devices import TorchDevice
 from invokeai.backend.util.logging import InvokeAILogger
@@ -143,6 +146,10 @@ def put(
         size = calc_model_size_by_data(self._logger, model)
         self.make_room(size)
 
+        # Inject custom modules into the model.
+        if isinstance(model, torch.nn.Module):
+            apply_custom_layers_to_model(model)
+
         running_on_cpu = self._execution_device == torch.device("cpu")
         state_dict = model.state_dict() if isinstance(model, torch.nn.Module) and not running_on_cpu else None
         cache_record = CacheRecord(key=key, model=model, device=self._storage_device, state_dict=state_dict, size=size)

invokeai/backend/model_manager/load/model_cache/torch_module_autocast/autocast_modules.py

Lines changed: 0 additions & 50 deletions
This file was deleted.

(New file in the custom_modules directory; filename not shown in this view)

Lines changed: 8 additions & 0 deletions

@@ -0,0 +1,8 @@
+
+This directory contains custom implementations of common torch.nn.Module classes that add support for:
+- Streaming weights to the execution device
+- Applying sidecar patches at execution time (e.g. sidecar LoRA layers)
+
+Each custom class sub-classes the original module type that it is replacing, so the following properties are preserved:
+- `isinstance(m, torch.nn.OriginalModule)` should still work.
+- Patching the weights directly (e.g. for LoRA) should still work. (Of course, this is not possible for quantized layers, hence the sidecar support.)
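
For orientation, here is a hedged sketch of the `CustomModuleMixin` contract implied by the call sites in this commit (`set_device_autocasting_enabled` in cached_model_with_partial_load.py above, and `_patches_and_weights` / `_aggregate_patch_parameters` in `CustomConv1d` below). The class name, defaults, and signatures here are assumptions, not the actual implementation in custom_module_mixin.py:

```python
import torch


class CustomModuleMixinSketch:
    """Hedged sketch of the mixin interface inferred from this diff."""

    def __init__(self) -> None:
        # Whether forward() should stream weights to the input's device.
        self._device_autocasting_enabled = False
        # Registered (patch, weight) pairs, e.g. sidecar LoRA layers.
        self._patches_and_weights: list[tuple[object, float]] = []

    def set_device_autocasting_enabled(self, enabled: bool) -> None:
        # Toggled by CachedModelWithPartialLoad when the model becomes fully
        # loaded (False) or partially loaded (True).
        self._device_autocasting_enabled = enabled

    def _aggregate_patch_parameters(
        self,
        patches_and_weights: list[tuple[object, float]],
        orig_params: dict[str, torch.Tensor],
        device: torch.device,
    ) -> dict[str, torch.Tensor]:
        # Combine the registered patches into per-parameter residual tensors
        # that the custom forward() adds to the device-cast weights.
        raise NotImplementedError
```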

(New file defining CustomConv1d; filename not shown in this view)

Lines changed: 43 additions & 0 deletions

@@ -0,0 +1,43 @@
+import torch
+
+from invokeai.backend.model_manager.load.model_cache.torch_module_autocast.cast_to_device import cast_to_device
+from invokeai.backend.model_manager.load.model_cache.torch_module_autocast.custom_modules.custom_module_mixin import (
+    CustomModuleMixin,
+)
+from invokeai.backend.model_manager.load.model_cache.torch_module_autocast.custom_modules.utils import (
+    add_nullable_tensors,
+)
+
+
+class CustomConv1d(torch.nn.Conv1d, CustomModuleMixin):
+    def _autocast_forward_with_patches(self, input: torch.Tensor) -> torch.Tensor:
+        weight = cast_to_device(self.weight, input.device)
+        bias = cast_to_device(self.bias, input.device)
+
+        # Prepare the original parameters for the patch aggregation.
+        orig_params = {"weight": weight, "bias": bias}
+        # Filter out None values.
+        orig_params = {k: v for k, v in orig_params.items() if v is not None}
+
+        aggregated_param_residuals = self._aggregate_patch_parameters(
+            patches_and_weights=self._patches_and_weights,
+            orig_params=orig_params,
+            device=input.device,
+        )
+
+        weight = add_nullable_tensors(weight, aggregated_param_residuals.get("weight", None))
+        bias = add_nullable_tensors(bias, aggregated_param_residuals.get("bias", None))
+        return self._conv_forward(input, weight, bias)
+
+    def _autocast_forward(self, input: torch.Tensor) -> torch.Tensor:
+        weight = cast_to_device(self.weight, input.device)
+        bias = cast_to_device(self.bias, input.device)
+        return self._conv_forward(input, weight, bias)
+
+    def forward(self, input: torch.Tensor) -> torch.Tensor:
+        if len(self._patches_and_weights) > 0:
+            return self._autocast_forward_with_patches(input)
+        elif self._device_autocasting_enabled:
+            return self._autocast_forward(input)
+        else:
+            return super().forward(input)
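
A hedged usage sketch of these custom modules, assuming `apply_custom_layers_to_model` (imported in model_cache.py above) swaps supported layers for their Custom* counterparts in place; the toy `Sequential` model is illustrative only:

```python
import torch

from invokeai.backend.model_manager.load.model_cache.torch_module_autocast.torch_module_autocast import (
    apply_custom_layers_to_model,
)

# Toy model; apply_custom_layers_to_model is assumed to replace supported
# layers (e.g. torch.nn.Conv1d) with their Custom* counterparts in place.
model = torch.nn.Sequential(torch.nn.Conv1d(4, 8, kernel_size=3), torch.nn.ReLU())
apply_custom_layers_to_model(model)

# Property from the new README above: the swapped-in layer is still an instance
# of the original module type, so existing isinstance checks keep working.
assert isinstance(model[0], torch.nn.Conv1d)

# With no patches registered and device autocasting explicitly disabled,
# forward() falls through to the stock torch.nn.Conv1d implementation
# (the final branch of CustomConv1d.forward above).
model[0].set_device_autocasting_enabled(False)
out = model(torch.randn(1, 4, 16))  # expected shape: (1, 8, 14)
```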
