import torch

from invokeai.backend.model_manager.load.model_cache.torch_module_autocast.torch_module_autocast import (
    AUTOCAST_MODULE_TYPE_MAPPING,
    apply_custom_layers_to_model,
    remove_custom_layers_from_model,
)
from invokeai.backend.util.calc_tensor_size import calc_tensor_size
from invokeai.backend.util.logging import InvokeAILogger


def set_nested_attr(obj: object, attr: str, value: object):
    """A helper function that extends setattr() to support nested attributes.

    Example:
        set_nested_attr(model, "module.encoder.conv1.weight", new_conv1_weight)
    """
    attrs = attr.split(".")
    # Walk down to the parent of the target attribute, then set the final attribute on it.
    for sub_attr in attrs[:-1]:
        obj = getattr(obj, sub_attr)
    setattr(obj, attrs[-1], value)


class CachedModelWithPartialLoad:
    """A wrapper around a PyTorch model to handle partial loads and unloads between the CPU and the compute device.

    Note: "VRAM" is used throughout this class to refer to the memory on the compute device. It could be CUDA memory,
    MPS memory, etc.
    """

    def __init__(self, model: torch.nn.Module, compute_device: torch.device):
        self._model = model
        self._compute_device = compute_device

        # A CPU read-only copy of the model's state dict.
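        # It keeps references to the original CPU tensors so that, when weights are later unloaded from VRAM, they
        # can simply be restored from this dict (see partial_unload_from_vram()) rather than copied back from the
        # compute device.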
        self._cpu_state_dict: dict[str, torch.Tensor] = model.state_dict()

        # TODO(ryand): Handle the case where the model size changes after the initial load (e.g. due to dtype casting).
        # Consider how we should handle this for both self._total_bytes and self._cur_vram_bytes.
        self._total_bytes = sum(calc_tensor_size(p) for p in self._cpu_state_dict.values())
        self._cur_vram_bytes: int | None = None

        self._modules_that_support_autocast = self._find_modules_that_support_autocast()
        self._keys_in_modules_that_do_not_support_autocast = self._find_keys_in_modules_that_do_not_support_autocast()

    def _find_modules_that_support_autocast(self) -> dict[str, torch.nn.Module]:
        """Find all modules that support autocasting."""
        return {n: m for n, m in self._model.named_modules() if type(m) in AUTOCAST_MODULE_TYPE_MAPPING}

    def _find_keys_in_modules_that_do_not_support_autocast(self) -> set[str]:
        keys_in_modules_that_do_not_support_autocast = set()
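        # A state dict key belongs to a module if its fully-qualified name starts with that module's name. The
        # for/else below adds a key to the set only when the inner loop finishes without hitting `break`, i.e. when
        # the key does not fall under any module that supports autocasting.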
        for key in self._cpu_state_dict.keys():
            for module_name in self._modules_that_support_autocast.keys():
                if key.startswith(module_name):
                    break
            else:
                keys_in_modules_that_do_not_support_autocast.add(key)
        return keys_in_modules_that_do_not_support_autocast

    def _move_non_persistent_buffers_to_device(self, device: torch.device):
        """Move the non-persistent buffers to the target device. These buffers are not included in the state dict,
        so we need to move them manually.
        """
        # HACK(ryand): Typically, non-persistent buffers are moved when calling module.to(device). We don't move entire
        # modules, because we manage the devices of individual tensors using the state dict. Since non-persistent
        # buffers are not included in the state dict, we need to handle them manually. The only way to do this is by
        # using private torch.nn.Module attributes.
        for module in self._model.modules():
            for name, buffer in module.named_buffers():
                if name in module._non_persistent_buffers_set:
                    module._buffers[name] = buffer.to(device, copy=True)

    @property
    def model(self) -> torch.nn.Module:
        return self._model

    def get_cpu_state_dict(self) -> dict[str, torch.Tensor] | None:
        """Get a read-only copy of the model's state dict in RAM."""
        # TODO(ryand): Document this better.
        return self._cpu_state_dict

    def total_bytes(self) -> int:
        """Get the total size (in bytes) of all the weights in the model."""
        return self._total_bytes

    def cur_vram_bytes(self) -> int:
        """Get the size (in bytes) of the weights that are currently in VRAM."""
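        # The value is computed lazily from the current state dict on first access, and is then kept up to date
        # incrementally by partial_load_to_vram() and partial_unload_from_vram().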
        if self._cur_vram_bytes is None:
            cur_state_dict = self._model.state_dict()
            self._cur_vram_bytes = sum(
                calc_tensor_size(p) for p in cur_state_dict.values() if p.device.type == self._compute_device.type
            )
        return self._cur_vram_bytes

    def full_load_to_vram(self) -> int:
        """Load all weights into VRAM."""
        return self.partial_load_to_vram(self.total_bytes())

    def full_unload_from_vram(self) -> int:
        """Unload all weights from VRAM."""
        return self.partial_unload_from_vram(self.total_bytes())

    @torch.no_grad()
    def partial_load_to_vram(self, vram_bytes_to_load: int) -> int:
        """Load more weights into VRAM without exceeding vram_bytes_to_load.

        Returns:
            The number of bytes loaded into VRAM.
        """
        # TODO(ryand): Handle the case where an exception is thrown while loading or unloading weights. At the very
        # least, we should reset self._cur_vram_bytes to None.

        vram_bytes_loaded = 0

        cur_state_dict = self._model.state_dict()

        # First, process the keys that *must* be loaded into VRAM.
        for key in self._keys_in_modules_that_do_not_support_autocast:
            param = cur_state_dict[key]
            if param.device.type == self._compute_device.type:
                continue

            param_size = calc_tensor_size(param)
            cur_state_dict[key] = param.to(self._compute_device, copy=True)
            vram_bytes_loaded += param_size

        if vram_bytes_loaded > vram_bytes_to_load:
            logger = InvokeAILogger.get_logger()
            logger.warning(
                f"Loaded {vram_bytes_loaded / 2**20} MB into VRAM, but only {vram_bytes_to_load / 2**20} MB were "
                "requested. This is the minimum set of weights in VRAM required to run the model."
            )

        # Next, process the keys that can optionally be loaded into VRAM.
        fully_loaded = True
        for key, param in cur_state_dict.items():
            if param.device.type == self._compute_device.type:
                continue

            param_size = calc_tensor_size(param)
            if vram_bytes_loaded + param_size > vram_bytes_to_load:
                # TODO(ryand): Should we just break here? If we couldn't fit this parameter into VRAM, is it really
                # worth continuing to search for a smaller parameter that would fit?
                fully_loaded = False
                continue

            cur_state_dict[key] = param.to(self._compute_device, copy=True)
            vram_bytes_loaded += param_size

        if vram_bytes_loaded > 0:
            # We load the entire state dict, not just the parameters that changed, in case there are modules that
            # override _load_from_state_dict() and do some funky stuff that requires the entire state dict.
            # Alternatively, in the future, grouping parameters by module could probably solve this problem.
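            # Note that assign=True makes load_state_dict() assign the tensors from cur_state_dict directly to the
            # module's parameters/buffers (preserving their new devices) instead of copying values into the existing
            # tensors in place.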
            self._model.load_state_dict(cur_state_dict, assign=True)

        if self._cur_vram_bytes is not None:
            self._cur_vram_bytes += vram_bytes_loaded

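        # If everything fit into VRAM, the custom layers are no longer needed and the original layer types are
        # restored. Otherwise, the custom autocast layers are (re)applied so that layers whose weights remain on the
        # CPU can still execute on the compute device.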
        if fully_loaded:
            remove_custom_layers_from_model(self._model)
            # TODO(ryand): Warn if self.cur_vram_bytes() and self.total_bytes() are out of sync.
        else:
            apply_custom_layers_to_model(self._model)

        # Move all non-persistent buffers to the compute device. These are a weird edge case and do not participate in
        # the vram_bytes_loaded tracking.
        self._move_non_persistent_buffers_to_device(self._compute_device)

        return vram_bytes_loaded

    @torch.no_grad()
    def partial_unload_from_vram(self, vram_bytes_to_free: int) -> int:
        """Unload weights from VRAM until vram_bytes_to_free bytes are freed, or until the entire model is unloaded.

        Returns:
            The number of bytes unloaded from VRAM.
        """
        vram_bytes_freed = 0

        offload_device = "cpu"
        cur_state_dict = self._model.state_dict()
        for key, param in cur_state_dict.items():
            if vram_bytes_freed >= vram_bytes_to_free:
                break

            if param.device.type == offload_device:
                continue

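            # Restore the original CPU tensor from the read-only copy rather than copying the tensor back from the
            # compute device.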
            cur_state_dict[key] = self._cpu_state_dict[key]
            vram_bytes_freed += calc_tensor_size(param)

        if vram_bytes_freed > 0:
            self._model.load_state_dict(cur_state_dict, assign=True)

        if self._cur_vram_bytes is not None:
            self._cur_vram_bytes -= vram_bytes_freed

        # We may have gone from a fully-loaded model to a partially-loaded model, so we need to reapply the custom
        # layers.
        apply_custom_layers_to_model(self._model)
        return vram_bytes_freed
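

# A minimal usage sketch (illustrative only, not part of the original module). It assumes a CUDA compute device and
# a toy torch.nn.Sequential model; the byte budget passed to partial_load_to_vram() is an arbitrary example value.
if __name__ == "__main__":
    if torch.cuda.is_available():
        toy_model = torch.nn.Sequential(torch.nn.Linear(1024, 1024), torch.nn.Linear(1024, 1024))
        cached = CachedModelWithPartialLoad(toy_model, compute_device=torch.device("cuda"))

        # Load roughly half of the weights into VRAM; the remainder stays on the CPU (handled by the custom autocast
        # layers, assuming torch.nn.Linear is a supported module type).
        loaded_bytes = cached.partial_load_to_vram(cached.total_bytes() // 2)
        print(f"Loaded {loaded_bytes / 2**20:.1f} of {cached.total_bytes() / 2**20:.1f} MB into VRAM.")

        # ... run inference with cached.model here ...

        # Unload everything again, restoring the original CPU weights.
        freed_bytes = cached.full_unload_from_vram()
        print(f"Freed {freed_bytes / 2**20:.1f} MB of VRAM.")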