Unverified commit 1574e967, authored by AUTOMATIC1111 and committed by GitHub

Merge pull request #6510 from brkirch/unet16-upcast-precision

Add upcast options: full-precision sampling from a float16 UNet and upcast attention, enabling inference with SD 2.1 models without --no-half
Parents: 1982ef68, e3b53fd2
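At its core, the change keeps the UNet weights in float16 for memory savings but performs the numerically sensitive attention math in float32, where the overflows that produce all-NaN tensors with SD 2.1 cannot occur as easily. A minimal illustrative sketch of that pattern (simplified standalone code, not the webui implementation itself):

```python
import torch

def attention_upcast(q, k, v, scale):
    # q, k, v arrive in the model's storage dtype (e.g. torch.float16)
    dtype = q.dtype
    # run the matmuls and the softmax in float32
    q, k, v = q.float(), k.float(), v.float()
    sim = torch.einsum('b i d, b j d -> b i j', q, k) * scale
    attn = sim.softmax(dim=-1)
    out = torch.einsum('b i j, b j d -> b i d', attn, v)
    # cast back so the rest of the float16 network is unaffected
    return out.to(dtype)
```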
README.md (+1 −0)
@@ -157,4 +157,5 @@ Licenses for borrowed code can be found in `Settings -> Licenses` screen, and al
 - DeepDanbooru - interrogator for anime diffusers https://github.com/KichangKim/DeepDanbooru
 - Security advice - RyotaK
 - Initial Gradio script - posted on 4chan by an Anonymous user. Thank you Anonymous user.
+- Sampling in float32 precision from a float16 UNet - marunine for the idea, Birch-san for the example Diffusers implementation (https://github.com/Birch-san/diffusers-play/tree/92feee6)
 - (You)
modules/deepbooru_model.py (+3 −1)
@@ -2,6 +2,8 @@ import torch
 import torch.nn as nn
 import torch.nn.functional as F

+from modules import devices
+
 # see https://github.com/AUTOMATIC1111/TorchDeepDanbooru for more


@@ -196,7 +198,7 @@ class DeepDanbooruModel(nn.Module):
         t_358, = inputs
         t_359 = t_358.permute(*[0, 3, 1, 2])
         t_359_padded = F.pad(t_359, [2, 3, 2, 3], value=0)
-        t_360 = self.n_Conv_0(t_359_padded)
+        t_360 = self.n_Conv_0(t_359_padded.to(self.n_Conv_0.bias.dtype) if devices.unet_needs_upcast else t_359_padded)
         t_361 = F.relu(t_360)
         t_361 = F.pad(t_361, [0, 1, 0, 1], value=float('-inf'))
         t_362 = self.n_MaxPool_0(t_361)
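The deepbooru change above matches the input's dtype to the first convolution's weights, since PyTorch layers reject mixed-precision inputs. A small hypothetical reproduction of the failure mode (standalone sketch, not repo code; float16 convolutions may require a recent PyTorch or a CUDA device):

```python
import torch
import torch.nn as nn

conv = nn.Conv2d(3, 8, kernel_size=3).half()  # weights stored in float16
x = torch.randn(1, 3, 64, 64)                 # input arrives in float32

# conv(x) would raise a dtype-mismatch RuntimeError; casting the input to
# the layer's own dtype first (via the bias, as the diff does) avoids it
y = conv(x.to(conv.bias.dtype))
```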
modules/devices.py (+7 −1)
@@ -79,6 +79,8 @@ cpu = torch.device("cpu")
 device = device_interrogate = device_gfpgan = device_esrgan = device_codeformer = None
 dtype = torch.float16
 dtype_vae = torch.float16
+dtype_unet = torch.float16
+unet_needs_upcast = False


 def randn(seed, shape):
@@ -106,6 +108,10 @@ def autocast(disable=False):
     return torch.autocast("cuda")


+def without_autocast(disable=False):
+    return torch.autocast("cuda", enabled=False) if torch.is_autocast_enabled() and not disable else contextlib.nullcontext()
+
+
 class NansException(Exception):
     pass

@@ -123,7 +129,7 @@ def test_for_nans(x, where):
         message = "A tensor with all NaNs was produced in Unet."

         if not shared.cmd_opts.no_half:
-            message += " This could be either because there's not enough precision to represent the picture, or because your video card does not support half type. Try using --no-half commandline argument to fix this."
+            message += " This could be either because there's not enough precision to represent the picture, or because your video card does not support half type. Try setting the \"Upcast cross attention layer to float32\" option in Settings > Stable Diffusion or using the --no-half commandline argument to fix this."

     elif where == "vae":
         message = "A tensor with all NaNs was produced in VAE."
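The new devices.without_autocast() helper only suspends autocast when one is actually active; otherwise it returns a no-op context, so it is safe to wrap code paths that may or may not run under autocast. A sketch of its branching, mirroring the definition above:

```python
import contextlib
import torch

def without_autocast(disable=False):  # mirrors the helper added above
    if torch.is_autocast_enabled() and not disable:
        return torch.autocast("cuda", enabled=False)
    return contextlib.nullcontext()

print(type(without_autocast()).__name__)      # nullcontext: no autocast running
with torch.autocast("cuda"):
    print(type(without_autocast()).__name__)  # autocast, with enabled=False
```

Note the disable parameter: callers such as the attention functions below pass disable=not shared.opts.upcast_attn, so full precision is only forced when the user has opted into upcasting.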
modules/processing.py (+8 −7)
@@ -172,7 +172,8 @@ class StableDiffusionProcessing:
         midas_in = torch.from_numpy(transformed["midas_in"][None, ...]).to(device=shared.device)
         midas_in = repeat(midas_in, "1 ... -> n ...", n=self.batch_size)

-        conditioning_image = self.sd_model.get_first_stage_encoding(self.sd_model.encode_first_stage(source_image))
+        conditioning_image = self.sd_model.get_first_stage_encoding(self.sd_model.encode_first_stage(source_image.to(devices.dtype_unet) if devices.unet_needs_upcast else source_image))
+        conditioning_image = conditioning_image.float() if devices.unet_needs_upcast else conditioning_image
         conditioning = torch.nn.functional.interpolate(
             self.sd_model.depth_model(midas_in),
             size=conditioning_image.shape[2:],
@@ -203,7 +204,7 @@ class StableDiffusionProcessing:

         # Create another latent image, this time with a masked version of the original input.
         # Smoothly interpolate between the masked and unmasked latent conditioning image using a parameter.
-        conditioning_mask = conditioning_mask.to(source_image.device).to(source_image.dtype)
+        conditioning_mask = conditioning_mask.to(device=source_image.device, dtype=source_image.dtype)
         conditioning_image = torch.lerp(
             source_image,
             source_image * (1.0 - conditioning_mask),
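For reference, torch.lerp(a, b, w) computes a + w * (b - a), so the call above blends the original conditioning image toward its masked version as inpainting_mask_weight moves from 0 to 1. A quick standalone check:

```python
import torch

source = torch.rand(1, 3, 8, 8)
mask = (torch.rand(1, 1, 8, 8) > 0.5).float()
weight = 1.0  # a weight of 1 keeps only the masked image

blended = torch.lerp(source, source * (1.0 - mask), weight)
assert torch.allclose(blended, source * (1.0 - mask))
```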
@@ -211,7 +212,7 @@
         )

         # Encode the new masked image using first stage of network.
-        conditioning_image = self.sd_model.get_first_stage_encoding(self.sd_model.encode_first_stage(conditioning_image))
+        conditioning_image = self.sd_model.get_first_stage_encoding(self.sd_model.encode_first_stage(conditioning_image.to(devices.dtype_unet) if devices.unet_needs_upcast else conditioning_image))

         # Create the concatenated conditioning tensor to be fed to `c_concat`
         conditioning_mask = torch.nn.functional.interpolate(conditioning_mask, size=latent_image.shape[-2:])
@@ -225,10 +226,10 @@
         # HACK: Using introspection as the Depth2Image model doesn't appear to uniquely
         # identify itself with a field common to all models. The conditioning_key is also hybrid.
         if isinstance(self.sd_model, LatentDepth2ImageDiffusion):
-            return self.depth2img_image_conditioning(source_image)
+            return self.depth2img_image_conditioning(source_image.float() if devices.unet_needs_upcast else source_image)

         if self.sampler.conditioning_key in {'hybrid', 'concat'}:
-            return self.inpainting_image_conditioning(source_image, latent_image, image_mask=image_mask)
+            return self.inpainting_image_conditioning(source_image.float() if devices.unet_needs_upcast else source_image, latent_image, image_mask=image_mask)

         # Dummy zero conditioning if we're not using inpainting or depth model.
         return latent_image.new_zeros(latent_image.shape[0], 5, 1, 1)
@@ -614,7 +615,7 @@ def process_images_inner(p: StableDiffusionProcessing) -> Processed:
             if p.n_iter > 1:
                 shared.state.job = f"Batch {n+1} out of {p.n_iter}"

-            with devices.autocast():
+            with devices.without_autocast() if devices.unet_needs_upcast else devices.autocast():
                 samples_ddim = p.sample(conditioning=c, unconditional_conditioning=uc, seeds=seeds, subseeds=subseeds, subseed_strength=p.subseed_strength, prompts=prompts)

             x_samples_ddim = [decode_first_stage(p.sd_model, samples_ddim[i:i+1].to(dtype=devices.dtype_vae))[0].cpu() for i in range(samples_ddim.size(0))]
@@ -992,7 +993,7 @@ class StableDiffusionProcessingImg2Img(StableDiffusionProcessing):

         image = torch.from_numpy(batch_images)
         image = 2. * image - 1.
-        image = image.to(shared.device)
+        image = image.to(device=shared.device, dtype=devices.dtype_unet if devices.unet_needs_upcast else None)

         self.init_latent = self.sd_model.get_first_stage_encoding(self.sd_model.encode_first_stage(image))

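One detail worth noting in the img2img hunk: Tensor.to() accepts dtype=None, which leaves the dtype unchanged, so a single call covers both the upcast and the default path. Illustrative check (dtype_unet assumed here to mirror devices.dtype_unet, i.e. torch.float16):

```python
import torch

image = torch.rand(1, 3, 64, 64)  # float32
dtype_unet = torch.float16        # assumption: mirrors devices.dtype_unet
unet_needs_upcast = True

moved = image.to(device="cpu", dtype=dtype_unet if unet_needs_upcast else None)
print(moved.dtype)  # torch.float16 when upcasting, torch.float32 otherwise
```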
modules/sd_hijack_optimizations.py (+99 −60)
@@ -9,7 +9,7 @@ from torch import einsum
 from ldm.util import default
 from einops import rearrange

-from modules import shared, errors
+from modules import shared, errors, devices
 from modules.hypernetworks import hypernetwork

 from .sub_quadratic_attention import efficient_dot_product_attention
@@ -52,7 +52,12 @@ def split_cross_attention_forward_v1(self, x, context=None, mask=None):
     q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> (b h) n d', h=h), (q_in, k_in, v_in))
     del q_in, k_in, v_in

-    r1 = torch.zeros(q.shape[0], q.shape[1], v.shape[2], device=q.device)
+    dtype = q.dtype
+    if shared.opts.upcast_attn:
+        q, k, v = q.float(), k.float(), v.float()
+
+    with devices.without_autocast(disable=not shared.opts.upcast_attn):
+        r1 = torch.zeros(q.shape[0], q.shape[1], v.shape[2], device=q.device, dtype=q.dtype)
         for i in range(0, q.shape[0], 2):
             end = i + 2
             s1 = einsum('b i d, b j d -> b i j', q[i:end], k[i:end])
@@ -65,6 +70,8 @@ def split_cross_attention_forward_v1(self, x, context=None, mask=None):
             del s2
         del q, k, v

+    r1 = r1.to(dtype)
+
     r2 = rearrange(r1, '(b h) n d -> b n (h d)', h=h)
     del r1

@@ -82,7 +89,12 @@ def split_cross_attention_forward(self, x, context=None, mask=None):
     k_in = self.to_k(context_k)
     v_in = self.to_v(context_v)

-    k_in *= self.scale
+    dtype = q_in.dtype
+    if shared.opts.upcast_attn:
+        q_in, k_in, v_in = q_in.float(), k_in.float(), v_in if v_in.device.type == 'mps' else v_in.float()
+
+    with devices.without_autocast(disable=not shared.opts.upcast_attn):
+        k_in = k_in * self.scale

         del context, x

@@ -122,6 +134,8 @@ def split_cross_attention_forward(self, x, context=None, mask=None):

         del q, k, v

+    r1 = r1.to(dtype)
+
     r2 = rearrange(r1, '(b h) n d -> b n (h d)', h=h)
     del r1

@@ -204,12 +218,20 @@ def split_cross_attention_forward_invokeAI(self, x, context=None, mask=None):
     context = default(context, x)

     context_k, context_v = hypernetwork.apply_hypernetworks(shared.loaded_hypernetworks, context)
-    k = self.to_k(context_k) * self.scale
+    k = self.to_k(context_k)
     v = self.to_v(context_v)
     del context, context_k, context_v, x

+    dtype = q.dtype
+    if shared.opts.upcast_attn:
+        q, k, v = q.float(), k.float(), v if v.device.type == 'mps' else v.float()
+
+    with devices.without_autocast(disable=not shared.opts.upcast_attn):
+        k = k * self.scale
+
         q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> (b h) n d', h=h), (q, k, v))
         r = einsum_op(q, k, v)
+    r = r.to(dtype)
     return self.to_out(rearrange(r, '(b h) n d -> b n (h d)', h=h))

 # -- End of code from https://github.com/invoke-ai/InvokeAI --
@@ -234,8 +256,14 @@ def sub_quad_attention_forward(self, x, context=None, mask=None):
     k = k.unflatten(-1, (h, -1)).transpose(1,2).flatten(end_dim=1)
     v = v.unflatten(-1, (h, -1)).transpose(1,2).flatten(end_dim=1)

+    dtype = q.dtype
+    if shared.opts.upcast_attn:
+        q, k = q.float(), k.float()
+
     x = sub_quad_attention(q, k, v, q_chunk_size=shared.cmd_opts.sub_quad_q_chunk_size, kv_chunk_size=shared.cmd_opts.sub_quad_kv_chunk_size, chunk_threshold=shared.cmd_opts.sub_quad_chunk_threshold, use_checkpoint=self.training)

+    x = x.to(dtype)
+
     x = x.unflatten(0, (-1, h)).transpose(1,2).flatten(start_dim=2)

     out_proj, dropout = self.to_out
@@ -268,6 +296,7 @@ def sub_quad_attention(q, k, v, q_chunk_size=1024, kv_chunk_size=None, kv_chunk_
         query_chunk_size = q_tokens
         kv_chunk_size = k_tokens

+    with devices.without_autocast(disable=q.dtype == v.dtype):
         return efficient_dot_product_attention(
             q,
             k,
@@ -306,8 +335,14 @@ def xformers_attention_forward(self, x, context=None, mask=None):
     q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> b n h d', h=h), (q_in, k_in, v_in))
     del q_in, k_in, v_in

+    dtype = q.dtype
+    if shared.opts.upcast_attn:
+        q, k = q.float(), k.float()
+
     out = xformers.ops.memory_efficient_attention(q, k, v, attn_bias=None, op=get_xformers_flash_attention_op(q, k, v))

+    out = out.to(dtype)
+
     out = rearrange(out, 'b n h d -> b n (h d)', h=h)
     return self.to_out(out)

@@ -378,10 +413,14 @@ def xformers_attnblock_forward(self, x):
         v = self.v(h_)
         b, c, h, w = q.shape
         q, k, v = map(lambda t: rearrange(t, 'b c h w -> b (h w) c'), (q, k, v))
+        dtype = q.dtype
+        if shared.opts.upcast_attn:
+            q, k = q.float(), k.float()
         q = q.contiguous()
         k = k.contiguous()
         v = v.contiguous()
         out = xformers.ops.memory_efficient_attention(q, k, v, op=get_xformers_flash_attention_op(q, k, v))
+        out = out.to(dtype)
         out = rearrange(out, 'b (h w) c -> b c h w', h=h)
         out = self.proj_out(out)
         return x + out
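All of the attention backends above share the same shape: remember the incoming dtype, optionally upcast the operands, compute with autocast suspended, then cast back. The explicit without_autocast() wrapper is what makes the manual upcasts stick; inside an autocast region, eligible ops are routed to float16 kernels even when their inputs are float32. A standalone demonstration of that pitfall (assumes a CUDA device):

```python
import torch

a = torch.randn(8, 8, device="cuda", dtype=torch.float16)

with torch.autocast("cuda"):
    # autocast re-casts the operands, so a manual .float() alone is undone
    still_half = a.float() @ a.float().t()
    # what devices.without_autocast() expands to when autocast is active
    with torch.autocast("cuda", enabled=False):
        really_full = a.float() @ a.float().t()

assert still_half.dtype == torch.float16
assert really_full.dtype == torch.float32
```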