Unverified commit c295e4a2, authored by AUTOMATIC1111 and committed by GitHub

Merge pull request #6055 from brkirch/sub-quad_attn_opt

Add Birch-san's sub-quadratic attention implementation
parents 1a5b86ad c18add68
+1 −0
@@ -141,6 +141,7 @@ Licenses for borrowed code can be found in `Settings -> Licenses` screen, and al
- Ideas for optimizations - https://github.com/basujindal/stable-diffusion
- Cross Attention layer optimization - Doggettx - https://github.com/Doggettx/stable-diffusion, original idea for prompt editing.
- Cross Attention layer optimization - InvokeAI, lstein - https://github.com/invoke-ai/InvokeAI (originally http://github.com/lstein/stable-diffusion)
- Sub-quadratic Cross Attention layer optimization - Alex Birch (https://github.com/Birch-san/diffusers/pull/1), Amin Rezaei (https://github.com/AminRezaei0x443/memory-efficient-attention)
- Textual Inversion - Rinon Gal - https://github.com/rinongal/textual_inversion (we're not using his code, but we are using his ideas).
- Idea for SD upscale - https://github.com/jquesnelle/txt2imghd
- Noise generation for outpainting mk2 - https://github.com/parlance-zz/g-diffuser-bot
+28 −1
@@ -184,7 +184,7 @@ SOFTWARE.
</pre>

<h2><a href="https://github.com/JingyunLiang/SwinIR/blob/main/LICENSE">SwinIR</a></h2>
<small>Code added by contirubtors, most likely copied from this repository.</small>
<small>Code added by contributors, most likely copied from this repository.</small>

<pre>
                                 Apache License
@@ -390,3 +390,30 @@ SOFTWARE.
   limitations under the License.
</pre>

<h2><a href="https://github.com/AminRezaei0x443/memory-efficient-attention/blob/main/LICENSE">Memory Efficient Attention</a></h2>
<small>The sub-quadratic cross attention optimization uses modified code from the Memory Efficient Attention package that Alex Birch optimized for 3D tensors. This license is updated to reflect that.</small>
<pre>
MIT License

Copyright (c) 2023 Alex Birch
Copyright (c) 2023 Amin Rezaei

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
</pre>
+9 −12
@@ -7,8 +7,6 @@ from modules.hypernetworks import hypernetwork
from modules.shared import cmd_opts
from modules import sd_hijack_clip, sd_hijack_open_clip, sd_hijack_unet, sd_hijack_xlmr, xlmr

from modules.sd_hijack_optimizations import invokeAI_mps_available

import ldm.modules.attention
import ldm.modules.diffusionmodules.model
import ldm.modules.diffusionmodules.openaimodel
@@ -43,17 +41,16 @@ def apply_optimizations():
        ldm.modules.attention.CrossAttention.forward = sd_hijack_optimizations.xformers_attention_forward
        ldm.modules.diffusionmodules.model.AttnBlock.forward = sd_hijack_optimizations.xformers_attnblock_forward
        optimization_method = 'xformers'
    elif cmd_opts.opt_sub_quad_attention:
        print("Applying sub-quadratic cross attention optimization.")
        ldm.modules.attention.CrossAttention.forward = sd_hijack_optimizations.sub_quad_attention_forward
        ldm.modules.diffusionmodules.model.AttnBlock.forward = sd_hijack_optimizations.sub_quad_attnblock_forward
        optimization_method = 'sub-quadratic'
    elif cmd_opts.opt_split_attention_v1:
        print("Applying v1 cross attention optimization.")
        ldm.modules.attention.CrossAttention.forward = sd_hijack_optimizations.split_cross_attention_forward_v1
        optimization_method = 'V1'
    elif not cmd_opts.disable_opt_split_attention and (cmd_opts.opt_split_attention_invokeai or not torch.cuda.is_available()):
        if not invokeAI_mps_available and shared.device.type == 'mps':
            print("The InvokeAI cross attention optimization for MPS requires the psutil package which is not installed.")
            print("Applying v1 cross attention optimization.")
            ldm.modules.attention.CrossAttention.forward = sd_hijack_optimizations.split_cross_attention_forward_v1
            optimization_method = 'V1'
        else:
    elif not cmd_opts.disable_opt_split_attention and (cmd_opts.opt_split_attention_invokeai or not cmd_opts.opt_split_attention and not torch.cuda.is_available()):
        print("Applying cross attention optimization (InvokeAI).")
        ldm.modules.attention.CrossAttention.forward = sd_hijack_optimizations.split_cross_attention_forward_invokeAI
        optimization_method = 'InvokeAI'
+100 −25
import math
import sys
import traceback
import importlib
import psutil

import torch
from torch import einsum
@@ -12,6 +12,8 @@ from einops import rearrange
from modules import shared
from modules.hypernetworks import hypernetwork

from .sub_quadratic_attention import efficient_dot_product_attention


if shared.cmd_opts.xformers or shared.cmd_opts.force_enable_xformers:
    try:
@@ -22,6 +24,19 @@ if shared.cmd_opts.xformers or shared.cmd_opts.force_enable_xformers:
        print(traceback.format_exc(), file=sys.stderr)


def get_available_vram():
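    # On CUDA, count both the memory the driver reports as free and the memory PyTorch
    # has reserved but is not actively using; otherwise report available system RAM.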
    if shared.device.type == 'cuda':
        stats = torch.cuda.memory_stats(shared.device)
        mem_active = stats['active_bytes.all.current']
        mem_reserved = stats['reserved_bytes.all.current']
        mem_free_cuda, _ = torch.cuda.mem_get_info(torch.cuda.current_device())
        mem_free_torch = mem_reserved - mem_active
        mem_free_total = mem_free_cuda + mem_free_torch
        return mem_free_total
    else:
        return psutil.virtual_memory().available


# see https://github.com/basujindal/stable-diffusion/pull/117 for discussion
def split_cross_attention_forward_v1(self, x, context=None, mask=None):
    h = self.heads
@@ -76,12 +91,7 @@ def split_cross_attention_forward(self, x, context=None, mask=None):

    r1 = torch.zeros(q.shape[0], q.shape[1], v.shape[2], device=q.device, dtype=q.dtype)

    stats = torch.cuda.memory_stats(q.device)
    mem_active = stats['active_bytes.all.current']
    mem_reserved = stats['reserved_bytes.all.current']
    mem_free_cuda, _ = torch.cuda.mem_get_info(torch.cuda.current_device())
    mem_free_torch = mem_reserved - mem_active
    mem_free_total = mem_free_cuda + mem_free_torch
    mem_free_total = get_available_vram()

    gb = 1024 ** 3
    tensor_size = q.shape[0] * q.shape[1] * k.shape[1] * q.element_size()
@@ -118,18 +128,7 @@ def split_cross_attention_forward(self, x, context=None, mask=None):
    return self.to_out(r2)


def check_for_psutil():
    try:
        spec = importlib.util.find_spec('psutil')
        return spec is not None
    except ModuleNotFoundError:
        return False

invokeAI_mps_available = check_for_psutil()

# -- Taken from https://github.com/invoke-ai/InvokeAI and modified --
if invokeAI_mps_available:
    import psutil
mem_total_gb = psutil.virtual_memory().total // (1 << 30)

def einsum_op_compvis(q, k, v):
@@ -215,6 +214,71 @@ def split_cross_attention_forward_invokeAI(self, x, context=None, mask=None):

# -- End of code from https://github.com/invoke-ai/InvokeAI --


# Based on Birch-san's modified implementation of sub-quadratic attention from https://github.com/Birch-san/diffusers/pull/1
# The sub_quad_attention_forward function is under the MIT License listed under Memory Efficient Attention in the Licenses section of the web UI interface
def sub_quad_attention_forward(self, x, context=None, mask=None):
    assert mask is None, "attention-mask not currently implemented for SubQuadraticCrossAttnProcessor."

    h = self.heads

    q = self.to_q(x)
    context = default(context, x)

    context_k, context_v = hypernetwork.apply_hypernetwork(shared.loaded_hypernetwork, context)
    k = self.to_k(context_k)
    v = self.to_v(context_v)
    del context, context_k, context_v, x

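    # Fold the head dimension into the batch dimension: (batch, tokens, heads*dim) -> (batch*heads, tokens, dim_per_head).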
    q = q.unflatten(-1, (h, -1)).transpose(1,2).flatten(end_dim=1)
    k = k.unflatten(-1, (h, -1)).transpose(1,2).flatten(end_dim=1)
    v = v.unflatten(-1, (h, -1)).transpose(1,2).flatten(end_dim=1)

    x = sub_quad_attention(q, k, v, q_chunk_size=shared.cmd_opts.sub_quad_q_chunk_size, kv_chunk_size=shared.cmd_opts.sub_quad_kv_chunk_size, chunk_threshold=shared.cmd_opts.sub_quad_chunk_threshold, use_checkpoint=self.training)

    x = x.unflatten(0, (-1, h)).transpose(1,2).flatten(start_dim=2)

    out_proj, dropout = self.to_out
    x = out_proj(x)
    x = dropout(x)

    return x

def sub_quad_attention(q, k, v, q_chunk_size=1024, kv_chunk_size=None, kv_chunk_size_min=None, chunk_threshold=None, use_checkpoint=True):
    bytes_per_token = torch.finfo(q.dtype).bits//8
    batch_x_heads, q_tokens, _ = q.shape
    _, k_tokens, _ = k.shape
    qk_matmul_size_bytes = batch_x_heads * bytes_per_token * q_tokens * k_tokens
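    # e.g. 16 (batch*heads) x 4096 query tokens x 4096 key tokens at 2 bytes/element (fp16) is 512 MiB for the full attention-score matrix.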

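    # chunk_threshold is a percentage of available VRAM: None picks a default (90% on MPS, 70% elsewhere),
    # 0 disables the memory limit entirely, and any other value is used as the percentage directly.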
    if chunk_threshold is None:
        chunk_threshold_bytes = int(get_available_vram() * 0.9) if q.device.type == 'mps' else int(get_available_vram() * 0.7)
    elif chunk_threshold == 0:
        chunk_threshold_bytes = None
    else:
        chunk_threshold_bytes = int(0.01 * chunk_threshold * get_available_vram())

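    # Derive a lower bound for the key/value chunk size from the memory budget unless one was given; 0 means "no lower bound".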
    if kv_chunk_size_min is None and chunk_threshold_bytes is not None:
        kv_chunk_size_min = chunk_threshold_bytes // (batch_x_heads * bytes_per_token * (k.shape[2] + v.shape[2]))
    elif kv_chunk_size_min == 0:
        kv_chunk_size_min = None

    if chunk_threshold_bytes is not None and qk_matmul_size_bytes <= chunk_threshold_bytes:
        # the big matmul fits into our memory limit; do everything in 1 chunk,
        # i.e. send it down the unchunked fast-path
        q_chunk_size = q_tokens
        kv_chunk_size = k_tokens

    return efficient_dot_product_attention(
        q,
        k,
        v,
        query_chunk_size=q_chunk_size,
        kv_chunk_size=kv_chunk_size,
        kv_chunk_size_min=kv_chunk_size_min,
        use_checkpoint=use_checkpoint,
    )


def xformers_attention_forward(self, x, context=None, mask=None):
    h = self.heads
    q_in = self.to_q(x)
@@ -252,12 +316,7 @@ def cross_attention_attnblock_forward(self, x):

        h_ = torch.zeros_like(k, device=q.device)

        stats = torch.cuda.memory_stats(q.device)
        mem_active = stats['active_bytes.all.current']
        mem_reserved = stats['reserved_bytes.all.current']
        mem_free_cuda, _ = torch.cuda.mem_get_info(torch.cuda.current_device())
        mem_free_torch = mem_reserved - mem_active
        mem_free_total = mem_free_cuda + mem_free_torch
        mem_free_total = get_available_vram()

        tensor_size = q.shape[0] * q.shape[1] * k.shape[2] * q.element_size()
        mem_required = tensor_size * 2.5
@@ -312,3 +371,19 @@ def xformers_attnblock_forward(self, x):
        return x + out
    except NotImplementedError:
        return cross_attention_attnblock_forward(self, x)

def sub_quad_attnblock_forward(self, x):
    h_ = x
    h_ = self.norm(h_)
    q = self.q(h_)
    k = self.k(h_)
    v = self.v(h_)
    b, c, h, w = q.shape
    q, k, v = map(lambda t: rearrange(t, 'b c h w -> b (h w) c'), (q, k, v))
    q = q.contiguous()
    k = k.contiguous()
    v = v.contiguous()
    out = sub_quad_attention(q, k, v, q_chunk_size=shared.cmd_opts.sub_quad_q_chunk_size, kv_chunk_size=shared.cmd_opts.sub_quad_kv_chunk_size, chunk_threshold=shared.cmd_opts.sub_quad_chunk_threshold, use_checkpoint=self.training)
    out = rearrange(out, 'b (h w) c -> b c h w', h=h)
    out = self.proj_out(out)
    return x + out
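
A quick way to see the tensor layout the new code path expects is to reproduce its reshapes in isolation; the sketch below is illustrative only (made-up shapes, plain PyTorch, not part of the patch):

import torch

batch, tokens, heads, dim_head = 2, 4096, 8, 40
x = torch.randn(batch, tokens, heads * dim_head)

# Fold the heads into the batch dimension, as sub_quad_attention_forward does before calling sub_quad_attention.
q = x.unflatten(-1, (heads, -1)).transpose(1, 2).flatten(end_dim=1)
assert q.shape == (batch * heads, tokens, dim_head)

# The inverse reshape after attention restores the original (batch, tokens, heads*dim) layout.
out = q.unflatten(0, (-1, heads)).transpose(1, 2).flatten(start_dim=2)
assert out.shape == x.shape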
+4 −0
@@ -56,6 +56,10 @@ parser.add_argument("--xformers", action='store_true', help="enable xformers for
parser.add_argument("--force-enable-xformers", action='store_true', help="enable xformers for cross attention layers regardless of whether the checking code thinks you can run it; do not make bug reports if this fails to work")
parser.add_argument("--deepdanbooru", action='store_true', help="does not do anything")
parser.add_argument("--opt-split-attention", action='store_true', help="force-enables Doggettx's cross-attention layer optimization. By default, it's on for torch cuda.")
parser.add_argument("--opt-sub-quad-attention", action='store_true', help="enable memory efficient sub-quadratic cross-attention layer optimization")
parser.add_argument("--sub-quad-q-chunk-size", type=int, help="query chunk size for the sub-quadratic cross-attention layer optimization to use", default=1024)
parser.add_argument("--sub-quad-kv-chunk-size", type=int, help="kv chunk size for the sub-quadratic cross-attention layer optimization to use", default=None)
parser.add_argument("--sub-quad-chunk-threshold", type=int, help="the percentage of VRAM threshold for the sub-quadratic cross-attention layer optimization to use chunking", default=None)
parser.add_argument("--opt-split-attention-invokeai", action='store_true', help="force-enables InvokeAI's cross-attention layer optimization. By default, it's on when cuda is unavailable.")
parser.add_argument("--opt-split-attention-v1", action='store_true', help="enable older version of split attention optimization that does not consume all the VRAM it can find")
parser.add_argument("--disable-opt-split-attention", action='store_true', help="force-disables cross-attention layer optimization")