Commit 37acba26 authored by Pam

Add an argument to disable memory-efficient attention for SDP

parent fec0a895
+8 −3
@@ -43,6 +43,11 @@ def apply_optimizations():
        ldm.modules.diffusionmodules.model.AttnBlock.forward = sd_hijack_optimizations.xformers_attnblock_forward
        optimization_method = 'xformers'
    elif cmd_opts.opt_sdp_attention and (hasattr(torch.nn.functional, "scaled_dot_product_attention") and callable(getattr(torch.nn.functional, "scaled_dot_product_attention"))):
        if cmd_opts.opt_sdp_no_mem_attention:
            print("Applying scaled dot product cross attention optimization (without memory efficient attention).")
            ldm.modules.attention.CrossAttention.forward = sd_hijack_optimizations.scaled_dot_product_no_mem_attention_forward
            optimization_method = 'sdp-no-mem'
        else:
            print("Applying scaled dot product cross attention optimization.")
            ldm.modules.attention.CrossAttention.forward = sd_hijack_optimizations.scaled_dot_product_attention_forward
            optimization_method = 'sdp'
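
For context, a minimal sketch of the capability check the new branch relies on, assuming only that PyTorch is installed; the tensor shapes in the demo call are illustrative and not taken from the diff:

import torch
import torch.nn.functional as F

def sdp_available() -> bool:
    # Same test as the elif above: scaled_dot_product_attention must exist
    # and be callable, which holds from PyTorch 2.0 onward.
    fn = getattr(torch.nn.functional, "scaled_dot_product_attention", None)
    return callable(fn)

if sdp_available():
    # Assumed shapes: (batch, heads, tokens, head_dim).
    q = k = v = torch.randn(1, 8, 16, 64)
    out = F.scaled_dot_product_attention(q, k, v)
    print(out.shape)  # torch.Size([1, 8, 16, 64])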
+4 −0
@@ -388,6 +388,10 @@ def scaled_dot_product_attention_forward(self, x, context=None, mask=None):
    hidden_states = self.to_out[1](hidden_states)
    return hidden_states

def scaled_dot_product_no_mem_attention_forward(self, x, context=None, mask=None):
    with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=True, enable_mem_efficient=False):
        return scaled_dot_product_attention_forward(self, x, context, mask)

def cross_attention_attnblock_forward(self, x):
        h_ = x
        h_ = self.norm(h_)
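
To show what the new wrapper changes, here is a hedged standalone sketch of the same torch.backends.cuda.sdp_kernel context manager with the memory-efficient backend disabled; it assumes PyTorch 2.0 and a CUDA device, and the shapes and dtype are illustrative. With enable_mem_efficient=False, PyTorch can only dispatch to the flash or math kernels, which is what the new flag's help text credits for deterministic image generation:

import torch
import torch.nn.functional as F

# Assumed illustrative inputs; half precision is typical for these models.
q = k = v = torch.randn(1, 8, 16, 64, device="cuda", dtype=torch.float16)

# Same context manager the wrapper uses: forbid the memory-efficient kernel
# so SDP falls back to flash attention or the plain math implementation.
with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=True,
                                    enable_mem_efficient=False):
    out = F.scaled_dot_product_attention(q, k, v)

print(out.shape)  # torch.Size([1, 8, 16, 64])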
+1 −0
@@ -70,6 +70,7 @@ parser.add_argument("--sub-quad-chunk-threshold", type=int, help="the percentage
parser.add_argument("--opt-split-attention-invokeai", action='store_true', help="force-enables InvokeAI's cross-attention layer optimization. By default, it's on when cuda is unavailable.")
parser.add_argument("--opt-split-attention-v1", action='store_true', help="enable older version of split attention optimization that does not consume all the VRAM it can find")
parser.add_argument("--opt-sdp-attention", action='store_true', help="enable scaled dot product cross-attention layer optimization; requires PyTorch 2.*")
parser.add_argument("--opt-sdp-no-mem-attention", action='store_true', help="disables memory efficient sdp, makes image generation deterministic; requires --opt-sdp-attention")
parser.add_argument("--disable-opt-split-attention", action='store_true', help="force-disables cross-attention layer optimization")
parser.add_argument("--disable-nan-check", action='store_true', help="do not check if produced images/latent spaces have nans; useful for running without a checkpoint in CI")
parser.add_argument("--use-cpu", nargs='+', help="use CPU as torch device for specified modules", default=[], type=str.lower)