From d31e226650ad01daefff66ec202992b8c3bf8384 Mon Sep 17 00:00:00 2001
From: comfyanonymous
Date: Wed, 28 Aug 2024 16:18:39 -0400
Subject: [PATCH] Unify RMSNorm code.

---
 comfy/ldm/common_dit.py                     | 13 +++++++++++
 comfy/ldm/flux/layers.py                    |  4 ++--
 comfy/ldm/modules/diffusionmodules/mmdit.py | 24 ++-------------------
 3 files changed, 17 insertions(+), 24 deletions(-)

diff --git a/comfy/ldm/common_dit.py b/comfy/ldm/common_dit.py
index 9900255..9016abc 100644
--- a/comfy/ldm/common_dit.py
+++ b/comfy/ldm/common_dit.py
@@ -1,4 +1,5 @@
 import torch
+import comfy.ops
 
 def pad_to_patch_size(img, patch_size=(2, 2), padding_mode="circular"):
     if padding_mode == "circular" and torch.jit.is_tracing() or torch.jit.is_scripting():
@@ -6,3 +7,15 @@ def pad_to_patch_size(img, patch_size=(2, 2), padding_mode="circular"):
     pad_h = (patch_size[0] - img.shape[-2] % patch_size[0]) % patch_size[0]
     pad_w = (patch_size[1] - img.shape[-1] % patch_size[1]) % patch_size[1]
     return torch.nn.functional.pad(img, (0, pad_w, 0, pad_h), mode=padding_mode)
+
+try:
+    rms_norm_torch = torch.nn.functional.rms_norm
+except:
+    rms_norm_torch = None
+
+def rms_norm(x, weight, eps=1e-6):
+    if rms_norm_torch is not None:
+        return rms_norm_torch(x, weight.shape, weight=comfy.ops.cast_to(weight, dtype=x.dtype, device=x.device), eps=eps)
+    else:
+        rrms = torch.rsqrt(torch.mean(x**2, dim=-1, keepdim=True) + eps)
+        return (x * rrms) * comfy.ops.cast_to(weight, dtype=x.dtype, device=x.device)
diff --git a/comfy/ldm/flux/layers.py b/comfy/ldm/flux/layers.py
index 20bd285..dabab3e 100644
--- a/comfy/ldm/flux/layers.py
+++ b/comfy/ldm/flux/layers.py
@@ -6,6 +6,7 @@ from torch import Tensor, nn
 
 from .math import attention, rope
 import comfy.ops
+import comfy.ldm.common_dit
 
 
 class EmbedND(nn.Module):
@@ -63,8 +64,7 @@ class RMSNorm(torch.nn.Module):
         self.scale = nn.Parameter(torch.empty((dim), dtype=dtype, device=device))
 
     def forward(self, x: Tensor):
-        rrms = torch.rsqrt(torch.mean(x**2, dim=-1, keepdim=True) + 1e-6)
-        return (x * rrms) * comfy.ops.cast_to(self.scale, dtype=x.dtype, device=x.device)
+        return comfy.ldm.common_dit.rms_norm(x, self.scale, 1e-6)
 
 
 class QKNorm(torch.nn.Module):
diff --git a/comfy/ldm/modules/diffusionmodules/mmdit.py b/comfy/ldm/modules/diffusionmodules/mmdit.py
index 491a58a..759788a 100644
--- a/comfy/ldm/modules/diffusionmodules/mmdit.py
+++ b/comfy/ldm/modules/diffusionmodules/mmdit.py
@@ -355,29 +355,9 @@ class RMSNorm(torch.nn.Module):
         else:
             self.register_parameter("weight", None)
 
-    def _norm(self, x):
-        """
-        Apply the RMSNorm normalization to the input tensor.
-        Args:
-            x (torch.Tensor): The input tensor.
-        Returns:
-            torch.Tensor: The normalized tensor.
-        """
-        return x * torch.rsqrt(x.pow(2).mean(-1, keepdim=True) + self.eps)
-
     def forward(self, x):
-        """
-        Forward pass through the RMSNorm layer.
-        Args:
-            x (torch.Tensor): The input tensor.
-        Returns:
-            torch.Tensor: The output tensor after applying RMSNorm.
-        """
-        x = self._norm(x)
-        if self.learnable_scale:
-            return x * self.weight.to(device=x.device, dtype=x.dtype)
-        else:
-            return x
+        return comfy.ldm.common_dit.rms_norm(x, self.weight, self.eps)
+
 
 
 class SwiGLUFeedForward(nn.Module):
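
Note on the change (not part of the commit): the patch routes the duplicated
RMSNorm implementations through one shared helper that prefers PyTorch's fused
torch.nn.functional.rms_norm when the installed torch provides it (added in
PyTorch 2.4) and otherwise falls back to the manual rsqrt-of-mean-of-squares
formulation. Below is a minimal, self-contained sketch for sanity-checking
that the two paths agree; rms_norm_manual is a hypothetical name that mirrors
the fallback math from comfy/ldm/common_dit.py, minus the comfy.ops.cast_to
dtype/device handling.

import torch

def rms_norm_manual(x, weight, eps=1e-6):
    # Fallback math: x / sqrt(mean(x^2 over the last dim) + eps), scaled by weight.
    rrms = torch.rsqrt(torch.mean(x**2, dim=-1, keepdim=True) + eps)
    return (x * rrms) * weight

if __name__ == "__main__":
    torch.manual_seed(0)
    x = torch.randn(2, 4, 8)
    w = torch.randn(8)
    # getattr guard plays the role of the patch's try/except:
    # rms_norm is simply absent on torch < 2.4.
    fused = getattr(torch.nn.functional, "rms_norm", None)
    if fused is not None:
        # The two paths should agree to floating-point tolerance.
        torch.testing.assert_close(rms_norm_manual(x, w),
                                   fused(x, w.shape, weight=w, eps=1e-6))
        print("fallback matches torch.nn.functional.rms_norm")
    else:
        print("fused rms_norm unavailable; only the manual fallback applies")

Resolving the attribute once at import time (rather than per call) keeps the
per-forward overhead at a single None check, which is why the patch binds
rms_norm_torch at module scope.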