Commit a0a0417

Refactored to address review comments
Signed-off-by: vtirumal <[email protected]>
1 parent 9e43a58

14 files changed, +254 -222 lines changed


QEfficient/diffusers/pipelines/modeling_utils.py renamed to QEfficient/diffusers/models/modeling_utils.py

Lines changed: 18 additions & 8 deletions
@@ -40,6 +40,7 @@ def apply_head_blocking(
     q: torch.FloatTensor,
     k: torch.FloatTensor,
     v: torch.FloatTensor,
+    head_block_size: int,
     attention_mask: Optional[torch.FloatTensor] = None,
 ) -> torch.FloatTensor:
     """
@@ -62,7 +63,6 @@ def apply_head_blocking(
     scale_factor = 1.0 / math.sqrt(DH)
 
     # Get head blocking configuration
-    _, head_block_size, _, _ = get_attention_blocking_config()
     head_block_size = head_block_size or NH
     num_head_blocks = math.ceil(NH / head_block_size)
 
@@ -107,6 +107,8 @@ def apply_kv_blocking(
     q: torch.FloatTensor,
     k: torch.FloatTensor,
     v: torch.FloatTensor,
+    head_block_size: int,
+    num_kv_blocks: int,
     attention_mask: Optional[torch.FloatTensor] = None,
 ) -> torch.FloatTensor:
     """
@@ -129,7 +131,6 @@ def apply_kv_blocking(
     scale_factor = 1.0 / math.sqrt(DH)
 
     # Get blocking configuration
-    _, head_block_size, num_kv_blocks, _ = get_attention_blocking_config()
     head_block_size = head_block_size or NH
     num_kv_blocks = num_kv_blocks or CL
     num_head_blocks = math.ceil(NH / head_block_size)
@@ -210,6 +211,8 @@ def apply_q_blocking(
     q: torch.FloatTensor,
     k: torch.FloatTensor,
     v: torch.FloatTensor,
+    head_block_size: int,
+    num_q_blocks: int,
     attention_mask: Optional[torch.FloatTensor] = None,
 ) -> torch.FloatTensor:
     """
@@ -232,7 +235,6 @@ def apply_q_blocking(
     scale_factor = 1.0 / math.sqrt(DH)
 
     # Get blocking configuration
-    _, head_block_size, _, num_q_blocks = get_attention_blocking_config()
     head_block_size = head_block_size or NH
     num_q_blocks = num_q_blocks or CL
     num_head_blocks = math.ceil(NH / head_block_size)
@@ -292,6 +294,9 @@ def apply_qkv_blocking(
     q: torch.FloatTensor,
     k: torch.FloatTensor,
     v: torch.FloatTensor,
+    head_block_size: int,
+    num_kv_blocks: int,
+    num_q_blocks: int,
     attention_mask: Optional[torch.FloatTensor] = None,
 ) -> torch.FloatTensor:
     """
@@ -313,7 +318,6 @@ def apply_qkv_blocking(
     scale_factor = 1.0 / math.sqrt(DH)
 
     # Get blocking configuration from environment variables
-    _, head_block_size, num_kv_blocks, num_q_blocks = get_attention_blocking_config()
     head_block_size = head_block_size or NH
     num_kv_blocks = num_kv_blocks or CL
     num_q_blocks = num_q_blocks or CL
@@ -420,6 +424,9 @@ def compute_blocked_attention(
     q: torch.FloatTensor,
     k: torch.FloatTensor,
     v: torch.FloatTensor,
+    head_block_size: int,
+    num_kv_blocks: int,
+    num_q_blocks: int,
     blocking_mode: str = "default",
     attention_mask: Optional[torch.FloatTensor] = None,
 ) -> torch.FloatTensor:
@@ -430,17 +437,20 @@ def compute_blocked_attention(
         q (torch.FloatTensor): Query tensor of shape (BS, NH, CL, DH)
         k (torch.FloatTensor): Key tensor of shape (BS, NH, CL, DH)
         v (torch.FloatTensor): Value tensor of shape (BS, NH, CL, DH)
+        head_block_size (int): Head blocking size
+        num_kv_blocks (int): Number of KV blocks
+        num_q_blocks (int): Number of Q blocks
         blocking_mode (str): Blocking strategy ('kv', 'q', 'qkv', 'default')
         attention_mask (Optional[torch.FloatTensor]): Attention mask tensor
 
     Returns:
         torch.FloatTensor: Attention output of shape (BS, NH, CL, DH)
     """
     if blocking_mode == "kv":
-        return apply_kv_blocking(q, k, v, attention_mask)
+        return apply_kv_blocking(q, k, v, head_block_size, num_kv_blocks, attention_mask)
     elif blocking_mode == "q":
-        return apply_q_blocking(q, k, v, attention_mask)
+        return apply_q_blocking(q, k, v, head_block_size, num_q_blocks, attention_mask)
     elif blocking_mode == "qkv":
-        return apply_qkv_blocking(q, k, v, attention_mask)
+        return apply_qkv_blocking(q, k, v, head_block_size, num_kv_blocks, num_q_blocks, attention_mask)
     else:  # default
-        return apply_head_blocking(q, k, v, attention_mask)
+        return apply_head_blocking(q, k, v, head_block_size, attention_mask)
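
After this refactor, callers pass the blocking parameters explicitly instead of each helper re-reading the global configuration. A minimal sketch of the resulting call pattern, assuming only the signatures shown in this diff; the tensor sizes below are arbitrary placeholders:

import torch

from QEfficient.diffusers.models.modeling_utils import (
    compute_blocked_attention,
    get_attention_blocking_config,
)

# Placeholder tensors in the documented (BS, NH, CL, DH) layout
q = torch.rand(1, 12, 5040, 128)
k = torch.rand(1, 12, 5040, 128)
v = torch.rand(1, 12, 5040, 128)

# The config helper returns (blocking_mode, head_block_size, num_kv_blocks, num_q_blocks),
# matching the unpacking at the transformer_wan.py call site below.
blocking_mode, head_block_size, num_kv_blocks, num_q_blocks = get_attention_blocking_config()

out = compute_blocked_attention(
    q,
    k,
    v,
    head_block_size,
    num_kv_blocks,
    num_q_blocks,
    blocking_mode=blocking_mode,
    attention_mask=None,
)
assert out.shape == q.shape  # (BS, NH, CL, DH), per the docstring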

QEfficient/diffusers/models/transformers/transformer_wan.py

Lines changed: 5 additions & 2 deletions
@@ -26,7 +26,7 @@
 )
 from diffusers.utils import set_weights_and_activate_adapters
 
-from QEfficient.diffusers.pipelines.modeling_utils import (
+from QEfficient.diffusers.models.modeling_utils import (
     compute_blocked_attention,
     get_attention_blocking_config,
 )
@@ -113,12 +113,15 @@ def apply_rotary_emb(
         key = apply_rotary_emb(key, *rotary_emb)
 
         # Get blocking configuration
-        blocking_mode, _, _, _ = get_attention_blocking_config()
+        blocking_mode, head_block_size, num_kv_blocks, num_q_blocks = get_attention_blocking_config()
         # Apply blocking using pipeline_utils
         hidden_states = compute_blocked_attention(
            query.transpose(1, 2),
            key.transpose(1, 2),
            value.transpose(1, 2),
+           head_block_size,
+           num_kv_blocks,
+           num_q_blocks,
            blocking_mode=blocking_mode,
            attention_mask=attention_mask,
         )
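
get_attention_blocking_config itself is not touched by this commit; the removed comment "Get blocking configuration from environment variables" only hints at how it works. A purely hypothetical sketch of such a helper follows; the environment-variable names are invented, and only the 4-tuple ordering (blocking_mode, head_block_size, num_kv_blocks, num_q_blocks) is taken from the call sites in this diff:

import os
from typing import Optional, Tuple


def get_attention_blocking_config() -> Tuple[str, Optional[int], Optional[int], Optional[int]]:
    """Hypothetical sketch: read attention blocking settings from environment variables."""

    def _read_int(name: str) -> Optional[int]:
        value = os.environ.get(name)
        return int(value) if value else None  # None lets the helpers fall back to NH / CL

    blocking_mode = os.environ.get("QEFF_ATTN_BLOCKING_MODE", "default")  # 'kv', 'q', 'qkv', 'default'
    head_block_size = _read_int("QEFF_ATTN_HEAD_BLOCK_SIZE")
    num_kv_blocks = _read_int("QEFF_ATTN_NUM_KV_BLOCKS")
    num_q_blocks = _read_int("QEFF_ATTN_NUM_Q_BLOCKS")
    return blocking_mode, head_block_size, num_kv_blocks, num_q_blocks
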
Lines changed: 35 additions & 45 deletions
@@ -1,46 +1,36 @@
 {
-  "description": "Default configuration for Wan unified transformer",
-  "modules": {
-    "transformer": {
-      "specializations": [
-        {
-          "batch_size": "1",
-          "num_channels": "16",
-          "num_frames": "21",
-          "latent_height": "24",
-          "latent_width": "40",
-          "steps": "1",
-          "sequence_length": "512",
-          "cl": "5040",
-          "model_type": 1
-        },
-        {
-          "batch_size": "1",
-          "num_channels": "16",
-          "num_frames": "21",
-          "latent_height": "24",
-          "latent_width": "40",
-          "steps": "1",
-          "sequence_length": "512",
-          "cl": "5040",
-          "model_type": 2
-        }
-      ],
-      "compilation":
-      {
-        "onnx_path": null,
-        "compile_dir": null,
-        "mdp_ts_num_devices": 16,
-        "mxfp6_matmul": true,
-        "convert_to_fp16": true,
-        "aic_num_cores": 16,
-        "mos": 1,
-        "mdts_mos": 1
-      },
-      "execute":
-      {
-        "device_ids": null
-      }
-    }
-  },
-}
+  "description": "Default configuration for Wan pipeline with unified transformer (model_type: 1 for high noise; model_type: 2 for low noise)",
+  "modules": {
+    "transformer": {
+      "specializations": [
+        {
+          "batch_size": "1",
+          "num_channels": "16",
+          "steps": "1",
+          "sequence_length": "512",
+          "model_type": 1
+        },
+        {
+          "batch_size": "1",
+          "num_channels": "16",
+          "steps": "1",
+          "sequence_length": "512",
+          "model_type": 2
+        }
+      ],
+      "compilation": {
+        "onnx_path": null,
+        "compile_dir": null,
+        "mdp_ts_num_devices": 16,
+        "mxfp6_matmul": true,
+        "convert_to_fp16": true,
+        "aic_num_cores": 16,
+        "mos": 1,
+        "mdts_mos": 1
+      },
+      "execute": {
+        "device_ids": null
+      }
+    }
+  }
+}
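
The per-specialization dimension fields (num_frames, latent_height, latent_width, cl) are dropped from the config, which is consistent with deriving them at runtime from the requested video size. As a worked check of the removed hard-coded value, assuming a 2x2 spatial patch size and treating the configured num_frames as a latent frame count (both assumptions, not stated in this diff):

# Removed specialization values
num_frames, latent_height, latent_width = 21, 24, 40

# Assumed 2x2 spatial patching (hypothetical)
patch_height = patch_width = 2

cl = num_frames * (latent_height // patch_height) * (latent_width // patch_width)
print(cl)  # 21 * 12 * 20 = 5040, matching the removed "cl": "5040"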

QEfficient/diffusers/pipelines/pipeline_module.py

Lines changed: 3 additions & 16 deletions
@@ -458,7 +458,7 @@ def export(
         """
 
         if use_onnx_subfunctions:
-            export_kwargs = {"export_modules_as_functions": {QEffFluxTransformerBlock, QEffFluxSingleTransformerBlock}}
+            export_kwargs = {"export_modules_as_functions": {QEffFluxTransformerBlock, QEffFluxSingleTransformerBlock}, "use_onnx_subfunctions": True}
 
         # Sort _use_default_values in config to ensure consistent hash generation during export
         self.model.config["_use_default_values"].sort()
@@ -591,7 +591,7 @@ def export(
         output_names: List[str],
         dynamic_axes: Dict,
         export_dir: str = None,
-        export_kwargs: Dict = None,
+        export_kwargs: Dict = {},
         use_onnx_subfunctions: bool = False,
     ) -> str:
         """Export the Wan transformer model to ONNX format.
@@ -607,14 +607,8 @@ def export(
         Returns:
             str: Path to the exported ONNX model
         """
-        if export_kwargs is None:
-            export_kwargs = {}
-
         if use_onnx_subfunctions:
-            export_kwargs = {"export_modules_as_functions": {WanTransformerBlock}}
-
-            # torch patch to export onnx with subfunction
-            apply_torch_patches()  # TODO: Moving to _export is better
+            export_kwargs = {"export_modules_as_functions": {WanTransformerBlock}, "use_onnx_subfunctions": True}
 
         return self._export(
             example_inputs=inputs,
@@ -634,10 +628,3 @@ def compile(self, specializations, **compiler_options) -> None:
             **compiler_options: Additional compiler options (e.g., num_cores, aic_num_of_activations)
         """
         self._compile(specializations=specializations, **compiler_options)
-
-    @property
-    def model_name(self) -> str:
-        mname = self.model.__class__.__name__
-        if mname.startswith("QEff") or mname.startswith("QEFF"):
-            mname = mname[4:]
-        return mname
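
Both export overrides now place "use_onnx_subfunctions": True into export_kwargs instead of the Wan branch calling apply_torch_patches() directly, in line with the removed TODO "Moving to _export is better". The shared _export helper is not part of this diff; a purely hypothetical sketch of how it might consume the flag is shown below (all names except apply_torch_patches and the "use_onnx_subfunctions" key are assumptions):

from typing import Dict, Optional


def apply_torch_patches() -> None:
    """Placeholder for the real torch patch helper referenced in the removed lines."""


def consume_export_kwargs(export_kwargs: Optional[Dict] = None) -> Dict:
    """Hypothetical: pop the subfunction flag in one shared place before ONNX export."""
    export_kwargs = dict(export_kwargs or {})
    if export_kwargs.pop("use_onnx_subfunctions", False):
        apply_torch_patches()  # torch patch to export ONNX with subfunctions
    return export_kwargs  # e.g. {"export_modules_as_functions": {WanTransformerBlock}}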

QEfficient/diffusers/pipelines/pipeline_utils.py

Lines changed: 10 additions & 5 deletions
@@ -39,7 +39,7 @@ def calculate_compressed_latent_dimension(height: int, width: int, vae_scale_fac
     return cl, latent_height, latent_width
 
 
-def calculate_latent_dimensions(
+def calculate_latent_dimensions_with_frames(
     height: int,
     width: int,
     num_frames: int,
@@ -49,22 +49,27 @@
     patch_width: int,
 ) -> int:
     """
-    Calculate the latent dimensions, Compressed latent dimension (cl) for transformer buffer allocation.
+    Calculate the latent dimensions for video generation models.
 
-    This method computes the compressed sequence length (cl) that the transformer
-    will process, based on the target video dimensions, VAE scale factors, and
-    patch sizes. This is crucial for proper buffer allocation in QAIC inference.
+    This method computes the compressed sequence length (cl), latent height,
+    latent width, and latent frames based on the target video dimensions,
+    VAE scale factors, and patch sizes.
 
     Args:
         height (int): Target video height in pixels
         width (int): Target video width in pixels
         num_frames (int): Target video frames in pixels
+        vae_scale_factor_spatial (int): Spatial VAE scale factor from the model config
+        vae_scale_factor_temporal (int): Temporal VAE scale factor from the model config
+        patch_height (int): Patch height from the model config
+        patch_width (int): Patch width from the model config
 
     Returns:
         tuple: (cl, latent_height, latent_width)
             - cl (int): Compressed latent dimension for transformer input
             - latent_height (int): Height in latent space
             - latent_width (int): Width in latent space
+            - latent_frames (int): Frames in latent space
 
     Mathematical Formula:
         latent_height = height // vae_scale_factor_spatial
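
The hunk ends inside the "Mathematical Formula" section, so only the latent_height line is visible. A sketch of the full computation follows; everything past that first line is an assumption chosen to reproduce the values removed from the config above (latent 24x40, 21 latent frames, cl = 5040), not the function's actual body:

def calculate_latent_dimensions_with_frames_sketch(
    height: int,
    width: int,
    num_frames: int,
    vae_scale_factor_spatial: int,
    vae_scale_factor_temporal: int,
    patch_height: int,
    patch_width: int,
):
    # Shown in the docstring above
    latent_height = height // vae_scale_factor_spatial
    # The remaining steps are assumptions, consistent with the removed config values
    latent_width = width // vae_scale_factor_spatial
    latent_frames = (num_frames - 1) // vae_scale_factor_temporal + 1
    cl = latent_frames * (latent_height // patch_height) * (latent_width // patch_width)
    return cl, latent_height, latent_width, latent_frames


# Example with assumed Wan-style factors: a 192x320 video with 81 frames,
# spatial factor 8, temporal factor 4, 2x2 patches
# -> latent 24x40, 21 latent frames, cl = 21 * 12 * 20 = 5040
print(calculate_latent_dimensions_with_frames_sketch(192, 320, 81, 8, 4, 2, 2))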
