diff --git a/.vscode/launch.json b/.vscode/launch.json
index c9238e502b..82f5390f5f 100644
--- a/.vscode/launch.json
+++ b/.vscode/launch.json
@@ -11,9 +11,18 @@
       "program": "generateNetwork.py",
       "console": "integratedTerminal",
       "cwd": "${workspaceFolder}/DeeployTest",
+      "env": {
+        "PYTHONPATH": "${workspaceFolder}"
+      },
+      "subProcess": true,
       "justMyCode": false,
-      "args":
-          "-p${input:platformUntiled} -t${input:model} ${input:additionalArgsUntiled}"
+      "args": [
+        "-p",
+        "${input:platformUntiled}",
+        "-t",
+        "${input:model}",
+        "${input:additionalArgsUntiled}"
+      ]
     },
     {
       "name": "Deeploy Generate Tiled",
@@ -22,9 +31,18 @@
       "program": "testMVP.py",
       "console": "integratedTerminal",
       "cwd": "${workspaceFolder}/DeeployTest",
+      "env": {
+        "PYTHONPATH": "${workspaceFolder}"
+      },
+      "subProcess": true,
       "justMyCode": false,
-      "args":
-          "-p${input:platformTiled} -t${input:model} ${input:additionalArgsTiled}"
+      "args": [
+        "-p",
+        "${input:platformTiled}",
+        "-t",
+        "${input:model}",
+        "${input:additionalArgsTiled}"
+      ]
     }
   ],
   "inputs": [
@@ -85,7 +103,7 @@
       "id": "additionalArgsTiled",
       "type": "promptString",
       "description": "Additional Arguments",
-      "default": "-v --doublebuffer"
+      "default": "--doublebuffer"
     }
   ]
-}
\ No newline at end of file
+}
diff --git a/Deeploy/CommonExtensions/CodeTransformationPasses/MemoryAllocation.py b/Deeploy/CommonExtensions/CodeTransformationPasses/MemoryAllocation.py
index 609a179c7b..e3ba2f8e1c 100644
--- a/Deeploy/CommonExtensions/CodeTransformationPasses/MemoryAllocation.py
+++ b/Deeploy/CommonExtensions/CodeTransformationPasses/MemoryAllocation.py
@@ -276,6 +276,7 @@ def apply(self,
 
         for buffer in inputs + transients:
             assert buffer._live == True, f"Tried to deallocate already dead buffer {buffer.name}"
+
             buffer._live = False
             # Don't deallocate if it's an alias of a live buffer
             if not buffer.has_live_aliases(ctxt):
@@ -363,7 +364,6 @@ def apply(self,
 
         for buffer in inputs + transients:
             assert buffer._live == True, f"Tried to deallocate already dead buffer {buffer.name}"
-
             memoryLevel = "None" if not hasattr(buffer, "_memoryLevel") else buffer._memoryLevel
             if memoryLevel not in ctxt._dynamicSize:
                 ctxt._dynamicSize[memoryLevel] = 0
diff --git a/Deeploy/DeeployTypes.py b/Deeploy/DeeployTypes.py
index 44abe85112..a96d5fc046 100644
--- a/Deeploy/DeeployTypes.py
+++ b/Deeploy/DeeployTypes.py
@@ -63,6 +63,37 @@ class CodeGenVerbosity:
 _backendPostParsingFilename = 'backend_post_parsing'
 _backendPostBindingFilename = 'backend_post_binding'
 
+
+def _deeployTypeToNpType(ty: Type[BaseType]):
+
+    def _broadcastInteger(ty: Type[IntegerImmediate]):
+        if ty.signed:
+            return np.dtype(getattr(np, "int" + str(ty.typeWidth)))
+        else:
+            return np.dtype(getattr(np, "uint" + str(ty.typeWidth)))
+
+    def _broadcastFloat(ty: Type[FloatImmediate]):
+        if ty.typeWidth == 16:
+            return np.dtype(np.float16)
+        if ty.typeWidth == 32:
+            return np.dtype(np.float32)
+        if ty.typeWidth == 64:
+            return np.dtype(np.float64)
+        return np.dtype(np.float32)
+
+    if issubclass(ty, Pointer) and hasattr(ty, "referencedType"):
+        if issubclass(ty.referencedType, IntegerImmediate):
+            return _broadcastInteger(ty.referencedType)
+        if issubclass(ty.referencedType, FloatImmediate):
+            return _broadcastFloat(ty.referencedType)
+    elif issubclass(ty, IntegerImmediate):
+        return _broadcastInteger(ty)
+    elif issubclass(ty, FloatImmediate):
+        return _broadcastFloat(ty)
+
+    return None
+
+
 _ctxtExtension = '.pkl'
 _graphExtension = '.onnx'
 _dataExtension = '.data'
@@ -415,7 +446,12 @@ def __eq__(self, other):
     def _valueString(self) -> str:
         values = list(self.values.reshape(-1))
         if self._type.typeName == 'float32_t*':
-            strValues = [f'{value}f' for value in values]
+            strValues = []
+            for value in values:
+                literal = f"{float(value):.9g}"
+                if "e" not in literal and "." not in literal:
+                    literal += ".0"
+                strValues.append(literal + "f")
         elif self._type.typeName == 'int8_t*':
             strValues = [f'{int(value)}' for value in values]
         else:
@@ -977,8 +1013,6 @@ def hoistConstant(self,
             Returns the name of the newly registed ConstantBuffer
 
         """
-        assert len(constant.outputs) <= 1, f"Constant {constant.name} has more than one output"
-
         name = name if name is not None else constant.name
 
         # LMACAN: The shape needs to be copied into a tuple for pickling to work. Don't ask me why..
@@ -2027,25 +2061,7 @@ def parse(self, ctxt: NetworkContext, default_channels_first: bool) -> Tuple[Net
         return ctxt, False
 
     def _broadcastToNpType(self, ty: Type[BaseType]):
-
-        def _broadcastInteger(ty: Type[IntegerImmediate]):
-            if ty.signed:
-                return np.dtype(getattr(np, "int" + str(ty.typeWidth)))
-            else:
-                return np.dtype(getattr(np, "uint" + str(ty.typeWidth)))
-
-        def _broadcastFloat(ty: Type[FloatImmediate]):
-            return np.dtype(getattr(np, "double"))
-
-        if issubclass(ty, Pointer) and hasattr(ty, "referencedType"):
-            if issubclass(ty.referencedType, IntegerImmediate):
-                return _broadcastInteger(ty.referencedType)
-        elif issubclass(ty, IntegerImmediate):
-            return _broadcastInteger(ty)
-        elif issubclass(ty, FloatImmediate):
-            return _broadcastFloat(ty)
-
-        return None
+        return _deeployTypeToNpType(ty)
 
     def typeCheck(self, ctxt: NetworkContext) -> Tuple[NetworkContext, bool]:
         """Invokes the mapper's typeCheck method
@@ -2106,8 +2122,9 @@ def bind(self, ctxt: NetworkContext) -> Tuple[NetworkContext, bool]:
                 elif ctxt.is_global(node.name):
                     npType = self._broadcastToNpType(ctxt.globalObjects[node.name]._type)
                     if isinstance(ctxt.globalObjects[node.name], ConstantBuffer):
-                        if isinstance(node, gs.Constant):
+                        if isinstance(node, gs.Constant) and npType is not None:
                             node.values = node.values.astype(npType)
+                            node.export_dtype = npType
                     else:
                         node.shape = ctxt.globalObjects[node.name].shape
                         if npType is not None:
@@ -2856,7 +2873,17 @@ def generateInferenceInitializationCode(self) -> str:
 
             name = node.name
             node.name = self.ctxt._mangle(node.name)
-            callStack += node.init()
+
+            if ("TILING_CODEGEN" not in node.name and isinstance(node, VariableBuffer) and hasattr(node, "_type")
+                    and issubclass(node._type, Pointer)):
+                # Local inference buffers are late-bound by the generated layer code. Initializing them to NULL keeps
+                # clang from flagging false-positive uninitialized reads on paths where the assignment is emitted in a
+                # separate closure, and marking them unused avoids noise for scratch buffers that are reserved
+                # generically but optimized away for a specific layer instance.
+                typeName = node._instance.typeName if hasattr(node, "_instance") else node._type.typeName
+                callStack += f"{typeName} {node.name} __attribute__((unused)) = NULL;\n"
+            else:
+                callStack += node.init()
             node.name = name
 
         return callStack
@@ -3121,6 +3148,15 @@ def _exportGraph(self, folderPath, fileName):
         # VJUNG: ONNX-Graphsurgeon needs tensors to be in their export types
         constTensors = [tensor for tensor in self.graph.tensors().values() if isinstance(tensor, gs.Constant)]
         for tensor in constTensors:
+            if tensor.name in self.ctxt.globalObjects:
+                ctxtTensor = self.ctxt.globalObjects[tensor.name]
+                if isinstance(ctxtTensor, ConstantBuffer) and hasattr(ctxtTensor, "_type"):
+                    npType = _deeployTypeToNpType(ctxtTensor._type)
+                    if npType is not None:
+                        tensor.values = tensor.values.astype(npType)
+                        tensor.export_dtype = npType
+                        continue
+
             if tensor.dtype != tensor.export_dtype:
                 tensor.values = tensor.values.astype(tensor.export_dtype)
 
diff --git a/Deeploy/EngineExtension/OptimizationPasses/TopologyOptimizationPasses/EngineColoringPasses.py b/Deeploy/EngineExtension/OptimizationPasses/TopologyOptimizationPasses/EngineColoringPasses.py
index 82b7d1fde4..fb4a647634 100644
--- a/Deeploy/EngineExtension/OptimizationPasses/TopologyOptimizationPasses/EngineColoringPasses.py
+++ b/Deeploy/EngineExtension/OptimizationPasses/TopologyOptimizationPasses/EngineColoringPasses.py
@@ -36,6 +36,8 @@ def apply(self, graph: gs.Graph) -> Tuple[gs.Graph]:
             engine = self.engineMapper.mapNodeToEngine(node, graph)
             if engine is not None:
                 node.attrs["engine"] = engine.name
+                if hasattr(engine, "n_cores"):
+                    node.attrs["n_cores"] = engine.n_cores
         return graph
 
 
diff --git a/Deeploy/Targets/Generic/Bindings.py b/Deeploy/Targets/Generic/Bindings.py
index 308b179aef..807f3ba5cd 100644
--- a/Deeploy/Targets/Generic/Bindings.py
+++ b/Deeploy/Targets/Generic/Bindings.py
@@ -15,16 +15,16 @@
     ConvTransposeTemplate, DebugPrintTemplate, DequantTemplate, DummyTemplate, DWConvTemplate, FloatAddTemplate, \
     FloatConvTemplate, FloatDivTemplate, FloatDWConvTemplate, FloatGELUTemplate, FloatGemmTemplate, \
     FloatLayernormTemplate, FloatMatMulTemplate, FloatMaxPoolTemplate, FloatMulTemplate, FloatPadTemplate, \
-    FloatPowTemplate, FloatReduceMeanTemplate, FloatReluTemplate, FloatSoftmaxTemplate, FloatSqrtTemplate, \
-    GatherTemplate, GemmTemplate, IntegerDivTemplate, ITAMaxTemplate, ITAPartialMaxTemplate, MatMulTemplate, \
-    MaxPoolTemplate, MulTemplate, PadTemplate, QuantTemplate, ReduceMeanTemplate, ReduceSumTemplate, \
+    FloatPowTemplate, FloatReduceLogSumExpTemplate, FloatReduceMeanTemplate, FloatReluTemplate, FloatSoftmaxTemplate, \
+    FloatSqrtTemplate, GatherTemplate, GemmTemplate, IntegerDivTemplate, ITAMaxTemplate, ITAPartialMaxTemplate, \
+    MatMulTemplate, MaxPoolTemplate, MulTemplate, PadTemplate, QuantTemplate, ReduceMeanTemplate, ReduceSumTemplate, \
     RequantShiftTemplate, ReshapeTemplate, RQIntegerDivTemplate, RQSiGELUTemplate, SliceTemplate, TransposeTemplate, \
     iGELUTemplate, iLayernormTemplate, iRMSNormTemplate, iSoftmaxTemplate
 from Deeploy.Targets.Generic.TypeCheckers import AddChecker, BatchNormChecker, ConcatChecker, ConvChecker, \
     DebugPrintChecker, DequantChecker, DivChecker, DummyChecker, GatherChecker, GELUChecker, GEMMChecker, \
-    LayerNormChecker, MatMulChecker, MaxPoolChecker, MulChecker, PadChecker, QuantChecker, ReduceMeanChecker, \
-    ReduceSumChecker, ReluChecker, RequantShiftChecker, ReshapeChecker, RQIntegerDivChecker, SliceChecker, \
-    SoftmaxChecker, TransposeChecker
+    LayerNormChecker, MatMulChecker, MaxPoolChecker, MulChecker, PadChecker, QuantChecker, ReduceLogSumExpChecker, \
+    ReduceMeanChecker, ReduceSumChecker, ReluChecker, RequantShiftChecker, ReshapeChecker, RQIntegerDivChecker, \
+    SliceChecker, SoftmaxChecker, TransposeChecker
 
 BasicTransformer = CodeTransformation([ArgumentStructGeneration(), MemoryManagementGeneration(), FutureGeneration()])
 
@@ -227,6 +227,11 @@
                 BasicTransformer) for type in SignedIntegerDataTypes
 ]
 
+BasicReduceLogSumExpBindings = [
+    NodeBinding(ReduceLogSumExpChecker([PointerClass(float32_t)], [PointerClass(float32_t)]),
+                FloatReduceLogSumExpTemplate.referenceTemplate, BasicTransformer)
+]
+
 BasicReluBinding = NodeBinding(ReluChecker([PointerClass(float32_t)], [PointerClass(float32_t)]),
                                FloatReluTemplate.referenceTemplate, BasicTransformer)
 
@@ -286,6 +291,9 @@
 BasicConcatBindings = [
     NodeBinding(ConcatChecker([PointerClass(type), PointerClass(type)], [PointerClass(type)]),
                 ConcatTemplate.referenceTemplate, BasicTransformer) for type in IntegerDataTypes
+] + [
+    NodeBinding(ConcatChecker([PointerClass(float32_t), PointerClass(float32_t)], [PointerClass(float32_t)]),
+                ConcatTemplate.referenceTemplate, BasicTransformer)
 ]
 
 BasicQuantBindings = [
@@ -312,18 +320,34 @@
     for type in FloatDataTypes
 ]
 
-BasicConvTransposeBindings = [
+BasicConvTranspose1DBindings = [
+    NodeBinding(
+        ConvChecker(
+            [PointerClass(type), PointerClass(type), PointerClass(type)],  # input, weight, bias
+            [PointerClass(type)]),
+        ConvTransposeTemplate.reference1DTemplate,
+        BasicTransformer) for type in FloatDataTypes
+] + [
+    NodeBinding(
+        ConvChecker(
+            [PointerClass(type), PointerClass(type)],  # input, weight
+            [PointerClass(type)]),
+        ConvTransposeTemplate.reference1DTemplate,
+        BasicTransformer) for type in FloatDataTypes
+]
+
+BasicConvTranspose2DBindings = [
     NodeBinding(
         ConvChecker(
             [PointerClass(type), PointerClass(type), PointerClass(type)],  # input, weight, bias
             [PointerClass(type)]),
-        ConvTransposeTemplate.referenceTemplate,
+        ConvTransposeTemplate.reference2DTemplate,
         BasicTransformer) for type in FloatDataTypes
 ] + [
     NodeBinding(
         ConvChecker(
             [PointerClass(type), PointerClass(type)],  # input, weight
             [PointerClass(type)]),
-        ConvTransposeTemplate.referenceTemplate,
+        ConvTransposeTemplate.reference2DTemplate,
         BasicTransformer) for type in FloatDataTypes
 ]
diff --git a/Deeploy/Targets/Generic/Layers.py b/Deeploy/Targets/Generic/Layers.py
index cc733937cc..2d011e21f9 100644
--- a/Deeploy/Targets/Generic/Layers.py
+++ b/Deeploy/Targets/Generic/Layers.py
@@ -340,6 +340,12 @@ def computeShapes(self, inputShapes: Shape, outputShapes: Shape, operatorReprese
         if inputShapes[1] == () or inputShapes[1] == []:
             inputShapes[1] = (1,)
 
+        # Scalars and singletons should broadcast to the tensor operand,
+        # not shrink the tensor shape to (1,).
+        if tuple(inputShapes[1]) == (1,):
+            inputShapes[1] = inputShapes[0]
+            return (inputShapes, outputShapes)
+
         if len(inputShapes[0]) > len(inputShapes[1]):
             inputShapes[1] = inputShapes[0]
         else:
@@ -438,6 +444,27 @@ def computeShapes(self, inputShapes: Shape, outputShapes: Shape, operatorReprese
         return (inputShapes, outputShapes)
 
 
+class ReduceLogSumExpLayer(ONNXLayer):
+
+    def __init__(self, maps: List[NodeMapper]):
+        super().__init__(maps)
+
+    def computeShapes(self, inputShapes: Shape, outputShapes: Shape, operatorRepresentation,
+                      channels_first) -> Tuple[Shape, Shape]:
+        axis = operatorRepresentation['axes'][0]
+        inputShape = list(copy.deepcopy(inputShapes[0]))
+
+        if operatorRepresentation['keepdims']:
+            outputShape = inputShape
+            outputShape[axis] = 1
+        else:
+            outputShape = inputShape[:axis] + inputShape[axis + 1:]
+            if len(outputShape) == 0:
+                outputShape = [1]
+
+        return (inputShapes, [outputShape])
+
+
 class ReluLayer(ONNXLayer):
 
     def __init__(self, maps: List[NodeMapper]):
diff --git a/Deeploy/Targets/Generic/Parsers.py b/Deeploy/Targets/Generic/Parsers.py
index ad787d9e4b..3b69f9e526 100644
--- a/Deeploy/Targets/Generic/Parsers.py
+++ b/Deeploy/Targets/Generic/Parsers.py
@@ -6,6 +6,7 @@
 from typing import Tuple
 
 import numpy as np
+import onnx
 import onnx_graphsurgeon as gs
 
 from Deeploy.DeeployTypes import ConstantBuffer, NetworkContext, NodeParser, VariableBuffer
@@ -334,25 +335,99 @@ class PadParser(NodeParser):
     def __init__(self):
         super().__init__()
 
+    def _evaluate_constant_tensor(self, tensor: gs.Tensor):
+
+        if isinstance(tensor, gs.Constant):
+            return np.asarray(tensor.values)
+
+        if not hasattr(tensor, "inputs") or len(tensor.inputs) != 1:
+            return None
+
+        node = tensor.inputs[0]
+        input_values = []
+        for input_tensor in node.inputs:
+            value = self._evaluate_constant_tensor(input_tensor)
+            if value is None:
+                return None
+            input_values.append(value)
+
+        if node.op == "Constant":
+            value = node.attrs.get("value")
+            if value is None:
+                return None
+            return np.asarray(value.values)
+
+        if node.op == "Cast":
+            cast_dtype = onnx.helper.tensor_dtype_to_np_dtype(node.attrs["to"])
+            return input_values[0].astype(cast_dtype)
+
+        if node.op == "Reshape":
+            return np.reshape(input_values[0], input_values[1].astype(np.int64).tolist())
+
+        if node.op == "Concat":
+            return np.concatenate(input_values, axis = node.attrs["axis"])
+
+        if node.op == "Transpose":
+            return np.transpose(input_values[0], axes = node.attrs["perm"])
+
+        if node.op == "ConstantOfShape":
+            fill_value = node.attrs.get("value")
+            if fill_value is None:
+                scalar = np.array(0, dtype = np.float32)
+            else:
+                scalar = np.asarray(fill_value.values).reshape(-1)[0]
+            return np.full(input_values[0].astype(np.int64).tolist(), scalar, dtype = np.asarray(scalar).dtype)
+
+        if node.op == "Slice":
+            data = input_values[0]
+            starts = input_values[1].astype(np.int64).reshape(-1)
+            ends = input_values[2].astype(np.int64).reshape(-1)
+            axes = input_values[3].astype(np.int64).reshape(-1) if len(input_values) >= 4 else np.arange(len(starts))
+            steps = input_values[4].astype(np.int64).reshape(-1) if len(input_values) >= 5 else np.ones(
+                len(starts), dtype = np.int64)
+
+            slices = [slice(None)] * data.ndim
+            for start, end, axis, step in zip(starts, ends, axes, steps):
+                slices[int(axis)] = slice(int(start), int(end), int(step))
+
+            return data[tuple(slices)]
+
+        return None
+
     def parseNode(self, node: gs.Node) -> bool:
 
-        ret = all([
-            'mode' in node.attrs, 'pads' in node.attrs, 'value' in node.attrs,
-            len(node.inputs) == 1,
-            len(node.outputs) == 1
-        ])
+        ret = all(['mode' in node.attrs, len(node.outputs) == 1])
 
         if ret:
             self.operatorRepresentation['mode'] = node.attrs['mode']
+            self.operatorRepresentation['value'] = 0
 
-            try:
-                self.operatorRepresentation['pads'] = [int(p) for p in node.attrs['pads']]
-            except Exception as e:
-                self.operatorRepresentation['pads'] = node.attrs['pads']
+            if 'pads' in node.attrs and len(node.inputs) == 1:
+                try:
+                    self.operatorRepresentation['pads'] = [int(p) for p in node.attrs['pads']]
+                except Exception:
+                    self.operatorRepresentation['pads'] = node.attrs['pads']
 
-            self.operatorRepresentation['value'] = node.attrs['value']
+                if 'value' in node.attrs:
+                    self.operatorRepresentation['value'] = node.attrs['value']
+                return True
 
-        return ret
+            if len(node.inputs) in (2, 3):
+                pads = self._evaluate_constant_tensor(node.inputs[1])
+                if pads is None:
+                    return False
+
+                self.operatorRepresentation['pads'] = [int(p) for p in np.asarray(pads).reshape(-1)]
+
+                if len(node.inputs) == 3:
+                    value = self._evaluate_constant_tensor(node.inputs[2])
+                    if value is None:
+                        return False
+                    self.operatorRepresentation['value'] = np.asarray(value).reshape(-1)[0].item()
+
+                return True
+
+        return False
 
     def parseNodeCtxt(self,
                       ctxt: NetworkContext,
@@ -366,6 +441,16 @@ def parseNodeCtxt(self,
         self.operatorRepresentation['data_in_size'] = np.prod(data_in.shape)
         self.operatorRepresentation['data_out_size'] = np.prod(data_out.shape)
 
+        # Keep optional constant Pad inputs visible to the tiler.
+        # The template uses the decoded scalar/list values, but the tiled
+        # scheduler still tracks the original constant tensors as node inputs.
+        if len(node.inputs) >= 2:
+            pads_tensor = ctxt.lookup(node.inputs[1].name)
+            self.operatorRepresentation['pads_tensor'] = pads_tensor.name
+        if len(node.inputs) >= 3:
+            value_tensor = ctxt.lookup(node.inputs[2].name)
+            self.operatorRepresentation['value_tensor'] = value_tensor.name
+
         return ctxt, True
 
 
@@ -623,6 +708,118 @@ def parseNodeCtxt(self,
         return newCtxt, ret
 
 
+class ReduceLogSumExpParser(NodeParser):
+
+    def __init__(self):
+        super().__init__()
+
+    def parseNode(self, node: gs.Node) -> bool:
+        if len(node.inputs) < 1 or len(node.inputs) > 2 or len(node.outputs) != 1:
+            return False
+
+        if 'keepdims' not in node.attrs:
+            return False
+
+        self.operatorRepresentation['keepdims'] = int(node.attrs['keepdims'])
+        if len(node.inputs[0].shape) == 0:
+            return False
+
+        if len(node.inputs) == 2:
+            if not isinstance(node.inputs[1], gs.Constant):
+                return False
+            axes = np.array(node.inputs[1].values, dtype = np.int64)
+        else:
+            if 'axes' not in node.attrs:
+                return False
+            axes = node.attrs['axes']
+            if isinstance(axes, int):
+                axes = [axes]
+            axes = np.array(axes, dtype = np.int64)
+
+        normalized_axes = []
+        rank = len(node.inputs[0].shape)
+        for axis in axes:
+            normalized_axis = int(axis)
+            if normalized_axis < 0:
+                normalized_axis += rank
+            normalized_axes.append(normalized_axis)
+
+        if len(normalized_axes) != 1:
+            return False
+
+        axis = normalized_axes[0]
+        if axis < 0 or axis >= rank:
+            return False
+
+        self.operatorRepresentation['axes'] = np.array([axis], dtype = np.int64)
+
+        if self.operatorRepresentation['keepdims']:
+            output_shape = list(node.inputs[0].shape)
+            output_shape[axis] = 1
+        else:
+            output_shape = list(node.inputs[0].shape[:axis]) + list(node.inputs[0].shape[axis + 1:])
+            if len(output_shape) == 0:
+                output_shape = [1]
+
+        node.outputs[0].shape = output_shape
+        return True
+
+    def parseNodeCtxt(self,
+                      ctxt: NetworkContext,
+                      node: gs.Node,
+                      channels_first: bool = True) -> Tuple[NetworkContext, bool]:
+
+        data_in = ctxt.lookup(node.inputs[0].name)
+        data_out = ctxt.lookup(node.outputs[0].name)
+
+        if len(node.inputs) == 2:
+            axes_buffer = ctxt.lookup(node.inputs[1].name)
+            axes_buffer._live = False
+            axes_buffer._deploy = False
+            axes = np.array(axes_buffer.values, dtype = np.int64)
+        else:
+            axes = np.array(self.operatorRepresentation['axes'], dtype = np.int64)
+
+        normalized_axes = []
+        for axis in axes:
+            normalized_axis = int(axis)
+            if normalized_axis < 0:
+                normalized_axis += len(data_in.shape)
+            normalized_axes.append(normalized_axis)
+
+        if len(normalized_axes) != 1:
+            return ctxt, False
+
+        axis = normalized_axes[0]
+        if axis < 0 or axis >= len(data_in.shape):
+            return ctxt, False
+
+        outer_size = int(np.prod(data_in.shape[:axis])) if axis > 0 else 1
+        inner_size = int(np.prod(data_in.shape[axis + 1:])) if axis + 1 < len(data_in.shape) else 1
+        if self.operatorRepresentation['keepdims']:
+            output_shape = list(data_in.shape)
+            output_shape[axis] = 1
+        else:
+            output_shape = list(data_in.shape[:axis]) + list(data_in.shape[axis + 1:])
+            if len(output_shape) == 0:
+                output_shape = [1]
+
+        data_out.shape = output_shape
+        node.outputs[0].shape = output_shape
+
+        self.operatorRepresentation['data_in'] = data_in.name
+        self.operatorRepresentation['data_out'] = data_out.name
+        self.operatorRepresentation['data_in_shape'] = data_in.shape
+        self.operatorRepresentation['data_out_shape'] = output_shape
+        self.operatorRepresentation['size'] = int(np.prod(data_in.shape))
+        self.operatorRepresentation['axes'] = np.array([axis], dtype = np.int64)
+        self.operatorRepresentation['axisLength'] = data_in.shape[axis]
+        self.operatorRepresentation['outerSize'] = outer_size
+        self.operatorRepresentation['innerSize'] = inner_size
+
+        return ctxt, True
+
+
 class SoftmaxParser(NodeParser):
 
     def __init__(self):
@@ -2703,14 +2900,11 @@ def __init__(self):
         super().__init__()
 
     def parseNode(self, node: gs.Node) -> bool:
-        # Verify the attributes (epsilon is mandatory, momentum and training_mode  are optional)
-        if 'epsilon' not in node.attrs:
-            return False
         # Common Inputs: 5 (X, scale, B, mean, var)
         if len(node.inputs) < 5:
             return False
 
-        # Save the attributes, default values are provided if not present
+        # Save attributes (ONNX defaults when attributes are omitted)
         self.operatorRepresentation['epsilon'] = node.attrs.get('epsilon', 1e-5)
         self.operatorRepresentation['momentum'] = node.attrs.get('momentum', 0.9)
         self.operatorRepresentation['training_mode'] = node.attrs.get('training_mode', 0)
@@ -2728,10 +2922,27 @@ def parseNodeCtxt(self, ctxt, node: gs.Node, channels_first: bool = True):
         self.operatorRepresentation[outputs[0]] = ctxt.lookup(node.outputs[0].name).name
 
         input_shape = ctxt.lookup(node.inputs[0].name).shape
-        # Save input shape information
+        param_shape = ctxt.lookup(node.inputs[1].name).shape
+
+        if len(input_shape) >= 2 and len(param_shape) >= 1:
+            channel_count = param_shape[0]
+            channels_first_match = input_shape[1] == channel_count
+            channels_last_match = input_shape[-1] == channel_count
+
+            # Prefer explicit evidence from the BN parameter length when the
+            # default layout would otherwise be ambiguous for mixed-layout graphs.
+            if channels_first_match != channels_last_match:
+                channels_first = channels_first_match
+
+        # BatchNorm runs on flattened [N, C, L] for channels-first and [N, L, C]
+        # for channels-last, where L covers all spatial positions.
         self.operatorRepresentation['batch_size'] = input_shape[0]
-        self.operatorRepresentation['channel_size'] = input_shape[1]
-        self.operatorRepresentation['window_size'] = input_shape[2]
+        self.operatorRepresentation['channel_size'] = input_shape[1] if channels_first else input_shape[-1]
+        if channels_first:
+            self.operatorRepresentation['window_size'] = int(np.prod(input_shape[2:]))
+        else:
+            self.operatorRepresentation['window_size'] = int(np.prod(input_shape[1:-1]))
+        self.operatorRepresentation['channels_first'] = int(channels_first)
 
         return ctxt, True
 
@@ -2783,8 +2994,8 @@ def parseNodeCtxt(self, ctxt: NetworkContext, node: gs.Node, channels_first: boo
 
         stride_x, stride_y = 1, 1
         if "strides" in node.attrs:
-            stride_y = node.attrs["strides"][0]
-            stride_x = node.attrs["strides"][1] if len(node.attrs["strides"]) > 1 else stride_y
+            stride_x = node.attrs["strides"][0]
+            stride_y = node.attrs["strides"][1] if len(node.attrs["strides"]) > 1 else stride_x
         self.operatorRepresentation["stride_y"] = stride_y
         self.operatorRepresentation["stride_x"] = stride_x
 
@@ -2865,6 +3076,80 @@ def parseNodeCtxt(self,
         return ctxt, False
 
 
+class ConvTranspose2DParser(ConvTransposeParser):
+
+    def __init__(self):
+        super().__init__()
+
+    def parseNode(self, node: gs.Node) -> bool:
+        # 2D ConvTranspose expects 4D input/output and 4D weight
+        wellFormed = super().parseNode(node)
+        ret = False
+        if wellFormed:
+            ret = all([
+                # Make sure strides are 2D
+                len(node.attrs['strides']) == 2,
+                len(node.attrs['pads']) == 4,
+                len(node.attrs['dilations']) == 2,
+            ])
+        if ret:
+
+            self.operatorRepresentation['kernel_shape'] = node.attrs['kernel_shape']
+            self.operatorRepresentation['dim_kernel_x'] = int(self.operatorRepresentation['kernel_shape'][0])
+            self.operatorRepresentation['dim_kernel_y'] = int(self.operatorRepresentation['kernel_shape'][1])
+            self.operatorRepresentation['dilation_x'] = int(self.operatorRepresentation['dilations'][0])
+            self.operatorRepresentation['dilation_y'] = int(self.operatorRepresentation['dilations'][1])
+            self.operatorRepresentation['padding_x'] = int(self.operatorRepresentation['pads'][0])
+            self.operatorRepresentation['padding_y'] = int(self.operatorRepresentation['pads'][1])
+            self.operatorRepresentation['stride_x'] = int(self.operatorRepresentation['strides'][0])
+            self.operatorRepresentation['stride_y'] = int(self.operatorRepresentation['strides'][1])
+
+        return ret
+
+    def parseNodeCtxt(self,
+                      ctxt: NetworkContext,
+                      node: gs.Node,
+                      channels_first: bool = True) -> Tuple[NetworkContext, bool]:
+
+        newCtxt, ret = super().parseNodeCtxt(ctxt, node, channels_first)
+
+        if ret:
+            data_in = newCtxt.lookup(node.inputs[0].name)
+            data_out = newCtxt.lookup(node.outputs[0].name)
+            in_shape = data_in.shape
+            out_shape = data_out.shape
+            weight = newCtxt.lookup(node.inputs[1].name)
+
+            if len(in_shape) != 4 or len(out_shape) != 4 or len(weight.shape) != 4:
+                return ctxt, False
+
+            self.operatorRepresentation['batch'] = in_shape[0]
+
+            if channels_first:
+                self.operatorRepresentation['ch_im_in'] = in_shape[1]
+                self.operatorRepresentation['dim_im_in_x'] = in_shape[2]
+                self.operatorRepresentation['dim_im_in_y'] = in_shape[3]
+                self.operatorRepresentation['ch_im_out'] = out_shape[1]
+                self.operatorRepresentation['dim_im_out_x'] = out_shape[2]
+                self.operatorRepresentation['dim_im_out_y'] = out_shape[3]
+            else:
+                self.operatorRepresentation['ch_im_in'] = in_shape[3]
+                self.operatorRepresentation['dim_im_in_x'] = in_shape[1]
+                self.operatorRepresentation['dim_im_in_y'] = in_shape[2]
+                self.operatorRepresentation['ch_im_out'] = out_shape[3]
+                self.operatorRepresentation['dim_im_out_x'] = out_shape[1]
+                self.operatorRepresentation['dim_im_out_y'] = out_shape[2]
+
+            self.operatorRepresentation["batchOffsetIn"] = (self.operatorRepresentation["ch_im_in"] *
+                                                            self.operatorRepresentation["dim_im_in_x"] *
+                                                            self.operatorRepresentation["dim_im_in_y"])
+            self.operatorRepresentation["batchOffsetOut"] = (self.operatorRepresentation["ch_im_out"] *
+                                                             self.operatorRepresentation["dim_im_out_x"] *
+                                                             self.operatorRepresentation["dim_im_out_y"])
+            return newCtxt, True
+        return ctxt, False
+
+
 class SqrtParser(NodeParser):
 
     def __init__(self):
diff --git a/Deeploy/Targets/Generic/Platform.py b/Deeploy/Targets/Generic/Platform.py
index e05e897270..c0066bd78c 100644
--- a/Deeploy/Targets/Generic/Platform.py
+++ b/Deeploy/Targets/Generic/Platform.py
@@ -7,26 +7,27 @@
 from Deeploy.DeeployTypes import ConstantBuffer, DeploymentEngine, DeploymentPlatform, NodeMapper, NodeTemplate, \
     StructBuffer, TopologyOptimizer, TransientBuffer, VariableBuffer
 from Deeploy.Targets.Generic.Bindings import BasicAddBindings, BasicBatchNormBindings, BasicConcatBindings, \
-    BasicConv1DBindings, BasicConv2DBindings, BasicConvTransposeBindings, BasicDebugPrintBindings, \
-    BasicDequantBindings, BasicDivBindings, BasicDWConv1DBinding, BasicDWConv2DBindings, BasicGatherBindings, \
-    BasicGELUBindings, BasicGEMMBindings, BasicITAPartialSoftmaxBinding, BasicITASoftmaxBinding, \
+    BasicConv1DBindings, BasicConv2DBindings, BasicConvTranspose1DBindings, BasicConvTranspose2DBindings, \
+    BasicDebugPrintBindings, BasicDequantBindings, BasicDivBindings, BasicDWConv1DBinding, BasicDWConv2DBindings, \
+    BasicGatherBindings, BasicGELUBindings, BasicGEMMBindings, BasicITAPartialSoftmaxBinding, BasicITASoftmaxBinding, \
     BasicLayerNormBindings, BasicMatMulBindings, BasicMaxPool1DBindings, BasicMaxPool2DBindings, BasicMulBindings, \
-    BasicPad1DBindings, BasicPad2DBindings, BasicPowBindings, BasicQuantBindings, BasicReduceMeanBindings, \
-    BasicReduceSumBindings, BasicReluBinding, BasicReshapeBindings, BasicRQIntegerDivBinding, BasicRQSBindings, \
-    BasicRQSGELUBinding, BasicSliceBindings, BasicSoftmaxBindings, BasicSqrtBindings, BasicTransposeBindings, \
-    DummyBinding
+    BasicPad1DBindings, BasicPad2DBindings, BasicPowBindings, BasicQuantBindings, BasicReduceLogSumExpBindings, \
+    BasicReduceMeanBindings, BasicReduceSumBindings, BasicReluBinding, BasicReshapeBindings, BasicRQIntegerDivBinding, \
+    BasicRQSBindings, BasicRQSGELUBinding, BasicSliceBindings, BasicSoftmaxBindings, BasicSqrtBindings, \
+    BasicTransposeBindings, DummyBinding
 from Deeploy.Targets.Generic.Layers import AddLayer, BatchNormalizationLayer, ConcatLayer, ConvLayer, \
     ConvTransposeLayer, DebugPrintLayer, DequantLayer, DivLayer, GatherLayer, GELULayer, GEMMLayer, ITAMaxLayer, \
-    LayerNormLayer, MatMulLayer, MaxPoolLayer, MulLayer, PadLayer, PowLayer, QuantLayer, ReduceMeanLayer, \
-    ReduceSumLayer, ReluLayer, RequantShiftLayer, ReshapeLayer, RQIntegerDivLayer, RQSiGELULayer, SliceLayer, \
-    SoftmaxLayer, SqrtLayer, TransposeLayer
+    LayerNormLayer, MatMulLayer, MaxPoolLayer, MulLayer, PadLayer, PowLayer, QuantLayer, ReduceLogSumExpLayer, \
+    ReduceMeanLayer, ReduceSumLayer, ReluLayer, RequantShiftLayer, ReshapeLayer, RQIntegerDivLayer, RQSiGELULayer, \
+    SliceLayer, SoftmaxLayer, SqrtLayer, TransposeLayer
 from Deeploy.Targets.Generic.Parsers import AddParser, BatchNormParser, ConcatParser, ConvTranspose1DParser, \
-    DebugParser, DequantParser, DivParser, DummyParser, FlattenParser, GatherParser, GELUParser, GenericConv1DParser, \
-    GenericConv2DParser, GenericDWConv1DParser, GenericDWConv2DParser, GenericGEMMParser, GenericMaxPool2DParser, \
-    IntegerDivParser, ITAMaxParser, ITAPartialMaxParser, LayerNormParser, MatMulParser, MaxPool1DParser, MulParser, \
-    Pad1DParser, Pad2DParser, PowParser, QuantParser, ReduceMeanParser, ReduceSumParser, ReluParser, \
-    RequantShiftParser, ReshapeParser, RQIntegerDivParser, RQSiGELUParser, SliceParser, SoftmaxParser, SqrtParser, \
-    TransposeParser, UnsqueezeParser, iLayerNormParser, iSoftmaxParser
+    ConvTranspose2DParser, DebugParser, DequantParser, DivParser, DummyParser, FlattenParser, GatherParser, \
+    GELUParser, GenericConv1DParser, GenericConv2DParser, GenericDWConv1DParser, GenericDWConv2DParser, \
+    GenericGEMMParser, GenericMaxPool2DParser, IntegerDivParser, ITAMaxParser, ITAPartialMaxParser, LayerNormParser, \
+    MatMulParser, MaxPool1DParser, MulParser, Pad1DParser, Pad2DParser, PowParser, QuantParser, ReduceLogSumExpParser, \
+    ReduceMeanParser, ReduceSumParser, ReluParser, RequantShiftParser, ReshapeParser, RQIntegerDivParser, \
+    RQSiGELUParser, SliceParser, SoftmaxParser, SqrtParser, TransposeParser, UnsqueezeParser, iLayerNormParser, \
+    iSoftmaxParser
 from Deeploy.Targets.Generic.Templates import AllocateTemplate, FreeTemplate
 from Deeploy.Targets.Generic.TopologyOptimizationPasses.Passes import DequantPatternPass, ExtractPaddingFromConvPass, \
     ExtractPaddingFromPoolPass, MatMulAddMergePass, MergeConstAddAndRequantPass, QuantPatternPass, \
@@ -58,6 +59,7 @@
 Pad1DMapper = NodeMapper(Pad1DParser(), BasicPad1DBindings)
 Pad2DMapper = NodeMapper(Pad2DParser(), BasicPad2DBindings)
 ReduceMeanMapper = NodeMapper(ReduceMeanParser(), BasicReduceMeanBindings)
+ReduceLogSumExpMapper = NodeMapper(ReduceLogSumExpParser(), BasicReduceLogSumExpBindings)
 ReduceSumMapper = NodeMapper(ReduceSumParser(), BasicReduceSumBindings)
 ReluMapper = NodeMapper(ReluParser(), [BasicReluBinding])
 RequantShiftMapper = NodeMapper(RequantShiftParser(), BasicRQSBindings)
@@ -71,7 +73,8 @@
 QuantMapper = NodeMapper(QuantParser(), BasicQuantBindings)
 DequantMapper = NodeMapper(DequantParser(), BasicDequantBindings)
 BatchNormalizationMapper = NodeMapper(BatchNormParser(), BasicBatchNormBindings)
-ConvTransposeMapper = NodeMapper(ConvTranspose1DParser(), BasicConvTransposeBindings)
+ConvTranspose1DMapper = NodeMapper(ConvTranspose1DParser(), BasicConvTranspose1DBindings)
+ConvTranspose2DMapper = NodeMapper(ConvTranspose2DParser(), BasicConvTranspose2DBindings)
 SliceMapper = NodeMapper(SliceParser(), BasicSliceBindings)
 
 # Dummy nodes are intended for development purposes only!
@@ -105,6 +108,7 @@
     'Sqrt': SqrtLayer([SqrtMapper]),
     'Pad': PadLayer([Pad1DMapper, Pad2DMapper]),
     'ReduceMean': ReduceMeanLayer([ReduceMeanMapper]),
+    'ReduceLogSumExp': ReduceLogSumExpLayer([ReduceLogSumExpMapper]),
     'ReduceSum': ReduceSumLayer([ReduceSumMapper]),
     'Relu': ReluLayer([ReluMapper]),
     'RequantizediGELU': RQSiGELULayer([RQGELUMapper]),
@@ -118,7 +122,7 @@
     'Quant': QuantLayer([QuantMapper]),
     'Dequant': DequantLayer([DequantMapper]),
     'BatchNormalization': BatchNormalizationLayer([BatchNormalizationMapper]),
-    'ConvTranspose': ConvTransposeLayer([ConvTransposeMapper])
+    'ConvTranspose': ConvTransposeLayer([ConvTranspose1DMapper, ConvTranspose2DMapper])
     # # For example, you can use the DummpyMapper, in case you want to test
     # # deployment or optimizations with GlobalAveragePool nodes but did not yet
     # # implement the corresponding kernel
diff --git a/Deeploy/Targets/Generic/Templates/BatchNormalizationTemplate.py b/Deeploy/Targets/Generic/Templates/BatchNormalizationTemplate.py
index 5377c91ca0..0233a36c96 100644
--- a/Deeploy/Targets/Generic/Templates/BatchNormalizationTemplate.py
+++ b/Deeploy/Targets/Generic/Templates/BatchNormalizationTemplate.py
@@ -9,7 +9,7 @@
 BEGIN_SINGLE_CORE
     BatchNorm_fp32(
         ${data_in}, ${scale}, ${bias}, ${mean}, ${variance},
-        ${data_out}, ${batch_size}, ${channel_size}, ${window_size}
+        ${data_out}, ${batch_size}, ${channel_size}, ${window_size}, ${epsilon}, ${channels_first}
     );
 END_SINGLE_CORE
 """)
diff --git a/Deeploy/Targets/Generic/Templates/ConvTransposeTemplate.py b/Deeploy/Targets/Generic/Templates/ConvTransposeTemplate.py
index 9bf864c91f..73bb130f78 100644
--- a/Deeploy/Targets/Generic/Templates/ConvTransposeTemplate.py
+++ b/Deeploy/Targets/Generic/Templates/ConvTransposeTemplate.py
@@ -4,8 +4,9 @@
 
 from Deeploy.DeeployTypes import NodeTemplate
 
-referenceTemplate = NodeTemplate("""
+reference1DTemplate = NodeTemplate("""
 <%
+bias_ptr = bias if 'bias' in locals() else "NULL"
 batchOffsetIn = ch_im_in * dim_im_in_y
 batchOffsetOut = ch_im_out * dim_im_out_y
 %>
@@ -20,7 +21,7 @@
             ref_${data_out}_${data_in}, ${ch_im_in}, ${dim_im_in_y},
             ${weight}, ${ch_im_out}, ${dim_kernel_y},
             ${stride_y},
-            ${bias}, ${has_bias},
+            ${bias_ptr}, ${has_bias},
             ref_${data_out}_${data_out}, ${dim_im_out_y}
         );
 
@@ -29,3 +30,30 @@
     }
 END_SINGLE_CORE
 """)
+
+reference2DTemplate = NodeTemplate("""
+<%
+bias_ptr = bias if 'bias' in locals() else "NULL"
+batchOffsetIn = ch_im_in * dim_im_in_x * dim_im_in_y
+batchOffsetOut = ch_im_out * dim_im_out_x * dim_im_out_y
+%>
+
+// 2D Transposed Conv (Name: ${nodeName}, Op: ${nodeOp})
+BEGIN_SINGLE_CORE
+    ${data_in_type.typeName} ref_${data_out}_${data_in} = ${data_in};
+    ${data_out_type.typeName} ref_${data_out}_${data_out} = ${data_out};
+
+    for (uint32_t n=0; n<${batch}; ++n) {
+        ConvTranspose2d_fp32(
+            ref_${data_out}_${data_in}, ${ch_im_in}, ${dim_im_in_x}, ${dim_im_in_y},
+            ${weight}, ${ch_im_out}, ${dim_kernel_x}, ${dim_kernel_y},
+            ${stride_x}, ${stride_y},
+            ${bias_ptr}, ${has_bias},
+            ref_${data_out}_${data_out}, ${dim_im_out_x}, ${dim_im_out_y}
+        );
+
+        ref_${data_out}_${data_in} += ${batchOffsetIn};
+        ref_${data_out}_${data_out} += ${batchOffsetOut};
+    }
+END_SINGLE_CORE
+""")
diff --git a/Deeploy/Targets/Generic/Templates/FloatMulTemplate.py b/Deeploy/Targets/Generic/Templates/FloatMulTemplate.py
index 3c8c2da501..0df2e901a0 100644
--- a/Deeploy/Targets/Generic/Templates/FloatMulTemplate.py
+++ b/Deeploy/Targets/Generic/Templates/FloatMulTemplate.py
@@ -8,7 +8,7 @@
 // Float Mul (Name: ${nodeName}, Op: ${nodeOp})
 BEGIN_SINGLE_CORE
     for (uint32_t i=0;i<${size};i++){
-        ${C}[i] = ${A}[i] * ${B}[0];
+        ${C}[i] = ${A}[i] * ${B}[${0 if sizeB == 1 else 'i'}];
     }
 END_SINGLE_CORE
 """)
diff --git a/Deeploy/Targets/Generic/Templates/FloatReduceLogSumExpTemplate.py b/Deeploy/Targets/Generic/Templates/FloatReduceLogSumExpTemplate.py
new file mode 100644
index 0000000000..98d04cfaf6
--- /dev/null
+++ b/Deeploy/Targets/Generic/Templates/FloatReduceLogSumExpTemplate.py
@@ -0,0 +1,10 @@
+# SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from Deeploy.DeeployTypes import NodeTemplate
+
+referenceTemplate = NodeTemplate("""
+// ReduceLogSumExp (Name: ${nodeName}, Op: ${nodeOp})
+SINGLE_CORE ReduceLogSumExp_fp32_fp32(${data_in}, ${data_out}, ${outerSize}, ${axisLength}, ${innerSize});
+""")
diff --git a/Deeploy/Targets/Generic/TileConstraints/UntiledTileConstraint.py b/Deeploy/Targets/Generic/TileConstraints/UntiledTileConstraint.py
index 091cb55a41..0e24d51635 100644
--- a/Deeploy/Targets/Generic/TileConstraints/UntiledTileConstraint.py
+++ b/Deeploy/Targets/Generic/TileConstraints/UntiledTileConstraint.py
@@ -10,11 +10,23 @@
 from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint
 from Deeploy.TilingExtension.TileConstraint import TileConstraint
 from Deeploy.TilingExtension.TilerModel import TilerModel
-from Deeploy.TilingExtension.TilingCodegen import AbsoluteHyperRectangle, TilingSchedule, VariableReplacementScheme
+from Deeploy.TilingExtension.TilingCodegen import AbsoluteHyperRectangle, HyperRectangle, TilingSchedule, \
+    VariableReplacementScheme
 
 
 class UntiledTileConstraint(TileConstraint):
 
+    @staticmethod
+    def _normalizedShape(shape) -> Tuple[int, ...]:
+        if isinstance(shape, int):
+            return (shape,)
+
+        normalized = tuple(shape)
+        if len(normalized) == 0:
+            return (1,)
+
+        return normalized
+
     @staticmethod
     def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel:
 
@@ -35,7 +47,8 @@ def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: Netw
 
             tilerModel.addTensorDimToModel(ctxt, tensorName)
 
-            for idx, shapeDim in enumerate(_buffer.shape):
+            shape = [_buffer.shape] if isinstance(_buffer.shape, int) else _buffer.shape
+            for idx, shapeDim in enumerate(shape):
                 tilerModel.addConstraint(tilerModel.getTensorDimVar(tensorName = tensorName, dimIdx = idx) == shapeDim)
 
         return tilerModel
@@ -53,22 +66,55 @@ def serializeTilingSolution(
             cls, tilingSolution: NodeMemoryConstraint, absoluteOutputCubes: List[AbsoluteHyperRectangle],
             targetMemLevel: str, ctxt: NetworkContext,
             operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, TilingSchedule]:
-        outputCubes = [cube.rectangle for cube in absoluteOutputCubes]
-
-        schedule = TilingSchedule({}, {}, [], [])
         repScheme = VariableReplacementScheme({}, {})
+        inputLoadSchedule: List[Dict[str, HyperRectangle]] = [{}]
+        outputLoadSchedule: List[Dict[str, HyperRectangle]] = [{}]
+        inputBaseOffsets: Dict[str, List[int]] = {}
+        outputBaseOffsets: Dict[str, List[int]] = {}
 
-        for key, value in tilingSolution.tensorMemoryConstraints.items():
+        addrNames: List[str] = []
+        for key, value in operatorRepresentation.items():
+            if not isinstance(value, str):
+                continue
+
+            if value not in tilingSolution.tensorMemoryConstraints:
+                continue
 
-            assert len(value.memoryConstraints.keys()) == 1, f"{cls} should be untiled, but {value} is tiled!"
+            _buffer = ctxt.lookup(value)
+            if isinstance(_buffer, TransientBuffer):
+                continue
 
-            memKey = list(value.memoryConstraints.keys())[0]
-            memValue = value.memoryConstraints[memKey]
+            addrNames.append(key)
 
+        for key, value in tilingSolution.tensorMemoryConstraints.items():
             _buffer = ctxt.lookup(key)
             if isinstance(_buffer, TransientBuffer):
                 continue
 
-            assert memValue.shape == tuple(_buffer.shape)
+            fullShape = cls._normalizedShape(_buffer.shape)
+            memoryConstraints = list(value.memoryConstraints.values())
+
+            # Untiled tensors may still be materialized on multiple memory levels when the same
+            # full-shape buffer is replicated along the memory path (for example, an untiled Slice
+            # fed by an L3-backed producer). This is not true tiling and should remain acceptable
+            # here. We only reject the case where different memory levels carry different shapes.
+            assert all(cls._normalizedShape(memValue.shape) == fullShape for memValue in memoryConstraints), \
+                f"{cls} should be untiled, but {value} carries multiple shapes across memory levels!"
+
+        if len(addrNames) > 0:
+            inputBaseOffsets, outputBaseOffsets = cls.extractBaseAddr(tilingSolution, targetMemLevel,
+                                                                      operatorRepresentation, addrNames)
+
+        for addrName in inputBaseOffsets:
+            buffer = ctxt.lookup(operatorRepresentation[addrName])
+            shape = cls._normalizedShape(buffer.shape)
+            inputLoadSchedule[0][addrName] = HyperRectangle((0,) * len(shape), shape)
+
+        for addrName in outputBaseOffsets:
+            buffer = ctxt.lookup(operatorRepresentation[addrName])
+            shape = cls._normalizedShape(buffer.shape)
+            outputLoadSchedule[0][addrName] = HyperRectangle((0,) * len(shape), shape)
+
+        schedule = TilingSchedule(inputBaseOffsets, outputBaseOffsets, inputLoadSchedule, outputLoadSchedule)
 
         return repScheme, schedule
diff --git a/Deeploy/Targets/Generic/TopologyOptimizationPasses/Passes.py b/Deeploy/Targets/Generic/TopologyOptimizationPasses/Passes.py
index 146bcf699e..4841187abe 100644
--- a/Deeploy/Targets/Generic/TopologyOptimizationPasses/Passes.py
+++ b/Deeploy/Targets/Generic/TopologyOptimizationPasses/Passes.py
@@ -478,7 +478,12 @@ def _merge_matmul_add_fun(graph: gs.Graph, match: Match, name: str):
     matched_nodes = [m for k, m in match.nodes_map.items()]
     gemm = matched_nodes[0]
     add = matched_nodes[1]
-    _bias = add.inputs[0] if isinstance(add.inputs[0], gs.Constant) else add.inputs[1]
+
+    constant_inputs = [inp for inp in add.inputs if isinstance(inp, gs.Constant)]
+    if len(constant_inputs) != 1:
+        return graph
+
+    _bias = constant_inputs[0]
     _inputs = gemm.inputs + [_bias]
     _outputs = add.outputs
 
diff --git a/Deeploy/Targets/Generic/TypeCheckers.py b/Deeploy/Targets/Generic/TypeCheckers.py
index c2c8d436f8..6834ed8975 100644
--- a/Deeploy/Targets/Generic/TypeCheckers.py
+++ b/Deeploy/Targets/Generic/TypeCheckers.py
@@ -341,6 +341,20 @@ def _inferSignedness(self, inputs: List[VariableBuffer],
             return [False]
 
 
+class ReduceLogSumExpChecker(SignPropTypeChecker):
+
+    def __init__(self, input_types: Sequence[Type[Pointer]], output_types: Sequence[Type[Pointer]]):
+        super().__init__(input_types, output_types)
+
+    def _inferNumLevels(self, inputs: List[VariableBuffer],
+                        operatorRepresentation: OperatorRepresentation) -> List[int]:
+        return [2**(self.input_types[0].referencedType.typeWidth)]
+
+    def _inferSignedness(self, inputs: List[VariableBuffer],
+                         operatorRepresentation: OperatorRepresentation) -> List[bool]:
+        return [True]
+
+
 class ReluChecker(SignPropTypeChecker):
 
     def __init__(self, input_types: Sequence[Type[Pointer]], output_types: Sequence[Type[Pointer]]):
diff --git a/Deeploy/Targets/Neureka/Engine.py b/Deeploy/Targets/Neureka/Engine.py
index 2585b1a688..d7745bd11c 100644
--- a/Deeploy/Targets/Neureka/Engine.py
+++ b/Deeploy/Targets/Neureka/Engine.py
@@ -2,8 +2,9 @@
 #
 # SPDX-License-Identifier: Apache-2.0
 
-from typing import List
+from typing import Any, List
 
+import numpy as np
 import onnx_graphsurgeon as gs
 
 from Deeploy.DeeployTypes import DeploymentEngine, NodeMapper
@@ -53,30 +54,95 @@ def __init__(self,
         self.enable3x3 = enable3x3
         self.enableStrides = enableStrides
 
-    def isDenseConv(self, node) -> bool:
+    @staticmethod
+    def _isSupportedConvNode(node: gs.Node) -> bool:
+        # Common N-EUREKA preconditions for all convolution flavors. Keep this
+        # structural: engine coloring runs before Deeploy has reliable type info.
         return node.op in ["Conv", "RequantizedConv"] and \
+            len(node.inputs) > 1 and \
             isinstance(node.inputs[1], gs.Constant) and \
-            node.attrs['kernel_shape'] == [3, 3] and \
-            node.attrs['dilations'] == [1, 1] and \
-            node.attrs['group'] == 1 and \
-            (node.attrs['strides'] == [1, 1] or self.enableStrides)
+            node.attrs.get('dilations') == [1, 1]
+
+    def _hasSupportedStrides(self, node: gs.Node) -> bool:
+        # Strided convolutions are opt-in because not every N-EUREKA setup enables
+        # them, while unit strides are always supported.
+        return node.attrs.get('strides') == [1, 1] or self.enableStrides
+
+    @staticmethod
+    def _isIntegerDtype(dtype: Any) -> bool:
+        # ONNX-GraphSurgeon may expose either numpy dtypes/classes or plain ONNX
+        # enum integers depending on the graph loading path. Treat unknown or
+        # missing dtypes as inconclusive instead of rejecting the node outright.
+        if dtype is None:
+            return False
+
+        try:
+            return np.issubdtype(dtype, np.integer)
+        except TypeError:
+            return dtype in {
+                2,  # UINT8
+                3,  # INT8
+                4,  # UINT16
+                5,  # INT16
+                6,  # INT32
+                7,  # INT64
+                12,  # UINT32
+                13,  # UINT64
+            }
+
+    @classmethod
+    def _hasIntegerTensorType(cls, node: gs.Node) -> bool:
+        # Prefer real tensor metadata when it is available. This catches already
+        # integer-typed ONNX graphs without depending on exporter-specific names.
+        return any(cls._isIntegerDtype(getattr(tensor, "dtype", None)) for tensor in [*node.inputs, *node.outputs])
+
+    @staticmethod
+    def _hasQuantizedProvenance(node: gs.Node) -> bool:
+        # Some quantized Deeploy/PACT graphs still carry FLOAT ONNX annotations
+        # before type inference. In that case the stable signal is the provenance
+        # left by quantization/integerization passes on tensors or attributes.
+        quantizedMarkers = ("INTEGERIZE", "QUANT", "REQUANT", "PACT")
+        names = [getattr(tensor, "name", "") for tensor in [*node.inputs, *node.outputs]]
+        attrNames = [str(name) for name in node.attrs.keys()]
+        return any(marker in name.upper() for name in [*names, *attrNames] for marker in quantizedMarkers)
+
+    @classmethod
+    def _hasNeurekaCompatibleSemantics(cls, node: gs.Node) -> bool:
+        # RequantizedConv is produced only after a quantized convolution/requant
+        # pattern was merged, so its op already carries the integer semantics that
+        # N-EUREKA expects. Plain Conv needs an additional signal; otherwise FP32
+        # convolutions from mixed models would be colored for N-EUREKA and fail
+        # later in parsing/binding.
+        if node.op == "RequantizedConv":
+            return True
+
+        return cls._hasIntegerTensorType(node) or cls._hasQuantizedProvenance(node)
+
+    def isDenseConv(self, node) -> bool:
+        return self._isSupportedConvNode(node) and \
+            self._hasNeurekaCompatibleSemantics(node) and \
+            node.attrs.get('kernel_shape') == [3, 3] and \
+            node.attrs.get('group', 1) == 1 and \
+            self._hasSupportedStrides(node)
 
     def isPWConv(self, node) -> bool:
-        return node.op in ["Conv", "RequantizedConv"] and \
-            isinstance(node.inputs[1], gs.Constant) and \
-            node.attrs['kernel_shape'] == [1, 1] and \
-            node.attrs['dilations'] == [1, 1] and \
-            (node.attrs['strides'] == [1, 1] or self.enableStrides)
+        return self._isSupportedConvNode(node) and \
+            self._hasNeurekaCompatibleSemantics(node) and \
+            node.attrs.get('kernel_shape') == [1, 1] and \
+            self._hasSupportedStrides(node)
 
     def isDWConv(self, node) -> bool:
-        return node.op in ["Conv", "RequantizedConv"] and \
-            isinstance(node.inputs[1], gs.Constant) and \
-            node.attrs['kernel_shape'] == [3, 3] and \
-            node.attrs['dilations'] == [1, 1] and \
-            node.attrs['group'] != 1 and \
-            (node.attrs['strides'] == [1, 1] or self.enableStrides)
+        return self._isSupportedConvNode(node) and \
+            self._hasNeurekaCompatibleSemantics(node) and \
+            node.attrs.get('kernel_shape') == [3, 3] and \
+            node.attrs.get('group', 1) != 1 and \
+            self._hasSupportedStrides(node)
 
     def canExecute(self, node: gs.Node) -> bool:
+        # Engine coloring runs before Deeploy type inference, and ONNX dtype
+        # annotations are not reliable for every quantized graph. Still, N-EUREKA
+        # is an integer accelerator, so the coloring must avoid pure FP Conv
+        # nodes and let the fallback engine handle them.
         if self.enable3x3:
             return self.isPWConv(node) or self.isDWConv(node) or self.isDenseConv(node)
         else:
diff --git a/Deeploy/Targets/Neureka/Platform.py b/Deeploy/Targets/Neureka/Platform.py
index e83f7f20f4..af8762c595 100644
--- a/Deeploy/Targets/Neureka/Platform.py
+++ b/Deeploy/Targets/Neureka/Platform.py
@@ -6,6 +6,7 @@
 
 import onnx_graphsurgeon as gs
 
+from Deeploy.AbstractDataTypes import PointerClass, VoidType
 from Deeploy.CommonExtensions.OptimizationPasses.TopologyOptimizationPasses.LoweringOptimizationPasses import \
     RequantizedGemmToPwPass
 from Deeploy.DeeployTypes import ConstantBuffer, NetworkContext, NodeTemplate, TopologyOptimizer
@@ -27,6 +28,12 @@ class NeurekaConstantBuffer(ConstantBuffer):
     allocTemplate = NodeTemplate("")
     deallocTemplate = NodeTemplate("")
 
+    def __init__(self, name: str = '', shape = [1], values = [0]):
+        super().__init__(name, shape, values)
+        # Some Neureka lowering paths inspect global constants before type inference
+        # has populated them with the final pointer type.
+        self._type = PointerClass(VoidType)
+
     def _bufferRepresentation(self):
         operatorRepresentation = super()._bufferRepresentation()
         operatorRepresentation["_memoryLevel"] = getattr(self, "_memoryLevel", None)
diff --git a/Deeploy/Targets/PULPOpen/Bindings.py b/Deeploy/Targets/PULPOpen/Bindings.py
index 2c78978e23..9882efe568 100644
--- a/Deeploy/Targets/PULPOpen/Bindings.py
+++ b/Deeploy/Targets/PULPOpen/Bindings.py
@@ -15,12 +15,14 @@
 from Deeploy.FutureExtension.Bindings.AutoFutureBinding import AutoFutureBinding
 from Deeploy.FutureExtension.CodeTransformationPasses.FutureCodeTransformation import FutureGeneration
 from Deeploy.MemoryLevelExtension.CodeTransformationPasses.Closure import MemoryAwareClosureGeneration
-from Deeploy.Targets.Generic.Templates import AddTemplate, ConcatTemplate, DequantTemplate, FloatReduceSumTemplate, \
-    GatherTemplate, QuantTemplate, RQSiGELUTemplate, SliceTemplate, iHardswishTemplate
-from Deeploy.Targets.Generic.TypeCheckers import AddChecker, ConcatChecker, ConvChecker, DequantChecker, \
-    GatherChecker, GELUChecker, GEMMChecker, HardswishChecker, LayerNormChecker, MatMulChecker, MulChecker, \
-    QuantChecker, ReduceMeanChecker, ReluChecker, ReshapeChecker, RQAddChecker, RQHardswishChecker, SGDChecker, \
-    SliceChecker, SoftmaxChecker, SoftmaxCrossEntropyLossChecker, TransposeChecker
+from Deeploy.Targets.Generic.Templates import AddTemplate, BatchNormalizationTemplate, ConcatTemplate, \
+    DequantTemplate, FloatPadTemplate, FloatReduceLogSumExpTemplate, FloatReduceSumTemplate, GatherTemplate, \
+    PadTemplate, QuantTemplate, RQSiGELUTemplate, SliceTemplate, iHardswishTemplate
+from Deeploy.Targets.Generic.TypeCheckers import AddChecker, BatchNormChecker, ConcatChecker, ConvChecker, \
+    DequantChecker, GatherChecker, GELUChecker, GEMMChecker, HardswishChecker, LayerNormChecker, MatMulChecker, \
+    MulChecker, PadChecker, QuantChecker, ReduceLogSumExpChecker, ReduceMeanChecker, ReluChecker, ReshapeChecker, \
+    RQAddChecker, RQHardswishChecker, SGDChecker, SliceChecker, SoftmaxChecker, SoftmaxCrossEntropyLossChecker, \
+    TransposeChecker
 from Deeploy.Targets.PULPOpen.CodeTransformationPasses.PULPClusterSynch import PULPSynchCoresPass
 from Deeploy.Targets.PULPOpen.CodeTransformationPasses.PULPClusterTiling import PULPClusterTiling
 from Deeploy.Targets.PULPOpen.CodeTransformationPasses.PULPL3Tiling import PULPL3Tiling
@@ -29,12 +31,13 @@
 from Deeploy.Targets.PULPOpen.DataTypes import PULPDMAFuture
 from Deeploy.Targets.PULPOpen.DMA.L3Dma import l3DmaHack
 from Deeploy.Targets.PULPOpen.DMA.MchanDma import MchanDma
-from Deeploy.Targets.PULPOpen.Templates import ConvTemplate, DMASliceTemplate, FloatAddTemplate, FloatConvTemplate, \
-    FloatGELUTemplate, FloatGemmTemplate, FloatLayernormTemplate, FloatMatMulTemplate, FloatMaxPoolTemplate, \
-    FloatMulTemplate, FloatReduceMeanTemplate, FloatReluTemplate, FloatSoftmaxTemplate, GEMMTemplate, \
-    MatrixVectorTemplate, MaxPoolTemplate, MulTemplate, ReduceMeanTemplate, RequantShiftTemplate, ReshapeTemplate, \
-    RQAddTemplate, RQSiHardswishTemplate, SGDTemplate, SoftmaxCrossEntropyLossTemplate, TallGEMMTemplate, \
-    TransposeTemplate, UniformRequantShiftTemplate, iRMSNormTemplate, iSoftmaxTemplate
+from Deeploy.Targets.PULPOpen.Templates import ConvTemplate, ConvTransposeTemplate, DMASliceTemplate, \
+    FloatAddTemplate, FloatConvTemplate, FloatGELUTemplate, FloatGemmTemplate, FloatLayernormTemplate, \
+    FloatMatMulTemplate, FloatMaxPoolTemplate, FloatMulTemplate, FloatReduceMeanTemplate, FloatReluTemplate, \
+    FloatSoftmaxTemplate, GEMMTemplate, MatrixVectorTemplate, MaxPoolTemplate, MulTemplate, ReduceMeanTemplate, \
+    RequantShiftTemplate, ReshapeTemplate, RQAddTemplate, RQSiHardswishTemplate, SGDTemplate, \
+    SoftmaxCrossEntropyLossTemplate, TallGEMMTemplate, TransposeTemplate, UniformRequantShiftTemplate, \
+    iRMSNormTemplate, iSoftmaxTemplate
 from Deeploy.Targets.PULPOpen.TypeCheckers import PULPConvChecker, PULPLinearChecker, PULPMaxPoolChecker, \
     PULPRequantShiftChecker
 from Deeploy.TilingExtension.CodeTransformationPasses.TilingVariableReplacement import TilingVariableReplacement, \
@@ -176,6 +179,35 @@
                 ReshapeTemplate.referenceTemplate, SkipTransformer) for type in IntegerDataTypes + FloatDataTypes
 ]
 
+PULPBatchNormBindings = [
+    NodeBinding(
+        BatchNormChecker(
+            [PointerClass(type),
+             PointerClass(type),
+             PointerClass(type),
+             PointerClass(type),
+             PointerClass(type)], [PointerClass(type)]), BatchNormalizationTemplate.referenceTemplate, ForkTransformer)
+    for type in FloatDataTypes
+]
+
+PULPPad1DBindings = [
+    NodeBinding(PadChecker([PointerClass(type)], [PointerClass(type)]), PadTemplate.reference1DTemplate,
+                ForkTransformer) for type in IntegerDataTypes
+] + [
+    NodeBinding(PadChecker([PointerClass(type)], [PointerClass(type)]), FloatPadTemplate.reference1DTemplate,
+                ForkTransformer) for type in FloatDataTypes
+]
+
+PULPPad2DBindings = [
+    NodeBinding(PadChecker([PointerClass(type)], [PointerClass(type)]), PadTemplate.reference2DTemplate,
+                ForkTransformer) for type in IntegerDataTypes
+] + [
+    NodeBinding(
+        PadChecker([PointerClass(float32_t), PointerClass(float32_t),
+                    PointerClass(float32_t)], [PointerClass(float32_t)]), FloatPadTemplate.reference2DTemplate,
+        ForkTransformer)
+]
+
 PULPRQAddBindings = [
     NodeBinding(RQAddChecker([PointerClass(_type), PointerClass(_type2)], [PointerClass(_type3)]),
                 RQAddTemplate.referenceTemplate, ForkTransformer)
@@ -249,6 +281,15 @@
         ForkTransformer) for float_type in FloatDataTypes
 ]
 
+PULPFloatConvTranspose2DBindings = [
+    NodeBinding(ConvChecker(
+        [PointerClass(type), PointerClass(type), PointerClass(type)], [PointerClass(type)]),
+                ConvTransposeTemplate.reference2DTemplate, ForkTransformer) for type in FloatDataTypes
+] + [
+    NodeBinding(ConvChecker([PointerClass(type), PointerClass(type)], [PointerClass(type)]),
+                ConvTransposeTemplate.reference2DTemplate, ForkTransformer) for type in FloatDataTypes
+]
+
 PULPRQSMatrixVecBindings = [
     NodeBinding(
         PULPLinearChecker([PointerClass(type1),
@@ -322,6 +363,11 @@
                 FloatReduceSumTemplate.referenceTemplate, ClusterTransformer)
 ]
 
+PULPReduceLogSumExpBindings = [
+    NodeBinding(ReduceLogSumExpChecker([PointerClass(float32_t)], [PointerClass(float32_t)]),
+                FloatReduceLogSumExpTemplate.referenceTemplate, ForkTransformer)
+]
+
 PULPUniformRQSBindings = [
     NodeBinding(
         PULPRequantShiftChecker([PointerClass(type), PointerClass(int32_t),
@@ -382,6 +428,9 @@
 PULPConcatBindings = [
     NodeBinding(ConcatChecker([PointerClass(type), PointerClass(type)], [PointerClass(type)]),
                 ConcatTemplate.referenceTemplate, ClusterTransformer) for type in IntegerDataTypes
+] + [
+    NodeBinding(ConcatChecker([PointerClass(float32_t), PointerClass(float32_t)], [PointerClass(float32_t)]),
+                ConcatTemplate.referenceTemplate, ClusterTransformer)
 ]
 
 PULPiRMSNormBindings = [
diff --git a/Deeploy/Targets/PULPOpen/DMA/L3Dma.py b/Deeploy/Targets/PULPOpen/DMA/L3Dma.py
index 6c2aa30811..375ad75958 100644
--- a/Deeploy/Targets/PULPOpen/DMA/L3Dma.py
+++ b/Deeploy/Targets/PULPOpen/DMA/L3Dma.py
@@ -29,7 +29,7 @@ class L3Dma(AsyncDma):
     _transferTemplates = {
         2:
             NodeTemplate(
-                "pi_cl_ram_copy_2d(get_ram_ptr(), ${ext}, ${loc}, ${transfer_size}, ${stride}, ${length}, ${ext2loc}, &${future});"
+                "pi_cl_ram_copy_2d(get_ram_ptr(), (uint32_t) ${ext}, ${loc}, ${transfer_size}, ${stride}, ${length}, ${ext2loc}, &${future});"
             )
     }
     _waitingStrategy = PerTensorWaitingStrategy(L3DmaFuture)
diff --git a/Deeploy/Targets/PULPOpen/Parsers.py b/Deeploy/Targets/PULPOpen/Parsers.py
index 5c5951eaba..b71b0aead6 100644
--- a/Deeploy/Targets/PULPOpen/Parsers.py
+++ b/Deeploy/Targets/PULPOpen/Parsers.py
@@ -8,8 +8,8 @@
 import onnx_graphsurgeon as gs
 
 from Deeploy.DeeployTypes import NetworkContext
-from Deeploy.Targets.Generic.Parsers import Conv2DParser, GEMMParser, ReduceMeanParser, RQSConv1DParser, \
-    RQSConv2DParser, RQSParserInterface
+from Deeploy.Targets.Generic.Parsers import Conv2DParser, ConvTranspose2DParser, GEMMParser, ReduceMeanParser, \
+    RQSConv1DParser, RQSConv2DParser, RQSParserInterface
 
 
 class PULPConv2DParser(RQSConv2DParser):
@@ -90,6 +90,8 @@ def parseNode(self, node: gs.Node) -> (bool):
             self.operatorRepresentation['padding_x_left'] = int(self.operatorRepresentation['pads'][1])
             self.operatorRepresentation['padding_y_bottom'] = int(self.operatorRepresentation['pads'][2])
             self.operatorRepresentation['padding_x_right'] = int(self.operatorRepresentation['pads'][3])
+            if "n_cores" in node.attrs:
+                self.operatorRepresentation["n_cores"] = int(node.attrs["n_cores"])
 
             return ret
         return False
@@ -148,6 +150,8 @@ def parseNode(self, node: gs.Node) -> (bool):
             self.operatorRepresentation['padding_x_left'] = int(self.operatorRepresentation['pads'][1])
             self.operatorRepresentation['padding_y_bottom'] = int(self.operatorRepresentation['pads'][2])
             self.operatorRepresentation['padding_x_right'] = int(self.operatorRepresentation['pads'][3])
+            if "n_cores" in node.attrs:
+                self.operatorRepresentation["n_cores"] = int(node.attrs["n_cores"])
 
             return ret
         return False
@@ -182,6 +186,51 @@ def parseNodeCtxt(self,
         return ctxt, False
 
 
+class PULPConvTranspose2DParser(ConvTranspose2DParser):
+
+    def __init__(self):
+        super().__init__()
+
+    def parseNode(self, node: gs.Node) -> bool:
+        wellFormed = super().parseNode(node)
+
+        if wellFormed:
+            # ConvTranspose kernels on PULP/Siracusa are emitted in CHW layout.
+            # This must be visible before broadcast() runs, otherwise output shapes
+            # are reinterpreted as NHWC and the last dimension gets clobbered.
+            self.operatorRepresentation['channels_first'] = True
+            self.operatorRepresentation['padding_y_top'] = int(self.operatorRepresentation['pads'][0])
+            self.operatorRepresentation['padding_x_left'] = int(self.operatorRepresentation['pads'][1])
+            self.operatorRepresentation['padding_y_bottom'] = int(self.operatorRepresentation['pads'][2])
+            self.operatorRepresentation['padding_x_right'] = int(self.operatorRepresentation['pads'][3])
+
+        return wellFormed
+
+    def parseNodeCtxt(self,
+                      ctxt: NetworkContext,
+                      node: gs.Node,
+                      channels_first: bool = True) -> Tuple[NetworkContext, bool]:
+        if node.attrs.get("channels_first", True) == False:
+            return ctxt, False
+
+        newCtxt, ret = super().parseNodeCtxt(ctxt, node, True)
+
+        if ret:
+            self.operatorRepresentation['data_in'] = newCtxt.lookup(node.inputs[0].name).name
+            self.operatorRepresentation['weight'] = newCtxt.lookup(node.inputs[1].name).name
+
+            if len(node.inputs) == 2:
+                self.operatorRepresentation["has_bias"] = "false"
+                self.operatorRepresentation["bias"] = "NULL"
+            else:
+                self.operatorRepresentation["has_bias"] = "true"
+                self.operatorRepresentation["bias"] = newCtxt.lookup(node.inputs[2].name).name
+
+            return newCtxt, True
+
+        return ctxt, False
+
+
 class PULPDWConv1DParser(RQSConv1DParser):
 
     def __init__(self, noBiasHoisting = True):
diff --git a/Deeploy/Targets/PULPOpen/Platform.py b/Deeploy/Targets/PULPOpen/Platform.py
index f13e6451fb..fe0fdd737f 100644
--- a/Deeploy/Targets/PULPOpen/Platform.py
+++ b/Deeploy/Targets/PULPOpen/Platform.py
@@ -11,20 +11,20 @@
     NodeTemplate, StructBuffer, TopologyOptimizer, TransientBuffer, VariableBuffer
 from Deeploy.MemoryLevelExtension.MemoryLevels import MemoryHierarchy, MemoryLevel
 from Deeploy.MemoryLevelExtension.NetworkDeployers.MemoryLevelDeployer import MemoryPlatform, MemoryPlatformWrapper
-from Deeploy.Targets.Generic.Bindings import BasicGEMMBindings, BasicPad1DBindings, BasicPad2DBindings, \
-    BasicRQIntegerDivBinding
-from Deeploy.Targets.Generic.Layers import AddLayer, ConcatLayer, ConvLayer, GatherLayer, GELUGradLayer, GELULayer, \
-    GEMMLayer, LayerNormGradLayer, LayerNormLayer, MatMulLayer, MaxPoolLayer, MulLayer, PadLayer, QuantLayer, \
-    ReduceMeanLayer, ReduceSumLayer, ReluLayer, RequantShiftLayer, ReshapeLayer, RQIntegerDivLayer, RQSiGELULayer, \
-    RQSiHardswishLayer, SGDLayer, SliceLayer, SoftmaxCrossEntropyLossGradLayer, SoftmaxCrossEntropyLossLayer, \
-    SoftmaxGradLayer, SoftmaxLayer, TransposeLayer, iHardswishLayer, iRMSNormLayer
-from Deeploy.Targets.Generic.Parsers import AddParser, ConcatParser, DequantParser, FlattenParser, GatherParser, \
-    GELUGradParser, GELUParser, GEMMParser, LayerNormGradParser, LayerNormParser, MatMulParser, MaxPool1DParser, \
-    MaxPool2DParser, MulParser, Pad1DParser, Pad2DParser, QuantParser, ReduceSumParser, ReluParser, \
-    RequantShiftParser, ReshapeParser, RQAddParser, RQIntegerDivParser, RQSiGELUParser, RQSiHardswishParser, \
-    SGDParser, SliceParser, SoftmaxCrossEntropyLossGradParser, SoftmaxCrossEntropyLossParser, SoftmaxGradParser, \
-    SoftmaxParser, TransposeParser, UniformRequantShiftParser, UnsqueezeParser, iHardswishParser, iRMSNormParser, \
-    iSoftmaxParser
+from Deeploy.Targets.Generic.Bindings import BasicGEMMBindings, BasicRQIntegerDivBinding
+from Deeploy.Targets.Generic.Layers import AddLayer, BatchNormalizationLayer, ConcatLayer, ConvLayer, \
+    ConvTransposeLayer, GatherLayer, GELUGradLayer, GELULayer, GEMMLayer, LayerNormGradLayer, LayerNormLayer, \
+    MatMulLayer, MaxPoolLayer, MulLayer, PadLayer, QuantLayer, ReduceLogSumExpLayer, ReduceMeanLayer, ReduceSumLayer, \
+    ReluLayer, RequantShiftLayer, ReshapeLayer, RQIntegerDivLayer, RQSiGELULayer, RQSiHardswishLayer, SGDLayer, \
+    SliceLayer, SoftmaxCrossEntropyLossGradLayer, SoftmaxCrossEntropyLossLayer, SoftmaxGradLayer, SoftmaxLayer, \
+    TransposeLayer, iHardswishLayer, iRMSNormLayer
+from Deeploy.Targets.Generic.Parsers import AddParser, BatchNormParser, ConcatParser, DequantParser, FlattenParser, \
+    GatherParser, GELUGradParser, GELUParser, GEMMParser, LayerNormGradParser, LayerNormParser, MatMulParser, \
+    MaxPool1DParser, MaxPool2DParser, MulParser, Pad1DParser, Pad2DParser, QuantParser, ReduceLogSumExpParser, \
+    ReduceSumParser, ReluParser, RequantShiftParser, ReshapeParser, RQAddParser, RQIntegerDivParser, RQSiGELUParser, \
+    RQSiHardswishParser, SGDParser, SliceParser, SoftmaxCrossEntropyLossGradParser, SoftmaxCrossEntropyLossParser, \
+    SoftmaxGradParser, SoftmaxParser, TransposeParser, UniformRequantShiftParser, UnsqueezeParser, iHardswishParser, \
+    iRMSNormParser, iSoftmaxParser
 from Deeploy.Targets.Generic.Templates import AllocateTemplate as BasicAllocateTemplate
 from Deeploy.Targets.Generic.TopologyOptimizationPasses.Passes import DequantPatternPass, IntegerDivRequantMergePass, \
     MergeConstAddAndRequantPass, MergeTrueIntegerDivRequantShiftPass, QuantPatternPass, RQSSplitPass, \
@@ -32,24 +32,25 @@
 from Deeploy.Targets.PULPOpen.Bindings import BasicDequantBindings, BasicQuantBindings, PULPDMASliceBindings, \
     PULPDWConv1DBinding
 from Deeploy.Targets.PULPOpen.Layers import PULPRQSConvLayer, PULPRQSGEMMLayer
-from Deeploy.Targets.PULPOpen.Parsers import PULPConv1DParser, PULPConv2DParser, PULPDWConv1DParser, \
-    PULPDWConv2DParser, PULPFPConv2DParser, PULPFPDWConv2DParser, PULPGEMMParser, PULPMatrixVecParser, \
-    PULPReduceMeanParser, PULPTallGEMMParser
+from Deeploy.Targets.PULPOpen.Parsers import PULPConv1DParser, PULPConv2DParser, PULPConvTranspose2DParser, \
+    PULPDWConv1DParser, PULPDWConv2DParser, PULPFPConv2DParser, PULPFPDWConv2DParser, PULPGEMMParser, \
+    PULPMatrixVecParser, PULPReduceMeanParser, PULPTallGEMMParser
 from Deeploy.Targets.PULPOpen.Templates import AllocateTemplate, FreeTemplate
-from Deeploy.Targets.PULPOpen.Tiler import PULPAddTilingReadyBindings, PULPConcatTilingReadyBindings, \
-    PULPConv2DTilingReadyBindings, PULPDWConv2DTilingReadyBindings, PULPFlattenTilingReadyBindings, \
-    PULPFPGELUGradTilingReadyBindings, PULPFPGELUTilingReadyBindings, PULPFPGEMMTilingReadyBindings, \
-    PULPGatherTilingReadyBindings, PULPiHardswishTilingReadyBindings, PULPiRMSNormTilingReadyBindings, \
-    PULPiRQSGELUTilingReadyBindings, PULPLayernormGradTilingReadyBindings, PULPLayernormTilingReadyBindings, \
-    PULPMatMulTilingReadyBindings, PULPMaxPool1DTilingReadyBindings, PULPMaxPool2DTilingReadyBindings, \
-    PULPMulTilingReadyBindings, PULPReduceMeanTilingReadyBindings, PULPReduceSumTilingReadyBindings, \
-    PULPReluTilingReadyBindings, PULPRQAddTilingReadyBindings, PULPRQSConv1DTilingReadyBindings, \
-    PULPRQSConv2DTilingReadyBindings, PULPRQSDWConv2DTilingReadyBindings, PULPRQSGEMMTilingReadyBindings, \
-    PULPRQSiHardswishTilingReadyBindings, PULPRQSMatrixVecTilingReadyBindings, PULPRQSTallGEMMTilingReadyBindings, \
-    PULPRQSTilingReadyBindings, PULPSGDTilingReadyBindings, PULPSliceTilingReadyBindings, \
-    PULPSoftmaxCrossEntropyGradTilingReadyBindings, PULPSoftmaxCrossEntropyTilingReadyBindings, \
-    PULPSoftmaxGradTilingReadyBindings, PULPSoftmaxTilingReadyBindings, PULPTransposeTilingReadyBindings, \
-    PULPUniformRQSTilingReadyBindings
+from Deeploy.Targets.PULPOpen.Tiler import PULPAddTilingReadyBindings, PULPBatchNormTilingReadyBindings, \
+    PULPConcatTilingReadyBindings, PULPConv2DTilingReadyBindings, PULPConvTranspose2DTilingReadyBindings, \
+    PULPDWConv2DTilingReadyBindings, PULPFlattenTilingReadyBindings, PULPFPGELUGradTilingReadyBindings, \
+    PULPFPGELUTilingReadyBindings, PULPFPGEMMTilingReadyBindings, PULPGatherTilingReadyBindings, \
+    PULPiHardswishTilingReadyBindings, PULPiRMSNormTilingReadyBindings, PULPiRQSGELUTilingReadyBindings, \
+    PULPLayernormGradTilingReadyBindings, PULPLayernormTilingReadyBindings, PULPMatMulTilingReadyBindings, \
+    PULPMaxPool1DTilingReadyBindings, PULPMaxPool2DTilingReadyBindings, PULPMulTilingReadyBindings, \
+    PULPPad1DTilingReadyBindings, PULPPad2DTilingReadyBindings, PULPReduceLogSumExpTilingReadyBindings, \
+    PULPReduceMeanTilingReadyBindings, PULPReduceSumTilingReadyBindings, PULPReluTilingReadyBindings, \
+    PULPRQAddTilingReadyBindings, PULPRQSConv1DTilingReadyBindings, PULPRQSConv2DTilingReadyBindings, \
+    PULPRQSDWConv2DTilingReadyBindings, PULPRQSGEMMTilingReadyBindings, PULPRQSiHardswishTilingReadyBindings, \
+    PULPRQSMatrixVecTilingReadyBindings, PULPRQSTallGEMMTilingReadyBindings, PULPRQSTilingReadyBindings, \
+    PULPSGDTilingReadyBindings, PULPSliceTilingReadyBindings, PULPSoftmaxCrossEntropyGradTilingReadyBindings, \
+    PULPSoftmaxCrossEntropyTilingReadyBindings, PULPSoftmaxGradTilingReadyBindings, PULPSoftmaxTilingReadyBindings, \
+    PULPTransposeTilingReadyBindings, PULPUniformRQSTilingReadyBindings
 from Deeploy.Targets.PULPOpen.TopologyOptimizationPasses.Passes import PULPAddRequantMergePass, \
     PULPConvRequantMergePass, PULPGEMMRequantMergePass, PULPMatMulRequantMergePass
 
@@ -60,8 +61,8 @@
 GELUGradMapper = NodeMapper(GELUGradParser(), PULPFPGELUGradTilingReadyBindings)
 GatherMapper = NodeMapper(GatherParser(), PULPGatherTilingReadyBindings)
 MulMapper = NodeMapper(MulParser(), PULPMulTilingReadyBindings)
-Pad1DMapper = NodeMapper(Pad1DParser(), BasicPad1DBindings)
-Pad2DMapper = NodeMapper(Pad2DParser(), BasicPad2DBindings)
+Pad1DMapper = NodeMapper(Pad1DParser(), PULPPad1DTilingReadyBindings)
+Pad2DMapper = NodeMapper(Pad2DParser(), PULPPad2DTilingReadyBindings)
 ReshapeMapper = NodeMapper(ReshapeParser(), PULPFlattenTilingReadyBindings)
 TransposeMapper = NodeMapper(TransposeParser(), PULPTransposeTilingReadyBindings)
 UnsqueezeMapper = NodeMapper(UnsqueezeParser(), PULPFlattenTilingReadyBindings)
@@ -70,6 +71,7 @@
 UniformRequantShiftMapper = NodeMapper(UniformRequantShiftParser(), PULPUniformRQSTilingReadyBindings)
 
 ReduceMeanMapper = NodeMapper(PULPReduceMeanParser(), PULPReduceMeanTilingReadyBindings)
+ReduceLogSumExpMapper = NodeMapper(ReduceLogSumExpParser(), PULPReduceLogSumExpTilingReadyBindings)
 ReduceSumMapper = NodeMapper(ReduceSumParser(), PULPReduceSumTilingReadyBindings)
 MatMulMapper = NodeMapper(MatMulParser(), PULPMatMulTilingReadyBindings)
 RQIntegerDivMapper = NodeMapper(RQIntegerDivParser(), [BasicRQIntegerDivBinding])
@@ -81,6 +83,7 @@
 Conv2DMapper = NodeMapper(PULPConv2DParser(), PULPRQSConv2DTilingReadyBindings)
 FPDWConv2DMapper = NodeMapper(PULPFPDWConv2DParser(), PULPDWConv2DTilingReadyBindings)
 DWConv2DMapper = NodeMapper(PULPDWConv2DParser(), PULPRQSDWConv2DTilingReadyBindings)
+ConvTranspose2DMapper = NodeMapper(PULPConvTranspose2DParser(), PULPConvTranspose2DTilingReadyBindings)
 GEMMMapper = NodeMapper(PULPGEMMParser(), PULPRQSGEMMTilingReadyBindings)
 FloatGEMMMapper = NodeMapper(GEMMParser(), PULPFPGEMMTilingReadyBindings)
 MatrixVecMapper = NodeMapper(PULPMatrixVecParser(), PULPRQSMatrixVecTilingReadyBindings)
@@ -111,8 +114,10 @@
 QuantMapper = NodeMapper(QuantParser(), BasicQuantBindings)
 DequantMapper = NodeMapper(DequantParser(), BasicDequantBindings)
 GEMMDequantMapper = NodeMapper(PULPGEMMParser(), BasicGEMMBindings)
+BatchNormalizationMapper = NodeMapper(BatchNormParser(), PULPBatchNormTilingReadyBindings)
 PULPMapping = {
     'Conv': ConvLayer([FPConv2DMapper, FPDWConv2DMapper]),
+    'ConvTranspose': ConvTransposeLayer([ConvTranspose2DMapper]),
     'RequantizedConv': PULPRQSConvLayer([Conv2DMapper, DWConv2DMapper, Conv1DMapper, DWConv1DMapper]),
     'RequantizedGemm': PULPRQSGEMMLayer([MatrixVecMapper, TallGEMMMapper, GEMMMapper]),
     'Gemm': GEMMLayer([FloatGEMMMapper, GEMMDequantMapper]),
@@ -128,6 +133,7 @@
     'iSoftmax': SoftmaxLayer([Softmax_int8_Mapper]),
     'Softmax': SoftmaxLayer([SoftmaxMapper]),
     'ReduceMean': ReduceMeanLayer([ReduceMeanMapper]),
+    'ReduceLogSumExp': ReduceLogSumExpLayer([ReduceLogSumExpMapper]),
     'ReduceSum': ReduceSumLayer([ReduceSumMapper]),
     'RequantShift': RequantShiftLayer([UniformRequantShiftMapper, RequantShiftMapper]),
     'Add': AddLayer([AddMapper]),
@@ -148,6 +154,7 @@
     'RequantizediHardswish': RQSiHardswishLayer([RQSiHardswishMapper]),
     'Quant': QuantLayer([QuantMapper]),
     'Dequant': QuantLayer([DequantMapper]),
+    'BatchNormalization': BatchNormalizationLayer([BatchNormalizationMapper]),
     'SoftmaxGrad': SoftmaxGradLayer([SoftmaxGradMapper]),
     'SoftmaxCrossEntropyLoss': SoftmaxCrossEntropyLossLayer([SoftmaxCrossEntropyLossMapper]),
     'SoftmaxCrossEntropyLossGrad': SoftmaxCrossEntropyLossGradLayer([SoftmaxCrossEntropyLossGradMapper]),
diff --git a/Deeploy/Targets/PULPOpen/Templates/ConvTransposeTemplate.py b/Deeploy/Targets/PULPOpen/Templates/ConvTransposeTemplate.py
new file mode 100644
index 0000000000..5ef8d5855e
--- /dev/null
+++ b/Deeploy/Targets/PULPOpen/Templates/ConvTransposeTemplate.py
@@ -0,0 +1,36 @@
+# SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from Deeploy.DeeployTypes import NodeTemplate
+
+reference2DTemplate = NodeTemplate("""
+<%
+batchOffsetIn = ch_im_in * dim_im_in_x * dim_im_in_y
+batchOffsetOut = ch_im_out * dim_im_out_x * dim_im_out_y
+%>
+
+// 2D FP ConvTranspose CHW on PULPOpen/Siracusa (Name: ${nodeName}, Op: ${nodeOp})
+${data_in_type.typeName} ref_${data_out}_${data_in} = ${data_in};
+${data_out_type.typeName} ref_${data_out}_${data_out} = ${data_out};
+
+for (uint32_t n=0; n<${batch}; ++n) {
+    PULP_ConvTranspose2d_fp32_fp32_fp32_CHW(
+        ref_${data_out}_${data_in},
+        ${ch_im_in}, ${dim_im_in_x}, ${dim_im_in_y},
+        ${weight},
+        ${ch_im_out}, ${group},
+        ${dim_kernel_x}, ${dim_kernel_y},
+        ${stride_x}, ${stride_y},
+        ${dilation_x}, ${dilation_y},
+        ${padding_y_top}, ${padding_y_bottom},
+        ${padding_x_left}, ${padding_x_right},
+        ${bias}, ${has_bias},
+        ref_${data_out}_${data_out},
+        ${dim_im_out_x}, ${dim_im_out_y}
+    );
+
+    ref_${data_out}_${data_in} += ${batchOffsetIn};
+    ref_${data_out}_${data_out} += ${batchOffsetOut};
+}
+""")
diff --git a/Deeploy/Targets/PULPOpen/Templates/FloatConvTemplate.py b/Deeploy/Targets/PULPOpen/Templates/FloatConvTemplate.py
index bfa893db94..bd4f108ab3 100644
--- a/Deeploy/Targets/PULPOpen/Templates/FloatConvTemplate.py
+++ b/Deeploy/Targets/PULPOpen/Templates/FloatConvTemplate.py
@@ -18,10 +18,12 @@ def __init__(self, templateStr):
     def computeTransientBuffersSize(
             ctxt: NetworkContext,
             operatorRepresentation: OperatorRepresentation) -> List[Tuple[str, Union[int, IntVar]]]:
+        n_cores = int(operatorRepresentation["n_cores"])
+        # Conservative fallback used during Autoencoder2D debug:
+        # n_cores = max(int(operatorRepresentation.get("n_cores", 8)), 8)
         # Memory allocation for the im2col buffer can be dynamic, based on the number of cores.
-        im2col_dim = (operatorRepresentation["weight_type"].typeWidth //
-                      8) * operatorRepresentation["n_cores"] * operatorRepresentation[
-                          'ch_im_in'] * operatorRepresentation['dim_kernel_x'] * operatorRepresentation['dim_kernel_y']
+        im2col_dim = (operatorRepresentation["weight_type"].typeWidth // 8) * n_cores * operatorRepresentation[
+            'ch_im_in'] * operatorRepresentation['dim_kernel_x'] * operatorRepresentation['dim_kernel_y']
 
         im2col_name = operatorRepresentation['nodeName'] + "_buffer"
 
@@ -48,9 +50,12 @@ def computeTransientBuffersSize(
             ctxt: NetworkContext,
             operatorRepresentation: OperatorRepresentation) -> List[Tuple[str, Union[int, IntVar]]]:
 
+        n_cores = int(operatorRepresentation["n_cores"])
+        # Conservative fallback used during Autoencoder2D debug:
+        # n_cores = max(int(operatorRepresentation.get("n_cores", 8)), 8)
         # Memory allocation for the im2col buffer can be dynamic, based on the number of cores.
-        im2col_dim = (operatorRepresentation["weight_type"].typeWidth // 8) * operatorRepresentation[
-            "n_cores"] * operatorRepresentation['dim_kernel_x'] * operatorRepresentation['dim_kernel_y']
+        im2col_dim = (operatorRepresentation["weight_type"].typeWidth //
+                      8) * n_cores * operatorRepresentation['dim_kernel_x'] * operatorRepresentation['dim_kernel_y']
 
         im2col_name = operatorRepresentation['nodeName'] + "_buffer"
 
diff --git a/Deeploy/Targets/PULPOpen/Templates/FloatGemmTemplate.py b/Deeploy/Targets/PULPOpen/Templates/FloatGemmTemplate.py
index 59499706e5..917e82a822 100644
--- a/Deeploy/Targets/PULPOpen/Templates/FloatGemmTemplate.py
+++ b/Deeploy/Targets/PULPOpen/Templates/FloatGemmTemplate.py
@@ -4,7 +4,8 @@
 
 from typing import Dict, List, Tuple
 
-from Deeploy.AbstractDataTypes import float32_tPtr
+from Deeploy.AbstractDataTypes import PointerClass
+from Deeploy.CommonExtensions.DataTypes import float32_t
 from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation
 
 
@@ -19,7 +20,7 @@ def alignToContext(self, ctxt: NetworkContext,
         if 'C' not in operatorRepresentation or operatorRepresentation['C'] is None:
             # No bias case - set C to NULL and provide a default type
             operatorRepresentation['C'] = None
-            operatorRepresentation['C_type'] = float32_tPtr  # Default to fp32 type
+            operatorRepresentation['C_type'] = PointerClass(float32_t)  # Default to fp32 type
             operatorRepresentation['C_batched'] = False
 
         return ctxt, operatorRepresentation, []
@@ -76,4 +77,4 @@ def alignToContext(self, ctxt: NetworkContext,
 
     ref_${data_out}_${data_out} += ${M} * ${O};
 }
-""")
\ No newline at end of file
+""")
diff --git a/Deeploy/Targets/PULPOpen/Templates/FloatMulTemplate.py b/Deeploy/Targets/PULPOpen/Templates/FloatMulTemplate.py
index ced6c3cbcf..1d1d6a70bc 100644
--- a/Deeploy/Targets/PULPOpen/Templates/FloatMulTemplate.py
+++ b/Deeploy/Targets/PULPOpen/Templates/FloatMulTemplate.py
@@ -14,18 +14,33 @@
 uint32_t ${nodeName}_end = MIN(${nodeName}_start + ${nodeName}_chunk, (uint32_t) ${size});
 
 if (${nodeName}_start < ${nodeName}_end) {
+% if sizeB == 1:
     float32_t ${nodeName}_scalar = ${B}[0];
+% endif
     uint32_t ${nodeName}_unroll_end = ${nodeName}_start + ((${nodeName}_end - ${nodeName}_start) / 6) * 6;
     for (uint32_t i = ${nodeName}_start; i < ${nodeName}_unroll_end; i += 6) {
+% if sizeB == 1:
         ${C}[i + 0] = ${A}[i + 0] * ${nodeName}_scalar;
         ${C}[i + 1] = ${A}[i + 1] * ${nodeName}_scalar;
         ${C}[i + 2] = ${A}[i + 2] * ${nodeName}_scalar;
         ${C}[i + 3] = ${A}[i + 3] * ${nodeName}_scalar;
         ${C}[i + 4] = ${A}[i + 4] * ${nodeName}_scalar;
         ${C}[i + 5] = ${A}[i + 5] * ${nodeName}_scalar;
+% else:
+        ${C}[i + 0] = ${A}[i + 0] * ${B}[i + 0];
+        ${C}[i + 1] = ${A}[i + 1] * ${B}[i + 1];
+        ${C}[i + 2] = ${A}[i + 2] * ${B}[i + 2];
+        ${C}[i + 3] = ${A}[i + 3] * ${B}[i + 3];
+        ${C}[i + 4] = ${A}[i + 4] * ${B}[i + 4];
+        ${C}[i + 5] = ${A}[i + 5] * ${B}[i + 5];
+% endif
     }
     for (uint32_t i = ${nodeName}_unroll_end; i < ${nodeName}_end; i++) {
+% if sizeB == 1:
         ${C}[i] = ${A}[i] * ${nodeName}_scalar;
+% else:
+        ${C}[i] = ${A}[i] * ${B}[i];
+% endif
     }
 }
-""")
\ No newline at end of file
+""")
diff --git a/Deeploy/Targets/PULPOpen/TileConstraints/ConvTileConstraint.py b/Deeploy/Targets/PULPOpen/TileConstraints/ConvTileConstraint.py
index ae9ca96b3c..bd6e5dc290 100644
--- a/Deeploy/Targets/PULPOpen/TileConstraints/ConvTileConstraint.py
+++ b/Deeploy/Targets/PULPOpen/TileConstraints/ConvTileConstraint.py
@@ -1060,4 +1060,4 @@ def serializeTilingSolution(
         tilingSchedule = TilingSchedule(inputBaseOffsets, outputBaseOffsets, inputLoadSchedule, outputLoadSchedule)
         variableReplacementSchedule = VariableReplacementScheme(replacements, replacementTypes)
 
-        return variableReplacementSchedule, tilingSchedule
\ No newline at end of file
+        return variableReplacementSchedule, tilingSchedule
diff --git a/Deeploy/Targets/PULPOpen/TileConstraints/ConvTransposeTileConstraint.py b/Deeploy/Targets/PULPOpen/TileConstraints/ConvTransposeTileConstraint.py
new file mode 100644
index 0000000000..b3b6756c41
--- /dev/null
+++ b/Deeploy/Targets/PULPOpen/TileConstraints/ConvTransposeTileConstraint.py
@@ -0,0 +1,189 @@
+# SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Dict, List, Tuple
+
+from Deeploy.AbstractDataTypes import PointerClass
+from Deeploy.CommonExtensions.DataTypes import uint8_t, uint16_t
+from Deeploy.DeeployTypes import NetworkContext, OperatorRepresentation
+from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint
+from Deeploy.TilingExtension.TileConstraint import TileConstraint
+from Deeploy.TilingExtension.TilerModel import TilerModel
+from Deeploy.TilingExtension.TilingCodegen import AbsoluteHyperRectangle, HyperRectangle, TilingSchedule, \
+    VariableReplacementScheme
+
+
+class ConvTranspose2DTileConstraint(TileConstraint):
+
+    @staticmethod
+    def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel:
+        # Register all tensors that participate in the tiled operator. The tiler will
+        # create symbolic variables for each tensor dimension and solve over them.
+        inputBufferName = parseDict['data_in']
+        weightBufferName = parseDict['weight']
+        outputBufferName = parseDict['data_out']
+
+        for bufferName in [inputBufferName, weightBufferName, outputBufferName]:
+            tilerModel.addTensorDimToModel(ctxt, bufferName)
+
+        # Symbolic dimensions for NCHW input and output, and [Cin, Cout/group, Kh, Kw] weights.
+        inputBatchVar = tilerModel.getTensorDimVar(tensorName = inputBufferName, dimIdx = 0)
+        inputChannelVar = tilerModel.getTensorDimVar(tensorName = inputBufferName, dimIdx = 1)
+        inputHeightVar = tilerModel.getTensorDimVar(tensorName = inputBufferName, dimIdx = 2)
+        inputWidthVar = tilerModel.getTensorDimVar(tensorName = inputBufferName, dimIdx = 3)
+
+        weightInChannelVar = tilerModel.getTensorDimVar(tensorName = weightBufferName, dimIdx = 0)
+        weightOutChannelVar = tilerModel.getTensorDimVar(tensorName = weightBufferName, dimIdx = 1)
+        weightHeightVar = tilerModel.getTensorDimVar(tensorName = weightBufferName, dimIdx = 2)
+        weightWidthVar = tilerModel.getTensorDimVar(tensorName = weightBufferName, dimIdx = 3)
+
+        outputBatchVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 0)
+        outputChannelVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 1)
+        outputHeightVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 2)
+        outputWidthVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 3)
+
+        # Geometrical constraints: batch is preserved, the weight Cin must match input Cin,
+        # and in this first implementation H/W are not tiled, so they stay equal to the
+        # full tensor shapes known from the context.
+        tilerModel.addConstraint(outputBatchVar == inputBatchVar)
+        tilerModel.addConstraint(weightInChannelVar == inputChannelVar)
+        tilerModel.addConstraint(outputHeightVar == ctxt.lookup(outputBufferName).shape[2])
+        tilerModel.addConstraint(outputWidthVar == ctxt.lookup(outputBufferName).shape[3])
+        tilerModel.addConstraint(inputHeightVar == ctxt.lookup(inputBufferName).shape[2])
+        tilerModel.addConstraint(inputWidthVar == ctxt.lookup(inputBufferName).shape[3])
+        tilerModel.addConstraint(weightHeightVar == ctxt.lookup(weightBufferName).shape[2])
+        tilerModel.addConstraint(weightWidthVar == ctxt.lookup(weightBufferName).shape[3])
+
+        if parseDict['group'] == 1:
+            # For the regular case, the only tiled dimension is Cout, so the output tile
+            # channel count must match the weight tile channel count.
+            tilerModel.addConstraint(outputChannelVar == weightOutChannelVar)
+        else:
+            # Grouped ConvTranspose is kept conservative for now: no real Cout slicing is
+            # enforced on the weight tensor because the grouped layout needs dedicated handling.
+            tilerModel.addConstraint(outputChannelVar == ctxt.lookup(outputBufferName).shape[1])
+            tilerModel.addConstraint(weightOutChannelVar == ctxt.lookup(weightBufferName).shape[1])
+
+        if parseDict.get('has_bias', "false") == "true":
+            # Bias follows the tiled output channels one-to-one.
+            biasBufferName = parseDict['bias']
+            tilerModel.addTensorDimToModel(ctxt, biasBufferName)
+            biasChannelVar = tilerModel.getTensorDimVar(tensorName = biasBufferName, dimIdx = 0)
+            tilerModel.addConstraint(biasChannelVar == outputChannelVar)
+
+        return tilerModel
+
+    @staticmethod
+    def addPolicyConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel:
+        inputBufferName = parseDict['data_in']
+        outputBufferName = parseDict['data_out']
+
+        inputShape = ctxt.lookup(inputBufferName).shape
+        outputShape = ctxt.lookup(outputBufferName).shape
+
+        inputBatchVar = tilerModel.getTensorDimVar(tensorName = inputBufferName, dimIdx = 0)
+        inputChannelVar = tilerModel.getTensorDimVar(tensorName = inputBufferName, dimIdx = 1)
+        outputBatchVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 0)
+        outputChannelVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 1)
+
+        # Policy constraints: keep the full input resident and tile only along output
+        # channels. This is the simplest correct strategy for ConvTranspose because it
+        # avoids the inverse spatial dependency problem on H/W tiles.
+        tilerModel.addConstraint(inputBatchVar == inputShape[0])
+        tilerModel.addConstraint(inputChannelVar == inputShape[1])
+        tilerModel.addConstraint(outputBatchVar == outputShape[0])
+
+        if parseDict['group'] == 1 and parseDict["ch_im_out"] >= 8:
+            # Do not create tiny channel tiles unless necessary. This reduces overhead
+            # from DMA/codegen and keeps the kernel granularity reasonable.
+            tilerModel.addMinTileSizeConstraint(parseDict, 'ch_im_out', outputChannelVar, 8)
+
+        return tilerModel
+
+    @classmethod
+    def serializeTilingSolution(
+            cls, tilingSolution: NodeMemoryConstraint, absoluteOutputCubes: List[AbsoluteHyperRectangle],
+            targetMemLevel: str, ctxt: NetworkContext,
+            operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, TilingSchedule]:
+        # The solver gives us output tiles. For each output tile we must reconstruct the
+        # exact input/weight/bias tiles that must be transferred into the target memory.
+        outputCubes = [cube.rectangle for cube in absoluteOutputCubes]
+
+        addrNames = ['data_in', 'weight', 'data_out']
+        has_bias = operatorRepresentation.get('has_bias', "false") == "true"
+        if has_bias:
+            addrNames.append('bias')
+
+        # Resolve base addresses in the target memory level (typically L1) for all tensors
+        # used by this tiled operator.
+        inputBaseOffsets, outputBaseOffsets = cls.extractBaseAddr(tilingSolution, targetMemLevel,
+                                                                  operatorRepresentation, addrNames)
+
+        inputShape = ctxt.lookup(operatorRepresentation['data_in']).shape
+        weightShape = ctxt.lookup(operatorRepresentation['weight']).shape
+
+        inputInCubes = []
+        inputWeightCubes = []
+        inputBiasCubes = []
+
+        replacements = {
+            "ch_im_out": [],
+            "batch": [],
+        }
+        replacementTypes = {
+            "ch_im_out": PointerClass(uint16_t),
+            "batch": PointerClass(uint8_t),
+        }
+
+        for cube in outputCubes:
+            # Output tiles are NCHW. Since we only tile along C_out, only the channel
+            # offset/size are relevant here.
+            _, cOffset, _, _ = cube.offset
+            nSize, cSize, _, _ = cube.dims
+
+            # Input is kept whole for every tile in this first implementation.
+            inputInCubes.append(HyperRectangle((0, 0, 0, 0), tuple(inputShape)))
+
+            if operatorRepresentation['group'] == 1:
+                # Slice the weight tensor along the output-channel axis so the kernel sees
+                # only the filters needed for the current output tile.
+                weightOffset = (0, cOffset, 0, 0)
+                weightDims = (weightShape[0], cSize, weightShape[2], weightShape[3])
+            else:
+                # Conservative grouped fallback: use full weights until grouped slicing is
+                # implemented explicitly for ConvTranspose tiling.
+                weightOffset = (0, 0, 0, 0)
+                weightDims = tuple(weightShape)
+
+            inputWeightCubes.append(HyperRectangle(weightOffset, weightDims))
+
+            if has_bias:
+                if operatorRepresentation['group'] == 1:
+                    # Bias is sliced exactly like the output channels.
+                    inputBiasCubes.append(HyperRectangle((cOffset,), (cSize,)))
+                else:
+                    inputBiasCubes.append(HyperRectangle((0,), (ctxt.lookup(operatorRepresentation['bias']).shape[0],)))
+
+            # These replacements are injected into the template so each kernel invocation
+            # uses the tile-local shape instead of the global layer shape.
+            replacements["ch_im_out"].append(cSize)
+            replacements["batch"].append(nSize)
+
+        inputLoadSchedule = []
+        outputLoadSchedule = []
+
+        # Build the per-tile DMA/load schedule: which tensor cubes must be present before
+        # computing each output cube.
+        if has_bias:
+            for inCube, weightCube, biasCube in zip(inputInCubes, inputWeightCubes, inputBiasCubes):
+                inputLoadSchedule.append({"data_in": inCube, "weight": weightCube, "bias": biasCube})
+        else:
+            for inCube, weightCube in zip(inputInCubes, inputWeightCubes):
+                inputLoadSchedule.append({"data_in": inCube, "weight": weightCube})
+
+        for out in outputCubes:
+            outputLoadSchedule.append({"data_out": out})
+
+        schedule = TilingSchedule(inputBaseOffsets, outputBaseOffsets, inputLoadSchedule, outputLoadSchedule)
+        return VariableReplacementScheme(replacements, replacementTypes), schedule
diff --git a/Deeploy/Targets/PULPOpen/TileConstraints/MatMulTileConstraint.py b/Deeploy/Targets/PULPOpen/TileConstraints/MatMulTileConstraint.py
index ee7e448be6..fb55dd536d 100644
--- a/Deeploy/Targets/PULPOpen/TileConstraints/MatMulTileConstraint.py
+++ b/Deeploy/Targets/PULPOpen/TileConstraints/MatMulTileConstraint.py
@@ -6,7 +6,7 @@
 from typing import Dict, List, Tuple
 
 from Deeploy.AbstractDataTypes import PointerClass
-from Deeploy.CommonExtensions.DataTypes import int8_t
+from Deeploy.CommonExtensions.DataTypes import uint16_t
 from Deeploy.DeeployTypes import NetworkContext, OperatorRepresentation
 from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint
 from Deeploy.TilingExtension.TileConstraint import TileConstraint
@@ -209,10 +209,10 @@ def serializeTilingSolution(
         replacements["N"] = [NSize] * len(outputCubes)
 
         replacementTypes = {
-            "M": PointerClass(int8_t),
-            "N": PointerClass(int8_t),
-            "O": PointerClass(int8_t),
-            "batch": PointerClass(int8_t)
+            "M": PointerClass(uint16_t),
+            "N": PointerClass(uint16_t),
+            "O": PointerClass(uint16_t),
+            "batch": PointerClass(uint16_t)
         }
 
         # Update load schedule lists
diff --git a/Deeploy/Targets/PULPOpen/TileConstraints/PadTileConstraint.py b/Deeploy/Targets/PULPOpen/TileConstraints/PadTileConstraint.py
new file mode 100644
index 0000000000..c7cce27d06
--- /dev/null
+++ b/Deeploy/Targets/PULPOpen/TileConstraints/PadTileConstraint.py
@@ -0,0 +1,215 @@
+# SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Dict, List, Tuple
+
+import numpy as np
+
+from Deeploy.AbstractDataTypes import PointerClass
+from Deeploy.CommonExtensions.DataTypes import uint16_t
+from Deeploy.DeeployTypes import NetworkContext, OperatorRepresentation
+from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint
+from Deeploy.TilingExtension.TileConstraint import TileConstraint
+from Deeploy.TilingExtension.TilerModel import TilerModel
+from Deeploy.TilingExtension.TilingCodegen import AbsoluteHyperRectangle, HyperRectangle, TilingSchedule, \
+    VariableReplacementScheme
+
+
+class Pad2DTileConstraint(TileConstraint):
+
+    @staticmethod
+    def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel:
+        inputBufferName = parseDict['data_in']
+        outputBufferName = parseDict['data_out']
+
+        buffersOfInterest = [inputBufferName, outputBufferName]
+        if 'pads_tensor' in parseDict:
+            buffersOfInterest.append(parseDict['pads_tensor'])
+        if 'value_tensor' in parseDict:
+            buffersOfInterest.append(parseDict['value_tensor'])
+
+        for bufferName in buffersOfInterest:
+            tilerModel.addTensorDimToModel(ctxt, bufferName)
+
+        channels_first = bool(parseDict.get('channels_first', 1))
+
+        if channels_first:
+            batchDim = 0
+            channelDim = 1
+            heightDim = 2
+            widthDim = 3
+        else:
+            batchDim = 0
+            heightDim = 1
+            widthDim = 2
+            channelDim = 3
+
+        inputBatchVar = tilerModel.getTensorDimVar(inputBufferName, batchDim)
+        inputHeightVar = tilerModel.getTensorDimVar(inputBufferName, heightDim)
+        inputWidthVar = tilerModel.getTensorDimVar(inputBufferName, widthDim)
+        inputChannelVar = tilerModel.getTensorDimVar(inputBufferName, channelDim)
+
+        outputBatchVar = tilerModel.getTensorDimVar(outputBufferName, batchDim)
+        outputHeightVar = tilerModel.getTensorDimVar(outputBufferName, heightDim)
+        outputWidthVar = tilerModel.getTensorDimVar(outputBufferName, widthDim)
+        outputChannelVar = tilerModel.getTensorDimVar(outputBufferName, channelDim)
+
+        tilerModel.addConstraint(outputBatchVar == inputBatchVar)
+        tilerModel.addConstraint(outputWidthVar == inputWidthVar + 2 * parseDict['pad_x'])
+        tilerModel.addConstraint(outputChannelVar == inputChannelVar)
+
+        # Height tiles may include top/bottom padding only at the boundary tiles.
+        tilerModel.addConstraint(outputHeightVar >= inputHeightVar)
+        tilerModel.addConstraint(outputHeightVar <= inputHeightVar + 2 * parseDict['pad_y'])
+
+        return tilerModel
+
+    @staticmethod
+    def addPolicyConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel:
+        inputBufferName = parseDict['data_in']
+        outputBufferName = parseDict['data_out']
+
+        channels_first = bool(parseDict.get('channels_first', 1))
+
+        if channels_first:
+            batchDim = 0
+            channelDim = 1
+            heightDim = 2
+            widthDim = 3
+        else:
+            batchDim = 0
+            heightDim = 1
+            widthDim = 2
+            channelDim = 3
+
+        inputBuffer = ctxt.lookup(inputBufferName)
+        outputBuffer = ctxt.lookup(outputBufferName)
+
+        # Keep batch, width and channels whole to preserve contiguous NHWC row copies.
+        tilerModel.addConstraint(tilerModel.getTensorDimVar(inputBufferName, batchDim) == inputBuffer.shape[batchDim])
+        tilerModel.addConstraint(tilerModel.getTensorDimVar(outputBufferName, batchDim) == outputBuffer.shape[batchDim])
+        tilerModel.addConstraint(tilerModel.getTensorDimVar(inputBufferName, widthDim) == inputBuffer.shape[widthDim])
+        tilerModel.addConstraint(tilerModel.getTensorDimVar(outputBufferName, widthDim) == outputBuffer.shape[widthDim])
+        tilerModel.addConstraint(
+            tilerModel.getTensorDimVar(inputBufferName, channelDim) == inputBuffer.shape[channelDim])
+        tilerModel.addConstraint(
+            tilerModel.getTensorDimVar(outputBufferName, channelDim) == outputBuffer.shape[channelDim])
+
+        # Avoid pure-padding tiles and avoid tail tiles with a single padding row.
+        outputHeightVar = tilerModel.getTensorDimVar(outputBufferName, heightDim)
+        tilerModel.addConstraint(outputHeightVar >= (parseDict['pad_y'] + 1))
+
+        return tilerModel
+
+    @staticmethod
+    def _computeInputCube(outputCube: HyperRectangle, parseDict: Dict,
+                          outputShape: Tuple[int, ...]) -> Tuple[HyperRectangle, int]:
+        channels_first = bool(parseDict.get('channels_first', 1))
+
+        if channels_first:
+            batchOffset, _, outputHOffset, _ = outputCube.offset
+            batchSize, _, outputHSize, _ = outputCube.dims
+            inputChannels = parseDict['dim_im_in_ch']
+            inputWidth = parseDict['dim_im_in_y']
+        else:
+            batchOffset, outputHOffset, _, _ = outputCube.offset
+            batchSize, outputHSize, _, _ = outputCube.dims
+            inputChannels = parseDict['dim_im_in_ch']
+            inputWidth = parseDict['dim_im_in_y']
+
+        padTop = parseDict['pad_y']
+        inputHeightTotal = parseDict['dim_im_in_x']
+        outputHeightTotal = outputShape[2] if channels_first else outputShape[1]
+
+        dataStart = padTop
+        dataEnd = padTop + inputHeightTotal
+        tileEnd = outputHOffset + outputHSize
+
+        localPadTop = max(dataStart - outputHOffset, 0)
+        localPadBottom = max(tileEnd - dataEnd, 0)
+
+        inputStart = max(outputHOffset, dataStart) - dataStart
+        inputEnd = min(tileEnd, dataEnd) - dataStart
+        inputHeight = max(inputEnd - inputStart, 1)
+
+        if channels_first:
+            inCube = HyperRectangle((batchOffset, 0, inputStart, 0),
+                                    (batchSize, inputChannels, inputHeight, inputWidth))
+        else:
+            inCube = HyperRectangle((batchOffset, inputStart, 0, 0),
+                                    (batchSize, inputHeight, inputWidth, inputChannels))
+
+        _ = localPadBottom  # Bottom padding is encoded by the output height itself.
+        return inCube, localPadTop
+
+    @classmethod
+    def serializeTilingSolution(
+            cls, tilingSolution: NodeMemoryConstraint, absoluteOutputCubes: List[AbsoluteHyperRectangle],
+            targetMemLevel: str, ctxt: NetworkContext,
+            operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, TilingSchedule]:
+        outputCubes = [cube.rectangle for cube in absoluteOutputCubes]
+
+        addrNames = ['data_in', 'data_out']
+        inputBaseOffsets, outputBaseOffsets = cls.extractBaseAddr(tilingSolution, targetMemLevel,
+                                                                  operatorRepresentation, addrNames)
+
+        replacements: Dict[str, List[int]] = {
+            "batch": [],
+            "dim_im_in_x": [],
+            "dim_im_in_y": [],
+            "dim_im_in_ch": [],
+            "dim_im_out_x": [],
+            "dim_im_out_y": [],
+            "dim_im_out_ch": [],
+            "pad_y": [],
+            "data_out_size": [],
+        }
+
+        replacementTypes = {
+            "batch": PointerClass(uint16_t),
+            "dim_im_in_x": PointerClass(uint16_t),
+            "dim_im_in_y": PointerClass(uint16_t),
+            "dim_im_in_ch": PointerClass(uint16_t),
+            "dim_im_out_x": PointerClass(uint16_t),
+            "dim_im_out_y": PointerClass(uint16_t),
+            "dim_im_out_ch": PointerClass(uint16_t),
+            "pad_y": PointerClass(uint16_t),
+            "data_out_size": PointerClass(uint16_t),
+        }
+
+        outputShape = tuple(ctxt.lookup(operatorRepresentation['data_out']).shape)
+        channels_first = bool(operatorRepresentation.get('channels_first', 1))
+
+        inputLoadSchedule = []
+        outputLoadSchedule = []
+
+        for outCube in outputCubes:
+            inCube, localPadTop = cls._computeInputCube(outCube, operatorRepresentation, outputShape)
+
+            replacements["batch"].append(outCube.dims[0])
+            replacements["pad_y"].append(localPadTop)
+            replacements["data_out_size"].append(int(np.prod(outCube.dims)))
+
+            if channels_first:
+                replacements["dim_im_in_ch"].append(inCube.dims[1])
+                replacements["dim_im_in_x"].append(inCube.dims[2])
+                replacements["dim_im_in_y"].append(inCube.dims[3])
+                replacements["dim_im_out_ch"].append(outCube.dims[1])
+                replacements["dim_im_out_x"].append(outCube.dims[2])
+                replacements["dim_im_out_y"].append(outCube.dims[3])
+            else:
+                replacements["dim_im_in_x"].append(inCube.dims[1])
+                replacements["dim_im_in_y"].append(inCube.dims[2])
+                replacements["dim_im_in_ch"].append(inCube.dims[3])
+                replacements["dim_im_out_x"].append(outCube.dims[1])
+                replacements["dim_im_out_y"].append(outCube.dims[2])
+                replacements["dim_im_out_ch"].append(outCube.dims[3])
+
+            inputLoadSchedule.append({"data_in": inCube})
+            outputLoadSchedule.append({"data_out": outCube})
+
+        tilingSchedule = TilingSchedule(inputBaseOffsets, outputBaseOffsets, inputLoadSchedule, outputLoadSchedule)
+        variableReplacementSchedule = VariableReplacementScheme(replacements, replacementTypes)
+
+        return variableReplacementSchedule, tilingSchedule
diff --git a/Deeploy/Targets/PULPOpen/TileConstraints/SliceConstraint.py b/Deeploy/Targets/PULPOpen/TileConstraints/SliceConstraint.py
index 5309300659..4a1993c45e 100644
--- a/Deeploy/Targets/PULPOpen/TileConstraints/SliceConstraint.py
+++ b/Deeploy/Targets/PULPOpen/TileConstraints/SliceConstraint.py
@@ -19,6 +19,12 @@
 
 class SliceTileConstraint(TileConstraint):
 
+    @staticmethod
+    def _resolveImmediate(value, ctxt: NetworkContext):
+        if isinstance(value, str):
+            return np.asarray(ctxt.lookup(value).values).astype(np.int64).reshape(-1)
+        return np.asarray(value).astype(np.int64).reshape(-1)
+
     @staticmethod
     def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel:
 
@@ -31,8 +37,8 @@ def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: Netw
         inputShape = parseDict['data_in_shape']
 
         #   Get other necessary information
-        sliceAxes = parseDict['axes']
-        sliceSteps = parseDict['steps']
+        sliceAxes = SliceTileConstraint._resolveImmediate(parseDict['axes'], ctxt)
+        sliceSteps = SliceTileConstraint._resolveImmediate(parseDict['steps'], ctxt)
 
         # ===== ADD I/O DIMENSIONS TO THE MODEL AS VARIABLES =====
         for bufferName in [inputBufferName, outputBufferName]:
@@ -83,10 +89,14 @@ def computeInputCubeFromOutputCube(outputCube: AbsoluteHyperRectangle, parseDict
         in_cube_offset = list(outputCube.offset).copy()
 
         # Iterate through the sliced axes
-        for idx, ax in enumerate(parseDict['axes']):
+        sliceAxes = np.asarray(parseDict['axes']).astype(np.int64).reshape(-1)
+        sliceStarts = np.asarray(parseDict['starts']).astype(np.int64).reshape(-1)
+        sliceSteps = np.asarray(parseDict['steps']).astype(np.int64).reshape(-1)
+
+        for idx, ax in enumerate(sliceAxes):
             # Get current sliced ax parameters
-            start = parseDict['starts'][idx]
-            step = parseDict['steps'][idx]
+            start = sliceStarts[idx]
+            step = sliceSteps[idx]
 
             # Compute input cube parameters for the current axis
             in_cube_dims[ax] = (outputCube.dims[ax] - 1) * step + 1
@@ -149,12 +159,18 @@ def serializeTilingSolution(
         outputLoadSchedule = []
 
         for out_cube in outputCubes:
+            immediateOperatorRepresentation = operatorRepresentation.copy()
+            immediateOperatorRepresentation['axes'] = cls._resolveImmediate(operatorRepresentation['axes'], ctxt)
+            immediateOperatorRepresentation['starts'] = cls._resolveImmediate(operatorRepresentation['starts'], ctxt)
+            immediateOperatorRepresentation['steps'] = cls._resolveImmediate(operatorRepresentation['steps'], ctxt)
+
             # Compute input cube
-            in_cube = SliceTileConstraint.computeInputCubeFromOutputCube(out_cube, parseDict = operatorRepresentation)
+            in_cube = SliceTileConstraint.computeInputCubeFromOutputCube(out_cube,
+                                                                         parseDict = immediateOperatorRepresentation)
 
             # Compute new ends for replacement
             new_ends = list()
-            for ax in operatorRepresentation['axes']:
+            for ax in immediateOperatorRepresentation['axes']:
                 new_ends.append(in_cube.offset[ax] + in_cube.dims[ax])
 
             # Append replacement elements
diff --git a/Deeploy/Targets/PULPOpen/Tiler.py b/Deeploy/Targets/PULPOpen/Tiler.py
index 901106459e..75856b0314 100644
--- a/Deeploy/Targets/PULPOpen/Tiler.py
+++ b/Deeploy/Targets/PULPOpen/Tiler.py
@@ -14,10 +14,12 @@
 from Deeploy.Targets.Generic.TileConstraints.RQSiHardswishTileConstraint import RQSiHardswishTileConstraint
 from Deeploy.Targets.Generic.TileConstraints.TransposeTileConstraint import TransposeTileConstraint
 from Deeploy.Targets.Generic.TileConstraints.UnaryTileConstraint import UnaryTileConstraint
-from Deeploy.Targets.PULPOpen.Bindings import PULPAddBindings, PULPConcatBindings, PULPFloatConv2DBindings, \
-    PULPFloatDWConv2DBindings, PULPFloatGELUBinding, PULPFloatGELUGradBinding, PULPFloatGEMMBindings, \
-    PULPGatherBindings, PULPiHardswishBindings, PULPiRMSNormBindings, PULPiRQSGELUBindings, PULPLayernormBinding, \
-    PULPLayernormGradBinding, PULPMatMulBindings, PULPMaxPool1DBindings, PULPMaxPool2DBindings, PULPMulBindings, \
+from Deeploy.Targets.Generic.TileConstraints.UntiledTileConstraint import UntiledTileConstraint
+from Deeploy.Targets.PULPOpen.Bindings import PULPAddBindings, PULPBatchNormBindings, PULPConcatBindings, \
+    PULPFloatConv2DBindings, PULPFloatConvTranspose2DBindings, PULPFloatDWConv2DBindings, PULPFloatGELUBinding, \
+    PULPFloatGELUGradBinding, PULPFloatGEMMBindings, PULPGatherBindings, PULPiHardswishBindings, PULPiRMSNormBindings, \
+    PULPiRQSGELUBindings, PULPLayernormBinding, PULPLayernormGradBinding, PULPMatMulBindings, PULPMaxPool1DBindings, \
+    PULPMaxPool2DBindings, PULPMulBindings, PULPPad1DBindings, PULPPad2DBindings, PULPReduceLogSumExpBindings, \
     PULPReduceMeanBindings, PULPReduceSumBindings, PULPReluBinding, PULPReshapeBindings, PULPRQAddBindings, \
     PULPRQSBindings, PULPRQSConv1DBindings, PULPRQSConv2DBindings, PULPRQSDWConv2DBindings, PULPRQSGEMMBindings, \
     PULPRQSiHardswishBindings, PULPRQSMatrixVecBindings, PULPRQSTallGEMMBindings, PULPSGDBindings, PULPSliceBindings, \
@@ -25,6 +27,7 @@
     PULPSoftmaxGradBindings, PULPTransposeBindings, PULPUniformRQSBindings
 from Deeploy.Targets.PULPOpen.TileConstraints.ConvTileConstraint import Conv2DTileConstraint, RQConv1DTileConstraint, \
     RQConv2DTileConstraint
+from Deeploy.Targets.PULPOpen.TileConstraints.ConvTransposeTileConstraint import ConvTranspose2DTileConstraint
 from Deeploy.Targets.PULPOpen.TileConstraints.DWConvTileConstraint import DWConv2DTileConstraint, \
     RQDWConv2DTileConstraint
 from Deeploy.Targets.PULPOpen.TileConstraints.GatherTileConstraint import GatherTileConstraint
@@ -40,7 +43,6 @@
 from Deeploy.Targets.PULPOpen.TileConstraints.ReduceSumTileConstraint import ReduceSumTileConstraint
 from Deeploy.Targets.PULPOpen.TileConstraints.RequantShiftTileConstraint import RequantShiftTileConstraint
 from Deeploy.Targets.PULPOpen.TileConstraints.SGDTileConstraint import SGDTileConstraint
-from Deeploy.Targets.PULPOpen.TileConstraints.SliceConstraint import SliceTileConstraint
 from Deeploy.Targets.PULPOpen.TileConstraints.SoftmaxCrossEntropyTileConstraint import \
     SoftmaxCrossEntropyGradTileConstraint, SoftmaxCrossEntropyTileConstraint
 from Deeploy.TilingExtension.TilerExtension import TilingReadyNodeBindings
@@ -57,6 +59,12 @@
 PULPConv2DTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPFloatConv2DBindings,
                                                         tileConstraint = Conv2DTileConstraint())
 
+PULPConv2DUntiledTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPFloatConv2DBindings,
+                                                               tileConstraint = UntiledTileConstraint())
+
+PULPConvTranspose2DTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPFloatConvTranspose2DBindings,
+                                                                 tileConstraint = ConvTranspose2DTileConstraint())
+
 PULPDWConv2DTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPFloatDWConv2DBindings,
                                                           tileConstraint = DWConv2DTileConstraint())
 
@@ -95,6 +103,9 @@
 PULPMaxPool2DTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPMaxPool2DBindings,
                                                            tileConstraint = MaxPoolCTileConstraint())
 
+PULPMaxPool2DUntiledTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPMaxPool2DBindings,
+                                                                  tileConstraint = UntiledTileConstraint())
+
 PULPRQSTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPRQSBindings,
                                                      tileConstraint = RequantShiftTileConstraint())
 
@@ -152,11 +163,25 @@
 PULPReduceSumTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPReduceSumBindings,
                                                            tileConstraint = ReduceSumTileConstraint())
 
+_PULPReduceLogSumExpBindings = copy.deepcopy(PULPReduceLogSumExpBindings)
+
+PULPReduceLogSumExpTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = _PULPReduceLogSumExpBindings,
+                                                                 tileConstraint = UntiledTileConstraint())
+
 PULPSGDTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPSGDBindings,
                                                      tileConstraint = SGDTileConstraint())
 
 PULPSliceTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPSliceBindings,
-                                                       tileConstraint = SliceTileConstraint())
+                                                       tileConstraint = UntiledTileConstraint())
 
 PULPReduceMeanTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPReduceMeanBindings,
                                                             tileConstraint = ReduceMeanTileConstraint())
+
+PULPBatchNormTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPBatchNormBindings,
+                                                           tileConstraint = UntiledTileConstraint())
+
+PULPPad1DTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPPad1DBindings,
+                                                       tileConstraint = UntiledTileConstraint())
+
+PULPPad2DTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPPad2DBindings,
+                                                       tileConstraint = UntiledTileConstraint())
diff --git a/Deeploy/TilingExtension/CodeTransformationPasses/TilingCodeGeneration.py b/Deeploy/TilingExtension/CodeTransformationPasses/TilingCodeGeneration.py
index 51f87534ea..1cbd64bbe7 100644
--- a/Deeploy/TilingExtension/CodeTransformationPasses/TilingCodeGeneration.py
+++ b/Deeploy/TilingExtension/CodeTransformationPasses/TilingCodeGeneration.py
@@ -187,6 +187,7 @@ def _legalizeTransfers(self, transfers: List[HyperRectangle], outerShape: Tuple[
 
         if isFinalMemoryLevel:
             minimizedTransfers = []
+            fallbackToSharedOuterShape = False
             for rect in transfers:
                 paddedRect = HyperRectangle(padOffset(rect.offset, commonRank), padShape(rect.dims, commonRank))
                 minRect, newMinOuterShape = minimizeRectangle(paddedRect, outerShape)
@@ -194,13 +195,26 @@ def _legalizeTransfers(self, transfers: List[HyperRectangle], outerShape: Tuple[
                     minOuterShape = newMinOuterShape
                 else:
                     if minOuterShape != newMinOuterShape:
-                        rectStr = "\n".join(str(trans) for trans in transfers[:transfers.index(rect)])
-                        raise RuntimeError(f"""Currently support a single minimal outer shape.
-Old minOuterShape: {minOuterShape} vs. new minOuterShape {newMinOuterShape}.
-New minOuterShape produced by outerDims: {outerShape} and rect: {rect}.
-Old minOuterShape produced by outerDims: {outerShape} and rects:
-{rectStr}""")
+                        # Keep a shared outer shape when different tiles of the same external tensor
+                        # minimize differently. This shows up for L3->L2 transposed DMA on residual
+                        # tiles: a full-width tile may collapse to 1D, while the tail tile keeps a
+                        # higher-rank shape. Falling back to the common padded shape is conservative
+                        # but preserves the existing tiling structure and avoids introducing
+                        # layer-specific hacks here.
+                        #
+                        # A better long-term fix would be earlier in tiling propagation: if an
+                        # intermediate layer such as a Transpose fits entirely in L2, keep it whole
+                        # there instead of inheriting a downstream residual tiling split. That would
+                        # minimize L3 traffic rather than only making DMA legalization robust.
+                        fallbackToSharedOuterShape = True
+                        break
                 minimizedTransfers.append(minRect)
+            if fallbackToSharedOuterShape:
+                minimizedTransfers = [
+                    HyperRectangle(padOffset(rect.offset, commonRank), padShape(rect.dims, commonRank))
+                    for rect in transfers
+                ]
+                minOuterShape = outerShape
         else:
             minimizedTransfers = [HyperRectangle((0,), (int(np.prod(rect.dims)),)) for rect in transfers]
             minOuterShape = (int(np.prod(outerShape)),)
diff --git a/Deeploy/TilingExtension/CodeTransformationPasses/TilingHoistingMixIn.py b/Deeploy/TilingExtension/CodeTransformationPasses/TilingHoistingMixIn.py
index 8a0c1b9b54..bbcbc512cc 100644
--- a/Deeploy/TilingExtension/CodeTransformationPasses/TilingHoistingMixIn.py
+++ b/Deeploy/TilingExtension/CodeTransformationPasses/TilingHoistingMixIn.py
@@ -70,7 +70,21 @@ def _hoistReference(self,
                         shape: Tuple[int, ...] = (1,),
                         offset: Union[int, str, VariableBuffer] = 0,
                         override_type: Optional[Type[BaseType]] = None) -> _ReferenceBuffer:
-        ref = ctxt.hoistReference(self.prefix + name, reference, shape, offset, override_type)
+        refName = self.prefix + name
+        if ctxt.is_local(refName):
+            ref = ctxt.lookup(refName)
+            assert isinstance(ref, _ReferenceBuffer)
+            assert ref._referenceName == reference.name
+            assert tuple(ref.shape) == tuple(shape)
+            expectedOffset = offset.name if isinstance(offset, VariableBuffer) else offset
+            assert ref._offset == expectedOffset
+            if override_type is not None:
+                assert ref._type == PointerClass(override_type)
+            else:
+                assert ref._type == reference._type
+            return ref
+
+        ref = ctxt.hoistReference(refName, reference, shape, offset, override_type)
         ref._memoryLevel = self.memory
         return ref
 
diff --git a/DeeployTest/Platforms/Generic/main.c b/DeeployTest/Platforms/Generic/main.c
index e2b0449fb5..20fb980510 100644
--- a/DeeployTest/Platforms/Generic/main.c
+++ b/DeeployTest/Platforms/Generic/main.c
@@ -25,6 +25,23 @@ int main() {
   printf("Running network...\r\n");
   RunNetwork(0, 1);
 
+  const char *dump_outputs = getenv("DEEPLOY_DUMP_OUTPUTS");
+  if (dump_outputs && dump_outputs[0] != '\0') {
+    for (uint32_t buf = 0; buf < DeeployNetwork_num_outputs; buf++) {
+      uint32_t count = DeeployNetwork_outputs_bytes[buf] / sizeof(OUTPUTTYPE);
+      printf("OUTPUT %u %u\r\n", buf, count);
+      for (uint32_t i = 0; i < count; i++) {
+        OUTPUTTYPE actual = ((OUTPUTTYPE *)DeeployNetwork_outputs[buf])[i];
+#if ISOUTPUTFLOAT == 1
+        printf("%.9g\r\n", (double)actual);
+#else
+        printf("%d\r\n", actual);
+#endif
+      }
+    }
+    return 0;
+  }
+
   int32_t tot_err = 0;
   uint32_t tot = 0;
   OUTPUTTYPE diff;
@@ -62,4 +79,4 @@ int main() {
   printf("Errors: %d out of %d \r\n", tot_err, tot);
 
   return tot_err;
-}
\ No newline at end of file
+}
diff --git a/DeeployTest/Platforms/PULPOpen/src/deeploytest.c b/DeeployTest/Platforms/PULPOpen/src/deeploytest.c
index 11d889e48d..1252edd518 100644
--- a/DeeployTest/Platforms/PULPOpen/src/deeploytest.c
+++ b/DeeployTest/Platforms/PULPOpen/src/deeploytest.c
@@ -4,6 +4,8 @@
  * SPDX-License-Identifier: Apache-2.0
  */
 
+#include <math.h>
+
 #include "CycleCounter.h"
 #include "Network.h"
 #include "dory_mem.h"
@@ -13,6 +15,8 @@
 
 #define MAINSTACKSIZE 8000
 #define SLAVESTACKSIZE 3800
+#define FLOAT_ABS_TOL 1e-4f
+#define FLOAT_REL_TOL 1e-5f
 
 struct pi_device cluster_dev;
 
@@ -40,8 +44,15 @@ void CompareFloatOnCluster(void *args) {
       float expected_val = expected[i];
       float actual_val = actual[i];
       float diff = expected_val - actual_val;
+      float abs_diff = fabsf(diff);
+      float scale = fabsf(expected_val);
+      float abs_actual = fabsf(actual_val);
+      if (abs_actual > scale) {
+        scale = abs_actual;
+      }
+      float tolerance = FLOAT_ABS_TOL + FLOAT_REL_TOL * scale;
 
-      if ((diff < -1e-4) || (diff > 1e-4) || isnan(diff)) {
+      if ((abs_diff > tolerance) || isnan(diff)) {
         local_err_count += 1;
 
         printf("Expected: %10.6f  ", expected_val);
@@ -125,39 +136,38 @@ int main(void) {
       compbuf = DeeployNetwork_outputs[buf];
     }
 
-    if (ISOUTPUTFLOAT) {
-      float_error_count = 0;
-      float_compare_args.expected = testOutputVector[buf];
-      float_compare_args.actual = compbuf;
-      float_compare_args.num_elements =
-          DeeployNetwork_outputs_bytes[buf] / sizeof(float);
-      float_compare_args.output_buf_index = buf;
-      float_compare_args.err_count = &float_error_count;
-
-      pi_cluster_task(&cluster_task, CompareFloatOnCluster,
-                      &float_compare_args);
-      cluster_task.stack_size = MAINSTACKSIZE;
-      cluster_task.slave_stack_size = SLAVESTACKSIZE;
-      pi_cluster_send_task_to_cl(&cluster_dev, &cluster_task);
-
-      tot_err += float_error_count;
-    } else {
-
-      for (uint32_t i = 0;
-           i < DeeployNetwork_outputs_bytes[buf] / sizeof(OUTPUTTYPE); i++) {
-        OUTPUTTYPE expected = ((OUTPUTTYPE *)testOutputVector[buf])[i];
-        OUTPUTTYPE actual = ((OUTPUTTYPE *)compbuf)[i];
-        int32_t error = expected - actual;
-        OUTPUTTYPE diff = (OUTPUTTYPE)(error < 0 ? -error : error);
-
-        if (diff) {
-          tot_err += 1;
-          printf("Expected: %4d  ", expected);
-          printf("Actual: %4d  ", actual);
-          printf("Diff: %4d at Index %12u in Output %u\r\n", diff, i, buf);
-        }
+#if ISOUTPUTFLOAT == 1
+    float_error_count = 0;
+    float_compare_args.expected = testOutputVector[buf];
+    float_compare_args.actual = compbuf;
+    float_compare_args.num_elements =
+        DeeployNetwork_outputs_bytes[buf] / sizeof(float);
+    float_compare_args.output_buf_index = buf;
+    float_compare_args.err_count = &float_error_count;
+
+    pi_cluster_task(&cluster_task, CompareFloatOnCluster, &float_compare_args);
+    cluster_task.stack_size = MAINSTACKSIZE;
+    cluster_task.slave_stack_size = SLAVESTACKSIZE;
+    pi_cluster_send_task_to_cl(&cluster_dev, &cluster_task);
+
+    tot_err += float_error_count;
+#else
+
+    for (uint32_t i = 0;
+         i < DeeployNetwork_outputs_bytes[buf] / sizeof(OUTPUTTYPE); i++) {
+      OUTPUTTYPE expected = ((OUTPUTTYPE *)testOutputVector[buf])[i];
+      OUTPUTTYPE actual = ((OUTPUTTYPE *)compbuf)[i];
+      int32_t error = expected - actual;
+      OUTPUTTYPE diff = (OUTPUTTYPE)(error < 0 ? -error : error);
+
+      if (diff) {
+        tot_err += 1;
+        printf("Expected: %4d  ", expected);
+        printf("Actual: %4d  ", actual);
+        printf("Diff: %4d at Index %12u in Output %u\r\n", diff, i, buf);
       }
     }
+#endif
     if ((uint32_t)DeeployNetwork_outputs[buf] < 0x1000000) {
       pi_l2_free(compbuf, (int)DeeployNetwork_outputs_bytes[buf]);
     }
@@ -167,4 +177,4 @@ int main(void) {
   printf("Errors: %u out of %u \r\n", tot_err, tot_tested);
 
   return (int)tot_err;
-}
\ No newline at end of file
+}
diff --git a/DeeployTest/Platforms/Siracusa/src/deeploytest.c b/DeeployTest/Platforms/Siracusa/src/deeploytest.c
index 11d889e48d..1252edd518 100644
--- a/DeeployTest/Platforms/Siracusa/src/deeploytest.c
+++ b/DeeployTest/Platforms/Siracusa/src/deeploytest.c
@@ -4,6 +4,8 @@
  * SPDX-License-Identifier: Apache-2.0
  */
 
+#include <math.h>
+
 #include "CycleCounter.h"
 #include "Network.h"
 #include "dory_mem.h"
@@ -13,6 +15,8 @@
 
 #define MAINSTACKSIZE 8000
 #define SLAVESTACKSIZE 3800
+#define FLOAT_ABS_TOL 1e-4f
+#define FLOAT_REL_TOL 1e-5f
 
 struct pi_device cluster_dev;
 
@@ -40,8 +44,15 @@ void CompareFloatOnCluster(void *args) {
       float expected_val = expected[i];
       float actual_val = actual[i];
       float diff = expected_val - actual_val;
+      float abs_diff = fabsf(diff);
+      float scale = fabsf(expected_val);
+      float abs_actual = fabsf(actual_val);
+      if (abs_actual > scale) {
+        scale = abs_actual;
+      }
+      float tolerance = FLOAT_ABS_TOL + FLOAT_REL_TOL * scale;
 
-      if ((diff < -1e-4) || (diff > 1e-4) || isnan(diff)) {
+      if ((abs_diff > tolerance) || isnan(diff)) {
         local_err_count += 1;
 
         printf("Expected: %10.6f  ", expected_val);
@@ -125,39 +136,38 @@ int main(void) {
       compbuf = DeeployNetwork_outputs[buf];
     }
 
-    if (ISOUTPUTFLOAT) {
-      float_error_count = 0;
-      float_compare_args.expected = testOutputVector[buf];
-      float_compare_args.actual = compbuf;
-      float_compare_args.num_elements =
-          DeeployNetwork_outputs_bytes[buf] / sizeof(float);
-      float_compare_args.output_buf_index = buf;
-      float_compare_args.err_count = &float_error_count;
-
-      pi_cluster_task(&cluster_task, CompareFloatOnCluster,
-                      &float_compare_args);
-      cluster_task.stack_size = MAINSTACKSIZE;
-      cluster_task.slave_stack_size = SLAVESTACKSIZE;
-      pi_cluster_send_task_to_cl(&cluster_dev, &cluster_task);
-
-      tot_err += float_error_count;
-    } else {
-
-      for (uint32_t i = 0;
-           i < DeeployNetwork_outputs_bytes[buf] / sizeof(OUTPUTTYPE); i++) {
-        OUTPUTTYPE expected = ((OUTPUTTYPE *)testOutputVector[buf])[i];
-        OUTPUTTYPE actual = ((OUTPUTTYPE *)compbuf)[i];
-        int32_t error = expected - actual;
-        OUTPUTTYPE diff = (OUTPUTTYPE)(error < 0 ? -error : error);
-
-        if (diff) {
-          tot_err += 1;
-          printf("Expected: %4d  ", expected);
-          printf("Actual: %4d  ", actual);
-          printf("Diff: %4d at Index %12u in Output %u\r\n", diff, i, buf);
-        }
+#if ISOUTPUTFLOAT == 1
+    float_error_count = 0;
+    float_compare_args.expected = testOutputVector[buf];
+    float_compare_args.actual = compbuf;
+    float_compare_args.num_elements =
+        DeeployNetwork_outputs_bytes[buf] / sizeof(float);
+    float_compare_args.output_buf_index = buf;
+    float_compare_args.err_count = &float_error_count;
+
+    pi_cluster_task(&cluster_task, CompareFloatOnCluster, &float_compare_args);
+    cluster_task.stack_size = MAINSTACKSIZE;
+    cluster_task.slave_stack_size = SLAVESTACKSIZE;
+    pi_cluster_send_task_to_cl(&cluster_dev, &cluster_task);
+
+    tot_err += float_error_count;
+#else
+
+    for (uint32_t i = 0;
+         i < DeeployNetwork_outputs_bytes[buf] / sizeof(OUTPUTTYPE); i++) {
+      OUTPUTTYPE expected = ((OUTPUTTYPE *)testOutputVector[buf])[i];
+      OUTPUTTYPE actual = ((OUTPUTTYPE *)compbuf)[i];
+      int32_t error = expected - actual;
+      OUTPUTTYPE diff = (OUTPUTTYPE)(error < 0 ? -error : error);
+
+      if (diff) {
+        tot_err += 1;
+        printf("Expected: %4d  ", expected);
+        printf("Actual: %4d  ", actual);
+        printf("Diff: %4d at Index %12u in Output %u\r\n", diff, i, buf);
       }
     }
+#endif
     if ((uint32_t)DeeployNetwork_outputs[buf] < 0x1000000) {
       pi_l2_free(compbuf, (int)DeeployNetwork_outputs_bytes[buf]);
     }
@@ -167,4 +177,4 @@ int main(void) {
   printf("Errors: %u out of %u \r\n", tot_err, tot_tested);
 
   return (int)tot_err;
-}
\ No newline at end of file
+}
diff --git a/DeeployTest/Tests/Kernels/FP32/Conv2D/inputs.npz b/DeeployTest/Tests/Kernels/FP32/Conv2D/inputs.npz
new file mode 100644
index 0000000000..54a2d380c8
Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/Conv2D/inputs.npz differ
diff --git a/DeeployTest/Tests/Kernels/FP32/Conv2D/network.onnx b/DeeployTest/Tests/Kernels/FP32/Conv2D/network.onnx
new file mode 100644
index 0000000000..87846a20cd
--- /dev/null
+++ b/DeeployTest/Tests/Kernels/FP32/Conv2D/network.onnx
@@ -0,0 +1,26 @@
+pytorch2.5.1:�
+�
+input
+conv.weight
+	conv.biasoutput
+/conv/Conv"Conv*
+	dilations@@�*
+group�*
+kernel_shape@@�*
+pads@@@@�*
+strides@@�
+main_graph*=Bconv.weightJ$V�#�(7>dw��V3{�Nw���=�iػ�Q�>�T�*B	conv.biasJf��=Z
+input
+
+
+
+
+
+b 
+output
+
+
+
+
+
+B
\ No newline at end of file
diff --git a/DeeployTest/Tests/Kernels/FP32/Conv2D/outputs.npz b/DeeployTest/Tests/Kernels/FP32/Conv2D/outputs.npz
new file mode 100644
index 0000000000..af275c1945
Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/Conv2D/outputs.npz differ
diff --git a/DeeployTest/Tests/Kernels/FP32/ConvTranspose2D/inputs.npz b/DeeployTest/Tests/Kernels/FP32/ConvTranspose2D/inputs.npz
new file mode 100644
index 0000000000..ec6380bb46
Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/ConvTranspose2D/inputs.npz differ
diff --git a/DeeployTest/Tests/Kernels/FP32/ConvTranspose2D/network.onnx b/DeeployTest/Tests/Kernels/FP32/ConvTranspose2D/network.onnx
new file mode 100644
index 0000000000..3c7e61866d
Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/ConvTranspose2D/network.onnx differ
diff --git a/DeeployTest/Tests/Kernels/FP32/ConvTranspose2D/outputs.npz b/DeeployTest/Tests/Kernels/FP32/ConvTranspose2D/outputs.npz
new file mode 100644
index 0000000000..229a14c1ca
Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/ConvTranspose2D/outputs.npz differ
diff --git a/DeeployTest/Tests/Models/Autoencoder2D/GMM/inputs.npz b/DeeployTest/Tests/Models/Autoencoder2D/GMM/inputs.npz
new file mode 100644
index 0000000000..40e2d5e9f2
Binary files /dev/null and b/DeeployTest/Tests/Models/Autoencoder2D/GMM/inputs.npz differ
diff --git a/DeeployTest/Tests/Models/Autoencoder2D/GMM/network.onnx b/DeeployTest/Tests/Models/Autoencoder2D/GMM/network.onnx
new file mode 100644
index 0000000000..12800274dd
Binary files /dev/null and b/DeeployTest/Tests/Models/Autoencoder2D/GMM/network.onnx differ
diff --git a/DeeployTest/Tests/Models/Autoencoder2D/GMM/outputs.npz b/DeeployTest/Tests/Models/Autoencoder2D/GMM/outputs.npz
new file mode 100644
index 0000000000..a774bae2e7
Binary files /dev/null and b/DeeployTest/Tests/Models/Autoencoder2D/GMM/outputs.npz differ
diff --git a/DeeployTest/Tests/Models/Autoencoder2D/MicroBlocks/EncoderPCMR/inputs.npz b/DeeployTest/Tests/Models/Autoencoder2D/MicroBlocks/EncoderPCMR/inputs.npz
new file mode 100644
index 0000000000..feccff239c
Binary files /dev/null and b/DeeployTest/Tests/Models/Autoencoder2D/MicroBlocks/EncoderPCMR/inputs.npz differ
diff --git a/DeeployTest/Tests/Models/Autoencoder2D/MicroBlocks/EncoderPCMR/network.onnx b/DeeployTest/Tests/Models/Autoencoder2D/MicroBlocks/EncoderPCMR/network.onnx
new file mode 100644
index 0000000000..1841e24570
Binary files /dev/null and b/DeeployTest/Tests/Models/Autoencoder2D/MicroBlocks/EncoderPCMR/network.onnx differ
diff --git a/DeeployTest/Tests/Models/Autoencoder2D/MicroBlocks/EncoderPCMR/outputs.npz b/DeeployTest/Tests/Models/Autoencoder2D/MicroBlocks/EncoderPCMR/outputs.npz
new file mode 100644
index 0000000000..281bb9c67f
Binary files /dev/null and b/DeeployTest/Tests/Models/Autoencoder2D/MicroBlocks/EncoderPCMR/outputs.npz differ
diff --git a/DeeployTest/Tests/Models/Autoencoder2D/MicroBlocks/EncoderPCMRLRes/inputs.npz b/DeeployTest/Tests/Models/Autoencoder2D/MicroBlocks/EncoderPCMRLRes/inputs.npz
new file mode 100644
index 0000000000..bf9d7a9807
Binary files /dev/null and b/DeeployTest/Tests/Models/Autoencoder2D/MicroBlocks/EncoderPCMRLRes/inputs.npz differ
diff --git a/DeeployTest/Tests/Models/Autoencoder2D/MicroBlocks/EncoderPCMRLRes/network.onnx b/DeeployTest/Tests/Models/Autoencoder2D/MicroBlocks/EncoderPCMRLRes/network.onnx
new file mode 100644
index 0000000000..bc5a018acc
Binary files /dev/null and b/DeeployTest/Tests/Models/Autoencoder2D/MicroBlocks/EncoderPCMRLRes/network.onnx differ
diff --git a/DeeployTest/Tests/Models/Autoencoder2D/MicroBlocks/EncoderPCMRLRes/outputs.npz b/DeeployTest/Tests/Models/Autoencoder2D/MicroBlocks/EncoderPCMRLRes/outputs.npz
new file mode 100644
index 0000000000..34a6da1215
Binary files /dev/null and b/DeeployTest/Tests/Models/Autoencoder2D/MicroBlocks/EncoderPCMRLRes/outputs.npz differ
diff --git a/DeeployTest/Tests/Models/Autoencoder2D/MicroBlocks/EncoderPCMRLTra/inputs.npz b/DeeployTest/Tests/Models/Autoencoder2D/MicroBlocks/EncoderPCMRLTra/inputs.npz
new file mode 100644
index 0000000000..fc5c4f1431
Binary files /dev/null and b/DeeployTest/Tests/Models/Autoencoder2D/MicroBlocks/EncoderPCMRLTra/inputs.npz differ
diff --git a/DeeployTest/Tests/Models/Autoencoder2D/MicroBlocks/EncoderPCMRLTra/network.onnx b/DeeployTest/Tests/Models/Autoencoder2D/MicroBlocks/EncoderPCMRLTra/network.onnx
new file mode 100644
index 0000000000..44db2d394f
Binary files /dev/null and b/DeeployTest/Tests/Models/Autoencoder2D/MicroBlocks/EncoderPCMRLTra/network.onnx differ
diff --git a/DeeployTest/Tests/Models/Autoencoder2D/MicroBlocks/EncoderPCMRLTra/outputs.npz b/DeeployTest/Tests/Models/Autoencoder2D/MicroBlocks/EncoderPCMRLTra/outputs.npz
new file mode 100644
index 0000000000..5e144188df
Binary files /dev/null and b/DeeployTest/Tests/Models/Autoencoder2D/MicroBlocks/EncoderPCMRLTra/outputs.npz differ
diff --git a/DeeployTest/Tests/Models/Autoencoder2D/MicroBlocks/EncoderPCMRLin/inputs.npz b/DeeployTest/Tests/Models/Autoencoder2D/MicroBlocks/EncoderPCMRLin/inputs.npz
new file mode 100644
index 0000000000..e35f7271e5
Binary files /dev/null and b/DeeployTest/Tests/Models/Autoencoder2D/MicroBlocks/EncoderPCMRLin/inputs.npz differ
diff --git a/DeeployTest/Tests/Models/Autoencoder2D/MicroBlocks/EncoderPCMRLin/network.onnx b/DeeployTest/Tests/Models/Autoencoder2D/MicroBlocks/EncoderPCMRLin/network.onnx
new file mode 100644
index 0000000000..edc86b2c6a
Binary files /dev/null and b/DeeployTest/Tests/Models/Autoencoder2D/MicroBlocks/EncoderPCMRLin/network.onnx differ
diff --git a/DeeployTest/Tests/Models/Autoencoder2D/MicroBlocks/EncoderPCMRLin/outputs.npz b/DeeployTest/Tests/Models/Autoencoder2D/MicroBlocks/EncoderPCMRLin/outputs.npz
new file mode 100644
index 0000000000..00b8a73177
Binary files /dev/null and b/DeeployTest/Tests/Models/Autoencoder2D/MicroBlocks/EncoderPCMRLin/outputs.npz differ
diff --git a/DeeployTest/Tests/Models/Autoencoder2D/MicroBlocks/Encoder_mini/inputs.npz b/DeeployTest/Tests/Models/Autoencoder2D/MicroBlocks/Encoder_mini/inputs.npz
new file mode 100644
index 0000000000..2eaecc5698
Binary files /dev/null and b/DeeployTest/Tests/Models/Autoencoder2D/MicroBlocks/Encoder_mini/inputs.npz differ
diff --git a/DeeployTest/Tests/Models/Autoencoder2D/MicroBlocks/Encoder_mini/network.onnx b/DeeployTest/Tests/Models/Autoencoder2D/MicroBlocks/Encoder_mini/network.onnx
new file mode 100644
index 0000000000..5cef598d23
Binary files /dev/null and b/DeeployTest/Tests/Models/Autoencoder2D/MicroBlocks/Encoder_mini/network.onnx differ
diff --git a/DeeployTest/Tests/Models/Autoencoder2D/MicroBlocks/Encoder_mini/outputs.npz b/DeeployTest/Tests/Models/Autoencoder2D/MicroBlocks/Encoder_mini/outputs.npz
new file mode 100644
index 0000000000..74e3b6eccb
Binary files /dev/null and b/DeeployTest/Tests/Models/Autoencoder2D/MicroBlocks/Encoder_mini/outputs.npz differ
diff --git a/DeeployTest/Tests/Models/Autoencoder2D/MicroBlocks/EnoderCMR/inputs.npz b/DeeployTest/Tests/Models/Autoencoder2D/MicroBlocks/EnoderCMR/inputs.npz
new file mode 100644
index 0000000000..54a2d380c8
Binary files /dev/null and b/DeeployTest/Tests/Models/Autoencoder2D/MicroBlocks/EnoderCMR/inputs.npz differ
diff --git a/DeeployTest/Tests/Models/Autoencoder2D/MicroBlocks/EnoderCMR/network.onnx b/DeeployTest/Tests/Models/Autoencoder2D/MicroBlocks/EnoderCMR/network.onnx
new file mode 100644
index 0000000000..3b485d2c21
Binary files /dev/null and b/DeeployTest/Tests/Models/Autoencoder2D/MicroBlocks/EnoderCMR/network.onnx differ
diff --git a/DeeployTest/Tests/Models/Autoencoder2D/MicroBlocks/EnoderCMR/outputs.npz b/DeeployTest/Tests/Models/Autoencoder2D/MicroBlocks/EnoderCMR/outputs.npz
new file mode 100644
index 0000000000..77bcc55404
Binary files /dev/null and b/DeeployTest/Tests/Models/Autoencoder2D/MicroBlocks/EnoderCMR/outputs.npz differ
diff --git a/DeeployTest/Tests/Models/Autoencoder2D/STD/inputs.npz b/DeeployTest/Tests/Models/Autoencoder2D/STD/inputs.npz
new file mode 100644
index 0000000000..fe562600e8
Binary files /dev/null and b/DeeployTest/Tests/Models/Autoencoder2D/STD/inputs.npz differ
diff --git a/DeeployTest/Tests/Models/Autoencoder2D/STD/network.onnx b/DeeployTest/Tests/Models/Autoencoder2D/STD/network.onnx
new file mode 100644
index 0000000000..2d5cc658d2
Binary files /dev/null and b/DeeployTest/Tests/Models/Autoencoder2D/STD/network.onnx differ
diff --git a/DeeployTest/Tests/Models/Autoencoder2D/STD/outputs.npz b/DeeployTest/Tests/Models/Autoencoder2D/STD/outputs.npz
new file mode 100644
index 0000000000..79dcb5faeb
Binary files /dev/null and b/DeeployTest/Tests/Models/Autoencoder2D/STD/outputs.npz differ
diff --git a/DeeployTest/Tests/Models/GMM/inputs.npz b/DeeployTest/Tests/Models/GMM/inputs.npz
new file mode 100644
index 0000000000..170ef9d86e
Binary files /dev/null and b/DeeployTest/Tests/Models/GMM/inputs.npz differ
diff --git a/DeeployTest/Tests/Models/GMM/network.onnx b/DeeployTest/Tests/Models/GMM/network.onnx
new file mode 100644
index 0000000000..83b97d257b
Binary files /dev/null and b/DeeployTest/Tests/Models/GMM/network.onnx differ
diff --git a/DeeployTest/Tests/Models/GMM/outputs.npz b/DeeployTest/Tests/Models/GMM/outputs.npz
new file mode 100644
index 0000000000..8793bae03e
Binary files /dev/null and b/DeeployTest/Tests/Models/GMM/outputs.npz differ
diff --git a/DeeployTest/testUtils/deeployRunner.py b/DeeployTest/testUtils/deeployRunner.py
index 71b056e9df..b7d2488608 100644
--- a/DeeployTest/testUtils/deeployRunner.py
+++ b/DeeployTest/testUtils/deeployRunner.py
@@ -220,6 +220,10 @@ def create_config_from_args(args: argparse.Namespace,
         gen_args_list.extend(args.input_offset_map)
 
     if tiling:
+        if hasattr(args, 'cores'):
+            gen_args_list.append(f"--cores={args.cores}")
+        elif hasattr(args, 'num_cores'):
+            gen_args_list.append(f"--cores={args.num_cores}")
         if hasattr(args, 'defaultMemLevel') and args.defaultMemLevel:
             gen_args_list.append(f"--defaultMemLevel={args.defaultMemLevel}")
         if hasattr(args, 'doublebuffer') and args.doublebuffer:
diff --git a/DeeployTest/test_siracusa_config.py b/DeeployTest/test_siracusa_config.py
index 8fa105d9f4..ca3a78aa16 100644
--- a/DeeployTest/test_siracusa_config.py
+++ b/DeeployTest/test_siracusa_config.py
@@ -19,6 +19,7 @@
     "Kernels/FP32/Conv/Regular_2D_Bias",
     "Kernels/FP32/Conv/Regular_2D_NoBias",
     "Kernels/FP32/Conv/Regular_2D_ZeroValuedBias",
+    "Kernels/FP32/ConvTranspose2D",
     "Kernels/FP32/GEMM/Regular",
     "Kernels/FP32/MatMul",
     "Kernels/FP32/MaxPool/Regular_2D",
diff --git a/DeeployTest/test_siracusa_tiled_config.py b/DeeployTest/test_siracusa_tiled_config.py
index a687d9a489..f85cd888bc 100644
--- a/DeeployTest/test_siracusa_tiled_config.py
+++ b/DeeployTest/test_siracusa_tiled_config.py
@@ -19,6 +19,7 @@
     "Kernels/FP32/Conv/Regular_2D_Bias": [6600],
     "Kernels/FP32/Conv/Regular_2D_NoBias": [1600],
     "Kernels/FP32/Conv/Regular_2D_ZeroValuedBias": [6600],
+    "Kernels/FP32/ConvTranspose2D": [2000],
     "Kernels/FP32/GEMM/Regular": [8000],
     "Kernels/FP32/MatMul": [2000],
     "Kernels/FP32/MaxPool/Regular_2D": [2000],
@@ -66,6 +67,7 @@
     "Kernels/FP32/Conv/Regular_2D_Bias": [8800],
     "Kernels/FP32/Conv/Regular_2D_NoBias": [2000],
     "Kernels/FP32/Conv/Regular_2D_ZeroValuedBias": [8800],
+    "Kernels/FP32/ConvTranspose2D": [4000],
     "Kernels/FP32/GEMM/Regular": [8000],
     "Kernels/FP32/MatMul": [5000],
     "Kernels/FP32/MaxPool/Regular_2D": [5000],
diff --git a/TargetLibraries/Generic/inc/DeeployBasicMath.h b/TargetLibraries/Generic/inc/DeeployBasicMath.h
index 22081701a3..2d8aff54c0 100644
--- a/TargetLibraries/Generic/inc/DeeployBasicMath.h
+++ b/TargetLibraries/Generic/inc/DeeployBasicMath.h
@@ -34,6 +34,7 @@
 
 #include "kernel/BatchNorm.h"
 #include "kernel/ConvTranspose1d_fp32.h"
+#include "kernel/ConvTranspose2d_fp32.h"
 #include "kernel/Convolution.h"
 #include "kernel/DWConvolution.h"
 #include "kernel/Div.h"
@@ -48,6 +49,7 @@
 #include "kernel/RQDiv.h"
 #include "kernel/RQGELU.h"
 #include "kernel/RQHardswish.h"
+#include "kernel/ReduceLogSumExp.h"
 #include "kernel/Relu.h"
 #include "kernel/RequantShift.h"
 #include "kernel/Softmax.h"
diff --git a/TargetLibraries/Generic/inc/kernel/BatchNorm.h b/TargetLibraries/Generic/inc/kernel/BatchNorm.h
index 72703f5fe2..e7af458f96 100644
--- a/TargetLibraries/Generic/inc/kernel/BatchNorm.h
+++ b/TargetLibraries/Generic/inc/kernel/BatchNorm.h
@@ -11,6 +11,6 @@
 void BatchNorm_fp32(const float32_t *input, const float32_t *gamma,
                     const float32_t *beta, const float32_t *mean,
                     const float32_t *var, float32_t *output, int N, int C,
-                    int L);
+                    int L, float epsilon, int channels_first);
 
 #endif // BATCHNORM_H
diff --git a/TargetLibraries/Generic/inc/kernel/ConvTranspose2d_fp32.h b/TargetLibraries/Generic/inc/kernel/ConvTranspose2d_fp32.h
new file mode 100644
index 0000000000..67a62ef4e5
--- /dev/null
+++ b/TargetLibraries/Generic/inc/kernel/ConvTranspose2d_fp32.h
@@ -0,0 +1,18 @@
+// SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#ifndef CONV_TRANSPOSE2D_FP32_H
+#define CONV_TRANSPOSE2D_FP32_H
+
+#include <stdbool.h>
+#include <stdint.h>
+
+void ConvTranspose2d_fp32(const float32_t *input, uint32_t C_in, uint32_t H_in,
+                          uint32_t W_in, const float32_t *weight,
+                          uint32_t C_out, uint32_t K_h, uint32_t K_w,
+                          uint32_t stride_h, uint32_t stride_w,
+                          const float32_t *bias, bool has_bias,
+                          float32_t *output, uint32_t H_out, uint32_t W_out);
+
+#endif // CONV_TRANSPOSE2D_FP32_H
diff --git a/TargetLibraries/Generic/inc/kernel/ReduceLogSumExp.h b/TargetLibraries/Generic/inc/kernel/ReduceLogSumExp.h
new file mode 100644
index 0000000000..3e40d5cd2c
--- /dev/null
+++ b/TargetLibraries/Generic/inc/kernel/ReduceLogSumExp.h
@@ -0,0 +1,16 @@
+/*
+ * SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#ifndef __DEEPLOY_BASIC_MATH_REDUCELOGSUMEXP_KERNEL_HEADER_
+#define __DEEPLOY_BASIC_MATH_REDUCELOGSUMEXP_KERNEL_HEADER_
+
+#include "DeeployBasicMath.h"
+
+void ReduceLogSumExp_fp32_fp32(float32_t *input, float32_t *output,
+                               uint32_t outer_size, uint32_t axis_length,
+                               uint32_t inner_size);
+
+#endif // __DEEPLOY_BASIC_MATH_REDUCELOGSUMEXP_KERNEL_HEADER_
diff --git a/TargetLibraries/Generic/src/BatchNorm_fp32.c b/TargetLibraries/Generic/src/BatchNorm_fp32.c
index 1e94d63dbb..fd0e40e79d 100644
--- a/TargetLibraries/Generic/src/BatchNorm_fp32.c
+++ b/TargetLibraries/Generic/src/BatchNorm_fp32.c
@@ -8,8 +8,7 @@
 void BatchNorm_fp32(const float32_t *input, const float32_t *gamma,
                     const float32_t *beta, const float32_t *mean,
                     const float32_t *var, float32_t *output, int N, int C,
-                    int L) {
-  const float epsilon = 1e-5f;
+                    int L, float epsilon, int channels_first) {
 #pragma omp parallel for
   for (int c = 0; c < C; ++c) {
     float32_t c_mean = mean[c];
@@ -19,7 +18,12 @@ void BatchNorm_fp32(const float32_t *input, const float32_t *gamma,
     float32_t denom = sqrtf(c_var + epsilon);
     for (int n = 0; n < N; ++n) {
       for (int l = 0; l < L; ++l) {
-        int index = n * C * L + c * L + l;
+        int index;
+        if (channels_first) {
+          index = n * C * L + c * L + l;
+        } else {
+          index = n * C * L + l * C + c;
+        }
         float32_t x = input[index];
         float32_t norm = (x - c_mean) / denom;
         output[index] = c_gamma * norm + c_beta;
diff --git a/TargetLibraries/Generic/src/ConvTranspose2d_fp32.c b/TargetLibraries/Generic/src/ConvTranspose2d_fp32.c
new file mode 100644
index 0000000000..51efc078c7
--- /dev/null
+++ b/TargetLibraries/Generic/src/ConvTranspose2d_fp32.c
@@ -0,0 +1,60 @@
+// SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna
+//
+// SPDX-License-Identifier: Apache-2.0
+
+#include "DeeployBasicMath.h"
+
+void ConvTranspose2d_fp32(const float32_t *input, uint32_t C_in, uint32_t H_in,
+                          uint32_t W_in, const float32_t *weight,
+                          uint32_t C_out, uint32_t K_h, uint32_t K_w,
+                          uint32_t stride_h, uint32_t stride_w,
+                          const float32_t *bias, bool has_bias,
+                          float32_t *output, uint32_t H_out, uint32_t W_out) {
+  /*
+  input:       [C_in, H_in, W_in]
+  weight:      [C_in, C_out, K_h, K_w]
+  output:      [C_out, H_out, W_out]
+  bias:        [C_out] optionally
+  */
+
+  for (uint32_t c = 0; c < C_out; ++c) {
+    for (uint32_t h = 0; h < H_out; ++h) {
+      for (uint32_t w = 0; w < W_out; ++w) {
+        output[(c * H_out + h) * W_out + w] = 0.0f;
+      }
+    }
+  }
+
+  for (uint32_t cout = 0; cout < C_out; ++cout) {
+    for (uint32_t cin = 0; cin < C_in; ++cin) {
+      for (uint32_t h_in = 0; h_in < H_in; ++h_in) {
+        for (uint32_t w_in = 0; w_in < W_in; ++w_in) {
+          float32_t val = input[(cin * H_in + h_in) * W_in + w_in];
+          for (uint32_t kh = 0; kh < K_h; ++kh) {
+            uint32_t h_out = h_in * stride_h + kh;
+            if (h_out >= H_out) {
+              continue;
+            }
+            for (uint32_t kw = 0; kw < K_w; ++kw) {
+              uint32_t w_out = w_in * stride_w + kw;
+              if (w_out >= W_out) {
+                continue;
+              }
+              float32_t wgt =
+                  weight[((cin * C_out + cout) * K_h + kh) * K_w + kw];
+              output[(cout * H_out + h_out) * W_out + w_out] += val * wgt;
+            }
+          }
+        }
+      }
+    }
+
+    if (has_bias) {
+      for (uint32_t h = 0; h < H_out; ++h) {
+        for (uint32_t w = 0; w < W_out; ++w) {
+          output[(cout * H_out + h) * W_out + w] += bias[cout];
+        }
+      }
+    }
+  }
+}
diff --git a/TargetLibraries/Generic/src/ReduceLogSumExp_fp32.c b/TargetLibraries/Generic/src/ReduceLogSumExp_fp32.c
new file mode 100644
index 0000000000..85ae40a60d
--- /dev/null
+++ b/TargetLibraries/Generic/src/ReduceLogSumExp_fp32.c
@@ -0,0 +1,36 @@
+/*
+ * SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "DeeployBasicMath.h"
+#include <math.h>
+
+void ReduceLogSumExp_fp32_fp32(float32_t *input, float32_t *output,
+                               uint32_t outer_size, uint32_t axis_length,
+                               uint32_t inner_size) {
+
+  for (uint32_t outer_idx = 0; outer_idx < outer_size; ++outer_idx) {
+    for (uint32_t inner_idx = 0; inner_idx < inner_size; ++inner_idx) {
+      float32_t max_val = -INFINITY;
+
+      for (uint32_t axis_idx = 0; axis_idx < axis_length; ++axis_idx) {
+        uint32_t input_idx =
+            (outer_idx * axis_length + axis_idx) * inner_size + inner_idx;
+        if (input[input_idx] > max_val) {
+          max_val = input[input_idx];
+        }
+      }
+
+      float32_t sum_exp = 0.0f;
+      for (uint32_t axis_idx = 0; axis_idx < axis_length; ++axis_idx) {
+        uint32_t input_idx =
+            (outer_idx * axis_length + axis_idx) * inner_size + inner_idx;
+        sum_exp += expf(input[input_idx] - max_val);
+      }
+
+      output[outer_idx * inner_size + inner_idx] = logf(sum_exp) + max_val;
+    }
+  }
+}
diff --git a/TargetLibraries/PULPOpen/inc/DeeployPULPMath.h b/TargetLibraries/PULPOpen/inc/DeeployPULPMath.h
index f6e8308c97..4800e6912d 100644
--- a/TargetLibraries/PULPOpen/inc/DeeployPULPMath.h
+++ b/TargetLibraries/PULPOpen/inc/DeeployPULPMath.h
@@ -29,9 +29,11 @@
 #include "kernel/Matmul.h"
 #include "kernel/MaxPool.h"
 #include "kernel/RQiHardswish.h"
+#include "kernel/Relu.h"
 #include "kernel/RequantShift.h"
 #include "kernel/Softmax.h"
 #include "kernel/UniformRequantShift.h"
+#include "kernel/gemm.h"
 #include "kernel/gemv.h"
 #include "kernel/iRMSnorm.h"
 
diff --git a/TargetLibraries/PULPOpen/inc/kernel/Conv.h b/TargetLibraries/PULPOpen/inc/kernel/Conv.h
index 3ebab54a0b..b5f0e57ea8 100644
--- a/TargetLibraries/PULPOpen/inc/kernel/Conv.h
+++ b/TargetLibraries/PULPOpen/inc/kernel/Conv.h
@@ -35,4 +35,14 @@ void PULP_DW_Conv2d_Im2Col_fp32_fp32_fp32_HWC(
     uint32_t pad_left, uint32_t pad_right,
     float32_t *__restrict__ pContextBuffer);
 
-#endif // __DEEPLOY_MATH_CONV_KERNEL_HEADER_
\ No newline at end of file
+void PULP_ConvTranspose2d_fp32_fp32_fp32_CHW(
+    const float32_t *__restrict__ pSrcA, uint32_t C_in, uint32_t H_in,
+    uint32_t W_in, const float32_t *__restrict__ pSrcB, uint32_t C_out,
+    uint32_t groups, uint32_t K_h, uint32_t K_w, uint32_t stride_h,
+    uint32_t stride_w, uint32_t dilation_h, uint32_t dilation_w,
+    uint32_t pad_top, uint32_t pad_bottom, uint32_t pad_left,
+    uint32_t pad_right, const float32_t *__restrict__ pSrcBias,
+    const bool has_bias, float32_t *__restrict__ pDstC, uint32_t H_out,
+    uint32_t W_out);
+
+#endif // __DEEPLOY_MATH_CONV_KERNEL_HEADER_
diff --git a/TargetLibraries/PULPOpen/inc/kernel/MaxPool.h b/TargetLibraries/PULPOpen/inc/kernel/MaxPool.h
index b37487439f..021cb5f9b9 100644
--- a/TargetLibraries/PULPOpen/inc/kernel/MaxPool.h
+++ b/TargetLibraries/PULPOpen/inc/kernel/MaxPool.h
@@ -10,10 +10,10 @@
 #include "DeeployPULPMath.h"
 
 void PULP_MaxPool2d_fp32_fp32_HWC(const float32_t *__restrict__ pSrcA,
-                                  uint32_t W, uint32_t H, uint32_t C,
+                                  uint32_t H, uint32_t W, uint32_t C,
                                   uint32_t Q, uint32_t P, uint32_t SQ,
                                   uint32_t SP, float32_t *__restrict__ pDstC,
                                   uint32_t pad_top, uint32_t pad_bottom,
                                   uint32_t pad_left, uint32_t pad_right);
 
-#endif // __DEEPLOY_MATH_MAXPOOL_KERNEL_HEADER_
\ No newline at end of file
+#endif // __DEEPLOY_MATH_MAXPOOL_KERNEL_HEADER_
diff --git a/TargetLibraries/PULPOpen/src/ConvTranspose_fp32.c b/TargetLibraries/PULPOpen/src/ConvTranspose_fp32.c
new file mode 100644
index 0000000000..8e46708273
--- /dev/null
+++ b/TargetLibraries/PULPOpen/src/ConvTranspose_fp32.c
@@ -0,0 +1,91 @@
+/*
+ * SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ */
+
+#include "DeeployPULPMath.h"
+#include "pmsis.h"
+
+__attribute__((noinline, optnone)) void PULP_ConvTranspose2d_fp32_fp32_fp32_CHW(
+    const float32_t *__restrict__ pSrcA, uint32_t C_in, uint32_t H_in,
+    uint32_t W_in, const float32_t *__restrict__ pSrcB, uint32_t C_out,
+    uint32_t groups, uint32_t K_h, uint32_t K_w, uint32_t stride_h,
+    uint32_t stride_w, uint32_t dilation_h, uint32_t dilation_w,
+    uint32_t pad_top, uint32_t pad_bottom, uint32_t pad_left,
+    uint32_t pad_right, const float32_t *__restrict__ pSrcBias,
+    const bool has_bias, float32_t *__restrict__ pDstC, uint32_t H_out,
+    uint32_t W_out) {
+
+  (void)pad_bottom;
+  (void)pad_right;
+
+  int8_t core_id = pi_core_id();
+  int8_t log2Core = LOG2(NUM_CORES);
+
+  uint16_t ch_out_chunk =
+      (C_out >> log2Core) + ((C_out & (NUM_CORES - 1)) != 0);
+  uint16_t ch_out_start = MIN(ch_out_chunk * core_id, C_out);
+  uint16_t ch_out_stop = MIN(ch_out_start + ch_out_chunk, C_out);
+  uint16_t ch_out_count = ch_out_stop - ch_out_start;
+
+  if (ch_out_count == 0) {
+    return;
+  }
+
+  uint32_t output_plane = H_out * W_out;
+
+  for (uint32_t cout = ch_out_start; cout < ch_out_stop; ++cout) {
+    float32_t init = has_bias ? pSrcBias[cout] : 0.0f;
+    float32_t *out_ptr = pDstC + cout * output_plane;
+    for (uint32_t idx = 0; idx < output_plane; ++idx) {
+      out_ptr[idx] = init;
+    }
+  }
+
+  uint32_t ch_in_per_group = C_in / groups;
+  uint32_t ch_out_per_group = C_out / groups;
+
+  for (uint32_t cout = ch_out_start; cout < ch_out_stop; ++cout) {
+    uint32_t group_idx = cout / ch_out_per_group;
+    uint32_t cout_in_group = cout % ch_out_per_group;
+
+    for (uint32_t cin_in_group = 0; cin_in_group < ch_in_per_group;
+         ++cin_in_group) {
+      uint32_t cin = group_idx * ch_in_per_group + cin_in_group;
+
+      for (uint32_t h_in = 0; h_in < H_in; ++h_in) {
+        for (uint32_t w_in = 0; w_in < W_in; ++w_in) {
+          float32_t val = pSrcA[(cin * H_in + h_in) * W_in + w_in];
+
+          for (uint32_t kh = 0; kh < K_h; ++kh) {
+            int32_t h_out =
+                (int32_t)(h_in * stride_h + kh * dilation_h) - (int32_t)pad_top;
+
+            if (h_out < 0 || h_out >= (int32_t)H_out) {
+              continue;
+            }
+
+            for (uint32_t kw = 0; kw < K_w; ++kw) {
+              int32_t w_out = (int32_t)(w_in * stride_w + kw * dilation_w) -
+                              (int32_t)pad_left;
+
+              if (w_out < 0 || w_out >= (int32_t)W_out) {
+                continue;
+              }
+
+              uint32_t weight_idx =
+                  (((cin * ch_out_per_group + cout_in_group) * K_h + kh) *
+                   K_w) +
+                  kw;
+              uint32_t out_idx =
+                  (cout * H_out + (uint32_t)h_out) * W_out + (uint32_t)w_out;
+
+              pDstC[out_idx] += val * pSrcB[weight_idx];
+            }
+          }
+        }
+      }
+    }
+  }
+}
diff --git a/TargetLibraries/PULPOpen/src/MaxPool.c b/TargetLibraries/PULPOpen/src/MaxPool.c
index 3b630b97cc..5ae8a2adb9 100644
--- a/TargetLibraries/PULPOpen/src/MaxPool.c
+++ b/TargetLibraries/PULPOpen/src/MaxPool.c
@@ -8,7 +8,7 @@
 #include "pmsis.h"
 
 void PULP_MaxPool2d_fp32_fp32_HWC(const float32_t *__restrict__ pSrcA,
-                                  uint32_t W, uint32_t H, uint32_t C,
+                                  uint32_t H, uint32_t W, uint32_t C,
                                   uint32_t Q, uint32_t P, uint32_t SQ,
                                   uint32_t SP, float32_t *__restrict__ pDstC,
                                   uint32_t pad_top, uint32_t pad_bottom,
@@ -60,4 +60,4 @@ void PULP_MaxPool2d_fp32_fp32_HWC(const float32_t *__restrict__ pSrcA,
       }
     }
   }
-}
\ No newline at end of file
+}