diff --git a/.vscode/launch.json b/.vscode/launch.json index c9238e502b..82f5390f5f 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -11,9 +11,18 @@ "program": "generateNetwork.py", "console": "integratedTerminal", "cwd": "${workspaceFolder}/DeeployTest", + "env": { + "PYTHONPATH": "${workspaceFolder}" + }, + "subProcess": true, "justMyCode": false, - "args": - "-p${input:platformUntiled} -t${input:model} ${input:additionalArgsUntiled}" + "args": [ + "-p", + "${input:platformUntiled}", + "-t", + "${input:model}", + "${input:additionalArgsUntiled}" + ] }, { "name": "Deeploy Generate Tiled", @@ -22,9 +31,18 @@ "program": "testMVP.py", "console": "integratedTerminal", "cwd": "${workspaceFolder}/DeeployTest", + "env": { + "PYTHONPATH": "${workspaceFolder}" + }, + "subProcess": true, "justMyCode": false, - "args": - "-p${input:platformTiled} -t${input:model} ${input:additionalArgsTiled}" + "args": [ + "-p", + "${input:platformTiled}", + "-t", + "${input:model}", + "${input:additionalArgsTiled}" + ] } ], "inputs": [ @@ -85,7 +103,7 @@ "id": "additionalArgsTiled", "type": "promptString", "description": "Additional Arguments", - "default": "-v --doublebuffer" + "default": "--doublebuffer" } ] -} \ No newline at end of file +} diff --git a/Deeploy/CommonExtensions/CodeTransformationPasses/MemoryAllocation.py b/Deeploy/CommonExtensions/CodeTransformationPasses/MemoryAllocation.py index 609a179c7b..e3ba2f8e1c 100644 --- a/Deeploy/CommonExtensions/CodeTransformationPasses/MemoryAllocation.py +++ b/Deeploy/CommonExtensions/CodeTransformationPasses/MemoryAllocation.py @@ -276,6 +276,7 @@ def apply(self, for buffer in inputs + transients: assert buffer._live == True, f"Tried to deallocate already dead buffer {buffer.name}" + buffer._live = False # Don't deallocate if it's an alias of a live buffer if not buffer.has_live_aliases(ctxt): @@ -363,7 +364,6 @@ def apply(self, for buffer in inputs + transients: assert buffer._live == True, f"Tried to deallocate already dead buffer {buffer.name}" - memoryLevel = "None" if not hasattr(buffer, "_memoryLevel") else buffer._memoryLevel if memoryLevel not in ctxt._dynamicSize: ctxt._dynamicSize[memoryLevel] = 0 diff --git a/Deeploy/DeeployTypes.py b/Deeploy/DeeployTypes.py index 44abe85112..a96d5fc046 100644 --- a/Deeploy/DeeployTypes.py +++ b/Deeploy/DeeployTypes.py @@ -63,6 +63,37 @@ class CodeGenVerbosity: _backendPostParsingFilename = 'backend_post_parsing' _backendPostBindingFilename = 'backend_post_binding' + +def _deeployTypeToNpType(ty: Type[BaseType]): + + def _broadcastInteger(ty: Type[IntegerImmediate]): + if ty.signed: + return np.dtype(getattr(np, "int" + str(ty.typeWidth))) + else: + return np.dtype(getattr(np, "uint" + str(ty.typeWidth))) + + def _broadcastFloat(ty: Type[FloatImmediate]): + if ty.typeWidth == 16: + return np.dtype(np.float16) + if ty.typeWidth == 32: + return np.dtype(np.float32) + if ty.typeWidth == 64: + return np.dtype(np.float64) + return np.dtype(np.float32) + + if issubclass(ty, Pointer) and hasattr(ty, "referencedType"): + if issubclass(ty.referencedType, IntegerImmediate): + return _broadcastInteger(ty.referencedType) + if issubclass(ty.referencedType, FloatImmediate): + return _broadcastFloat(ty.referencedType) + elif issubclass(ty, IntegerImmediate): + return _broadcastInteger(ty) + elif issubclass(ty, FloatImmediate): + return _broadcastFloat(ty) + + return None + + _ctxtExtension = '.pkl' _graphExtension = '.onnx' _dataExtension = '.data' @@ -415,7 +446,12 @@ def __eq__(self, other): def _valueString(self) -> str: values = list(self.values.reshape(-1)) if self._type.typeName == 'float32_t*': - strValues = [f'{value}f' for value in values] + strValues = [] + for value in values: + literal = f"{float(value):.9g}" + if "e" not in literal and "." not in literal: + literal += ".0" + strValues.append(literal + "f") elif self._type.typeName == 'int8_t*': strValues = [f'{int(value)}' for value in values] else: @@ -977,8 +1013,6 @@ def hoistConstant(self, Returns the name of the newly registed ConstantBuffer """ - assert len(constant.outputs) <= 1, f"Constant {constant.name} has more than one output" - name = name if name is not None else constant.name # LMACAN: The shape needs to be copied into a tuple for pickling to work. Don't ask me why.. @@ -2027,25 +2061,7 @@ def parse(self, ctxt: NetworkContext, default_channels_first: bool) -> Tuple[Net return ctxt, False def _broadcastToNpType(self, ty: Type[BaseType]): - - def _broadcastInteger(ty: Type[IntegerImmediate]): - if ty.signed: - return np.dtype(getattr(np, "int" + str(ty.typeWidth))) - else: - return np.dtype(getattr(np, "uint" + str(ty.typeWidth))) - - def _broadcastFloat(ty: Type[FloatImmediate]): - return np.dtype(getattr(np, "double")) - - if issubclass(ty, Pointer) and hasattr(ty, "referencedType"): - if issubclass(ty.referencedType, IntegerImmediate): - return _broadcastInteger(ty.referencedType) - elif issubclass(ty, IntegerImmediate): - return _broadcastInteger(ty) - elif issubclass(ty, FloatImmediate): - return _broadcastFloat(ty) - - return None + return _deeployTypeToNpType(ty) def typeCheck(self, ctxt: NetworkContext) -> Tuple[NetworkContext, bool]: """Invokes the mapper's typeCheck method @@ -2106,8 +2122,9 @@ def bind(self, ctxt: NetworkContext) -> Tuple[NetworkContext, bool]: elif ctxt.is_global(node.name): npType = self._broadcastToNpType(ctxt.globalObjects[node.name]._type) if isinstance(ctxt.globalObjects[node.name], ConstantBuffer): - if isinstance(node, gs.Constant): + if isinstance(node, gs.Constant) and npType is not None: node.values = node.values.astype(npType) + node.export_dtype = npType else: node.shape = ctxt.globalObjects[node.name].shape if npType is not None: @@ -2856,7 +2873,17 @@ def generateInferenceInitializationCode(self) -> str: name = node.name node.name = self.ctxt._mangle(node.name) - callStack += node.init() + + if ("TILING_CODEGEN" not in node.name and isinstance(node, VariableBuffer) and hasattr(node, "_type") + and issubclass(node._type, Pointer)): + # Local inference buffers are late-bound by the generated layer code. Initializing them to NULL keeps + # clang from flagging false-positive uninitialized reads on paths where the assignment is emitted in a + # separate closure, and marking them unused avoids noise for scratch buffers that are reserved + # generically but optimized away for a specific layer instance. + typeName = node._instance.typeName if hasattr(node, "_instance") else node._type.typeName + callStack += f"{typeName} {node.name} __attribute__((unused)) = NULL;\n" + else: + callStack += node.init() node.name = name return callStack @@ -3121,6 +3148,15 @@ def _exportGraph(self, folderPath, fileName): # VJUNG: ONNX-Graphsurgeon needs tensors to be in their export types constTensors = [tensor for tensor in self.graph.tensors().values() if isinstance(tensor, gs.Constant)] for tensor in constTensors: + if tensor.name in self.ctxt.globalObjects: + ctxtTensor = self.ctxt.globalObjects[tensor.name] + if isinstance(ctxtTensor, ConstantBuffer) and hasattr(ctxtTensor, "_type"): + npType = _deeployTypeToNpType(ctxtTensor._type) + if npType is not None: + tensor.values = tensor.values.astype(npType) + tensor.export_dtype = npType + continue + if tensor.dtype != tensor.export_dtype: tensor.values = tensor.values.astype(tensor.export_dtype) diff --git a/Deeploy/EngineExtension/OptimizationPasses/TopologyOptimizationPasses/EngineColoringPasses.py b/Deeploy/EngineExtension/OptimizationPasses/TopologyOptimizationPasses/EngineColoringPasses.py index 82b7d1fde4..fb4a647634 100644 --- a/Deeploy/EngineExtension/OptimizationPasses/TopologyOptimizationPasses/EngineColoringPasses.py +++ b/Deeploy/EngineExtension/OptimizationPasses/TopologyOptimizationPasses/EngineColoringPasses.py @@ -36,6 +36,8 @@ def apply(self, graph: gs.Graph) -> Tuple[gs.Graph]: engine = self.engineMapper.mapNodeToEngine(node, graph) if engine is not None: node.attrs["engine"] = engine.name + if hasattr(engine, "n_cores"): + node.attrs["n_cores"] = engine.n_cores return graph diff --git a/Deeploy/Targets/Generic/Bindings.py b/Deeploy/Targets/Generic/Bindings.py index 308b179aef..807f3ba5cd 100644 --- a/Deeploy/Targets/Generic/Bindings.py +++ b/Deeploy/Targets/Generic/Bindings.py @@ -15,16 +15,16 @@ ConvTransposeTemplate, DebugPrintTemplate, DequantTemplate, DummyTemplate, DWConvTemplate, FloatAddTemplate, \ FloatConvTemplate, FloatDivTemplate, FloatDWConvTemplate, FloatGELUTemplate, FloatGemmTemplate, \ FloatLayernormTemplate, FloatMatMulTemplate, FloatMaxPoolTemplate, FloatMulTemplate, FloatPadTemplate, \ - FloatPowTemplate, FloatReduceMeanTemplate, FloatReluTemplate, FloatSoftmaxTemplate, FloatSqrtTemplate, \ - GatherTemplate, GemmTemplate, IntegerDivTemplate, ITAMaxTemplate, ITAPartialMaxTemplate, MatMulTemplate, \ - MaxPoolTemplate, MulTemplate, PadTemplate, QuantTemplate, ReduceMeanTemplate, ReduceSumTemplate, \ + FloatPowTemplate, FloatReduceLogSumExpTemplate, FloatReduceMeanTemplate, FloatReluTemplate, FloatSoftmaxTemplate, \ + FloatSqrtTemplate, GatherTemplate, GemmTemplate, IntegerDivTemplate, ITAMaxTemplate, ITAPartialMaxTemplate, \ + MatMulTemplate, MaxPoolTemplate, MulTemplate, PadTemplate, QuantTemplate, ReduceMeanTemplate, ReduceSumTemplate, \ RequantShiftTemplate, ReshapeTemplate, RQIntegerDivTemplate, RQSiGELUTemplate, SliceTemplate, TransposeTemplate, \ iGELUTemplate, iLayernormTemplate, iRMSNormTemplate, iSoftmaxTemplate from Deeploy.Targets.Generic.TypeCheckers import AddChecker, BatchNormChecker, ConcatChecker, ConvChecker, \ DebugPrintChecker, DequantChecker, DivChecker, DummyChecker, GatherChecker, GELUChecker, GEMMChecker, \ - LayerNormChecker, MatMulChecker, MaxPoolChecker, MulChecker, PadChecker, QuantChecker, ReduceMeanChecker, \ - ReduceSumChecker, ReluChecker, RequantShiftChecker, ReshapeChecker, RQIntegerDivChecker, SliceChecker, \ - SoftmaxChecker, TransposeChecker + LayerNormChecker, MatMulChecker, MaxPoolChecker, MulChecker, PadChecker, QuantChecker, ReduceLogSumExpChecker, \ + ReduceMeanChecker, ReduceSumChecker, ReluChecker, RequantShiftChecker, ReshapeChecker, RQIntegerDivChecker, \ + SliceChecker, SoftmaxChecker, TransposeChecker BasicTransformer = CodeTransformation([ArgumentStructGeneration(), MemoryManagementGeneration(), FutureGeneration()]) @@ -227,6 +227,11 @@ BasicTransformer) for type in SignedIntegerDataTypes ] +BasicReduceLogSumExpBindings = [ + NodeBinding(ReduceLogSumExpChecker([PointerClass(float32_t)], [PointerClass(float32_t)]), + FloatReduceLogSumExpTemplate.referenceTemplate, BasicTransformer) +] + BasicReluBinding = NodeBinding(ReluChecker([PointerClass(float32_t)], [PointerClass(float32_t)]), FloatReluTemplate.referenceTemplate, BasicTransformer) @@ -286,6 +291,9 @@ BasicConcatBindings = [ NodeBinding(ConcatChecker([PointerClass(type), PointerClass(type)], [PointerClass(type)]), ConcatTemplate.referenceTemplate, BasicTransformer) for type in IntegerDataTypes +] + [ + NodeBinding(ConcatChecker([PointerClass(float32_t), PointerClass(float32_t)], [PointerClass(float32_t)]), + ConcatTemplate.referenceTemplate, BasicTransformer) ] BasicQuantBindings = [ @@ -312,18 +320,34 @@ for type in FloatDataTypes ] -BasicConvTransposeBindings = [ +BasicConvTranspose1DBindings = [ + NodeBinding( + ConvChecker( + [PointerClass(type), PointerClass(type), PointerClass(type)], # input, weight, bias + [PointerClass(type)]), + ConvTransposeTemplate.reference1DTemplate, + BasicTransformer) for type in FloatDataTypes +] + [ + NodeBinding( + ConvChecker( + [PointerClass(type), PointerClass(type)], # input, weight + [PointerClass(type)]), + ConvTransposeTemplate.reference1DTemplate, + BasicTransformer) for type in FloatDataTypes +] + +BasicConvTranspose2DBindings = [ NodeBinding( ConvChecker( [PointerClass(type), PointerClass(type), PointerClass(type)], # input, weight, bias [PointerClass(type)]), - ConvTransposeTemplate.referenceTemplate, + ConvTransposeTemplate.reference2DTemplate, BasicTransformer) for type in FloatDataTypes ] + [ NodeBinding( ConvChecker( [PointerClass(type), PointerClass(type)], # input, weight [PointerClass(type)]), - ConvTransposeTemplate.referenceTemplate, + ConvTransposeTemplate.reference2DTemplate, BasicTransformer) for type in FloatDataTypes ] diff --git a/Deeploy/Targets/Generic/Layers.py b/Deeploy/Targets/Generic/Layers.py index cc733937cc..2d011e21f9 100644 --- a/Deeploy/Targets/Generic/Layers.py +++ b/Deeploy/Targets/Generic/Layers.py @@ -340,6 +340,12 @@ def computeShapes(self, inputShapes: Shape, outputShapes: Shape, operatorReprese if inputShapes[1] == () or inputShapes[1] == []: inputShapes[1] = (1,) + # Scalars and singletons should broadcast to the tensor operand, + # not shrink the tensor shape to (1,). + if tuple(inputShapes[1]) == (1,): + inputShapes[1] = inputShapes[0] + return (inputShapes, outputShapes) + if len(inputShapes[0]) > len(inputShapes[1]): inputShapes[1] = inputShapes[0] else: @@ -438,6 +444,27 @@ def computeShapes(self, inputShapes: Shape, outputShapes: Shape, operatorReprese return (inputShapes, outputShapes) +class ReduceLogSumExpLayer(ONNXLayer): + + def __init__(self, maps: List[NodeMapper]): + super().__init__(maps) + + def computeShapes(self, inputShapes: Shape, outputShapes: Shape, operatorRepresentation, + channels_first) -> Tuple[Shape, Shape]: + axis = operatorRepresentation['axes'][0] + inputShape = list(copy.deepcopy(inputShapes[0])) + + if operatorRepresentation['keepdims']: + outputShape = inputShape + outputShape[axis] = 1 + else: + outputShape = inputShape[:axis] + inputShape[axis + 1:] + if len(outputShape) == 0: + outputShape = [1] + + return (inputShapes, [outputShape]) + + class ReluLayer(ONNXLayer): def __init__(self, maps: List[NodeMapper]): diff --git a/Deeploy/Targets/Generic/Parsers.py b/Deeploy/Targets/Generic/Parsers.py index ad787d9e4b..3b69f9e526 100644 --- a/Deeploy/Targets/Generic/Parsers.py +++ b/Deeploy/Targets/Generic/Parsers.py @@ -6,6 +6,7 @@ from typing import Tuple import numpy as np +import onnx import onnx_graphsurgeon as gs from Deeploy.DeeployTypes import ConstantBuffer, NetworkContext, NodeParser, VariableBuffer @@ -334,25 +335,99 @@ class PadParser(NodeParser): def __init__(self): super().__init__() + def _evaluate_constant_tensor(self, tensor: gs.Tensor): + + if isinstance(tensor, gs.Constant): + return np.asarray(tensor.values) + + if not hasattr(tensor, "inputs") or len(tensor.inputs) != 1: + return None + + node = tensor.inputs[0] + input_values = [] + for input_tensor in node.inputs: + value = self._evaluate_constant_tensor(input_tensor) + if value is None: + return None + input_values.append(value) + + if node.op == "Constant": + value = node.attrs.get("value") + if value is None: + return None + return np.asarray(value.values) + + if node.op == "Cast": + cast_dtype = onnx.helper.tensor_dtype_to_np_dtype(node.attrs["to"]) + return input_values[0].astype(cast_dtype) + + if node.op == "Reshape": + return np.reshape(input_values[0], input_values[1].astype(np.int64).tolist()) + + if node.op == "Concat": + return np.concatenate(input_values, axis = node.attrs["axis"]) + + if node.op == "Transpose": + return np.transpose(input_values[0], axes = node.attrs["perm"]) + + if node.op == "ConstantOfShape": + fill_value = node.attrs.get("value") + if fill_value is None: + scalar = np.array(0, dtype = np.float32) + else: + scalar = np.asarray(fill_value.values).reshape(-1)[0] + return np.full(input_values[0].astype(np.int64).tolist(), scalar, dtype = np.asarray(scalar).dtype) + + if node.op == "Slice": + data = input_values[0] + starts = input_values[1].astype(np.int64).reshape(-1) + ends = input_values[2].astype(np.int64).reshape(-1) + axes = input_values[3].astype(np.int64).reshape(-1) if len(input_values) >= 4 else np.arange(len(starts)) + steps = input_values[4].astype(np.int64).reshape(-1) if len(input_values) >= 5 else np.ones( + len(starts), dtype = np.int64) + + slices = [slice(None)] * data.ndim + for start, end, axis, step in zip(starts, ends, axes, steps): + slices[int(axis)] = slice(int(start), int(end), int(step)) + + return data[tuple(slices)] + + return None + def parseNode(self, node: gs.Node) -> bool: - ret = all([ - 'mode' in node.attrs, 'pads' in node.attrs, 'value' in node.attrs, - len(node.inputs) == 1, - len(node.outputs) == 1 - ]) + ret = all(['mode' in node.attrs, len(node.outputs) == 1]) if ret: self.operatorRepresentation['mode'] = node.attrs['mode'] + self.operatorRepresentation['value'] = 0 - try: - self.operatorRepresentation['pads'] = [int(p) for p in node.attrs['pads']] - except Exception as e: - self.operatorRepresentation['pads'] = node.attrs['pads'] + if 'pads' in node.attrs and len(node.inputs) == 1: + try: + self.operatorRepresentation['pads'] = [int(p) for p in node.attrs['pads']] + except Exception: + self.operatorRepresentation['pads'] = node.attrs['pads'] - self.operatorRepresentation['value'] = node.attrs['value'] + if 'value' in node.attrs: + self.operatorRepresentation['value'] = node.attrs['value'] + return True - return ret + if len(node.inputs) in (2, 3): + pads = self._evaluate_constant_tensor(node.inputs[1]) + if pads is None: + return False + + self.operatorRepresentation['pads'] = [int(p) for p in np.asarray(pads).reshape(-1)] + + if len(node.inputs) == 3: + value = self._evaluate_constant_tensor(node.inputs[2]) + if value is None: + return False + self.operatorRepresentation['value'] = np.asarray(value).reshape(-1)[0].item() + + return True + + return False def parseNodeCtxt(self, ctxt: NetworkContext, @@ -366,6 +441,16 @@ def parseNodeCtxt(self, self.operatorRepresentation['data_in_size'] = np.prod(data_in.shape) self.operatorRepresentation['data_out_size'] = np.prod(data_out.shape) + # Keep optional constant Pad inputs visible to the tiler. + # The template uses the decoded scalar/list values, but the tiled + # scheduler still tracks the original constant tensors as node inputs. + if len(node.inputs) >= 2: + pads_tensor = ctxt.lookup(node.inputs[1].name) + self.operatorRepresentation['pads_tensor'] = pads_tensor.name + if len(node.inputs) >= 3: + value_tensor = ctxt.lookup(node.inputs[2].name) + self.operatorRepresentation['value_tensor'] = value_tensor.name + return ctxt, True @@ -623,6 +708,118 @@ def parseNodeCtxt(self, return newCtxt, ret +class ReduceLogSumExpParser(NodeParser): + + def __init__(self): + super().__init__() + + def parseNode(self, node: gs.Node) -> bool: + if len(node.inputs) < 1 or len(node.inputs) > 2 or len(node.outputs) != 1: + return False + + if 'keepdims' not in node.attrs: + return False + + self.operatorRepresentation['keepdims'] = int(node.attrs['keepdims']) + if len(node.inputs[0].shape) == 0: + return False + + if len(node.inputs) == 2: + if not isinstance(node.inputs[1], gs.Constant): + return False + axes = np.array(node.inputs[1].values, dtype = np.int64) + else: + if 'axes' not in node.attrs: + return False + axes = node.attrs['axes'] + if isinstance(axes, int): + axes = [axes] + axes = np.array(axes, dtype = np.int64) + + normalized_axes = [] + rank = len(node.inputs[0].shape) + for axis in axes: + normalized_axis = int(axis) + if normalized_axis < 0: + normalized_axis += rank + normalized_axes.append(normalized_axis) + + if len(normalized_axes) != 1: + return False + + axis = normalized_axes[0] + if axis < 0 or axis >= rank: + return False + + self.operatorRepresentation['axes'] = np.array([axis], dtype = np.int64) + + if self.operatorRepresentation['keepdims']: + output_shape = list(node.inputs[0].shape) + output_shape[axis] = 1 + else: + output_shape = list(node.inputs[0].shape[:axis]) + list(node.inputs[0].shape[axis + 1:]) + if len(output_shape) == 0: + output_shape = [1] + + node.outputs[0].shape = output_shape + return True + + def parseNodeCtxt(self, + ctxt: NetworkContext, + node: gs.Node, + channels_first: bool = True) -> Tuple[NetworkContext, bool]: + + data_in = ctxt.lookup(node.inputs[0].name) + data_out = ctxt.lookup(node.outputs[0].name) + + if len(node.inputs) == 2: + axes_buffer = ctxt.lookup(node.inputs[1].name) + axes_buffer._live = False + axes_buffer._deploy = False + axes = np.array(axes_buffer.values, dtype = np.int64) + else: + axes = np.array(self.operatorRepresentation['axes'], dtype = np.int64) + + normalized_axes = [] + for axis in axes: + normalized_axis = int(axis) + if normalized_axis < 0: + normalized_axis += len(data_in.shape) + normalized_axes.append(normalized_axis) + + if len(normalized_axes) != 1: + return ctxt, False + + axis = normalized_axes[0] + if axis < 0 or axis >= len(data_in.shape): + return ctxt, False + + outer_size = int(np.prod(data_in.shape[:axis])) if axis > 0 else 1 + inner_size = int(np.prod(data_in.shape[axis + 1:])) if axis + 1 < len(data_in.shape) else 1 + if self.operatorRepresentation['keepdims']: + output_shape = list(data_in.shape) + output_shape[axis] = 1 + else: + output_shape = list(data_in.shape[:axis]) + list(data_in.shape[axis + 1:]) + if len(output_shape) == 0: + output_shape = [1] + + data_out.shape = output_shape + node.outputs[0].shape = output_shape + + self.operatorRepresentation['data_in'] = data_in.name + self.operatorRepresentation['data_out'] = data_out.name + self.operatorRepresentation['data_in_shape'] = data_in.shape + self.operatorRepresentation['data_out_shape'] = output_shape + self.operatorRepresentation['size'] = int(np.prod(data_in.shape)) + self.operatorRepresentation['axes'] = np.array([axis], dtype = np.int64) + self.operatorRepresentation['axisLength'] = data_in.shape[axis] + self.operatorRepresentation['outerSize'] = outer_size + self.operatorRepresentation['innerSize'] = inner_size + + return ctxt, True + + class SoftmaxParser(NodeParser): def __init__(self): @@ -2703,14 +2900,11 @@ def __init__(self): super().__init__() def parseNode(self, node: gs.Node) -> bool: - # Verify the attributes (epsilon is mandatory, momentum and training_mode are optional) - if 'epsilon' not in node.attrs: - return False # Common Inputs: 5 (X, scale, B, mean, var) if len(node.inputs) < 5: return False - # Save the attributes, default values are provided if not present + # Save attributes (ONNX defaults when attributes are omitted) self.operatorRepresentation['epsilon'] = node.attrs.get('epsilon', 1e-5) self.operatorRepresentation['momentum'] = node.attrs.get('momentum', 0.9) self.operatorRepresentation['training_mode'] = node.attrs.get('training_mode', 0) @@ -2728,10 +2922,27 @@ def parseNodeCtxt(self, ctxt, node: gs.Node, channels_first: bool = True): self.operatorRepresentation[outputs[0]] = ctxt.lookup(node.outputs[0].name).name input_shape = ctxt.lookup(node.inputs[0].name).shape - # Save input shape information + param_shape = ctxt.lookup(node.inputs[1].name).shape + + if len(input_shape) >= 2 and len(param_shape) >= 1: + channel_count = param_shape[0] + channels_first_match = input_shape[1] == channel_count + channels_last_match = input_shape[-1] == channel_count + + # Prefer explicit evidence from the BN parameter length when the + # default layout would otherwise be ambiguous for mixed-layout graphs. + if channels_first_match != channels_last_match: + channels_first = channels_first_match + + # BatchNorm runs on flattened [N, C, L] for channels-first and [N, L, C] + # for channels-last, where L covers all spatial positions. self.operatorRepresentation['batch_size'] = input_shape[0] - self.operatorRepresentation['channel_size'] = input_shape[1] - self.operatorRepresentation['window_size'] = input_shape[2] + self.operatorRepresentation['channel_size'] = input_shape[1] if channels_first else input_shape[-1] + if channels_first: + self.operatorRepresentation['window_size'] = int(np.prod(input_shape[2:])) + else: + self.operatorRepresentation['window_size'] = int(np.prod(input_shape[1:-1])) + self.operatorRepresentation['channels_first'] = int(channels_first) return ctxt, True @@ -2783,8 +2994,8 @@ def parseNodeCtxt(self, ctxt: NetworkContext, node: gs.Node, channels_first: boo stride_x, stride_y = 1, 1 if "strides" in node.attrs: - stride_y = node.attrs["strides"][0] - stride_x = node.attrs["strides"][1] if len(node.attrs["strides"]) > 1 else stride_y + stride_x = node.attrs["strides"][0] + stride_y = node.attrs["strides"][1] if len(node.attrs["strides"]) > 1 else stride_x self.operatorRepresentation["stride_y"] = stride_y self.operatorRepresentation["stride_x"] = stride_x @@ -2865,6 +3076,80 @@ def parseNodeCtxt(self, return ctxt, False +class ConvTranspose2DParser(ConvTransposeParser): + + def __init__(self): + super().__init__() + + def parseNode(self, node: gs.Node) -> bool: + # 2D ConvTranspose expects 4D input/output and 4D weight + wellFormed = super().parseNode(node) + ret = False + if wellFormed: + ret = all([ + # Make sure strides are 2D + len(node.attrs['strides']) == 2, + len(node.attrs['pads']) == 4, + len(node.attrs['dilations']) == 2, + ]) + if ret: + + self.operatorRepresentation['kernel_shape'] = node.attrs['kernel_shape'] + self.operatorRepresentation['dim_kernel_x'] = int(self.operatorRepresentation['kernel_shape'][0]) + self.operatorRepresentation['dim_kernel_y'] = int(self.operatorRepresentation['kernel_shape'][1]) + self.operatorRepresentation['dilation_x'] = int(self.operatorRepresentation['dilations'][0]) + self.operatorRepresentation['dilation_y'] = int(self.operatorRepresentation['dilations'][1]) + self.operatorRepresentation['padding_x'] = int(self.operatorRepresentation['pads'][0]) + self.operatorRepresentation['padding_y'] = int(self.operatorRepresentation['pads'][1]) + self.operatorRepresentation['stride_x'] = int(self.operatorRepresentation['strides'][0]) + self.operatorRepresentation['stride_y'] = int(self.operatorRepresentation['strides'][1]) + + return ret + + def parseNodeCtxt(self, + ctxt: NetworkContext, + node: gs.Node, + channels_first: bool = True) -> Tuple[NetworkContext, bool]: + + newCtxt, ret = super().parseNodeCtxt(ctxt, node, channels_first) + + if ret: + data_in = newCtxt.lookup(node.inputs[0].name) + data_out = newCtxt.lookup(node.outputs[0].name) + in_shape = data_in.shape + out_shape = data_out.shape + weight = newCtxt.lookup(node.inputs[1].name) + + if len(in_shape) != 4 or len(out_shape) != 4 or len(weight.shape) != 4: + return ctxt, False + + self.operatorRepresentation['batch'] = in_shape[0] + + if channels_first: + self.operatorRepresentation['ch_im_in'] = in_shape[1] + self.operatorRepresentation['dim_im_in_x'] = in_shape[2] + self.operatorRepresentation['dim_im_in_y'] = in_shape[3] + self.operatorRepresentation['ch_im_out'] = out_shape[1] + self.operatorRepresentation['dim_im_out_x'] = out_shape[2] + self.operatorRepresentation['dim_im_out_y'] = out_shape[3] + else: + self.operatorRepresentation['ch_im_in'] = in_shape[3] + self.operatorRepresentation['dim_im_in_x'] = in_shape[1] + self.operatorRepresentation['dim_im_in_y'] = in_shape[2] + self.operatorRepresentation['ch_im_out'] = out_shape[3] + self.operatorRepresentation['dim_im_out_x'] = out_shape[1] + self.operatorRepresentation['dim_im_out_y'] = out_shape[2] + + self.operatorRepresentation["batchOffsetIn"] = (self.operatorRepresentation["ch_im_in"] * + self.operatorRepresentation["dim_im_in_x"] * + self.operatorRepresentation["dim_im_in_y"]) + self.operatorRepresentation["batchOffsetOut"] = (self.operatorRepresentation["ch_im_out"] * + self.operatorRepresentation["dim_im_out_x"] * + self.operatorRepresentation["dim_im_out_y"]) + return newCtxt, True + return ctxt, False + + class SqrtParser(NodeParser): def __init__(self): diff --git a/Deeploy/Targets/Generic/Platform.py b/Deeploy/Targets/Generic/Platform.py index e05e897270..c0066bd78c 100644 --- a/Deeploy/Targets/Generic/Platform.py +++ b/Deeploy/Targets/Generic/Platform.py @@ -7,26 +7,27 @@ from Deeploy.DeeployTypes import ConstantBuffer, DeploymentEngine, DeploymentPlatform, NodeMapper, NodeTemplate, \ StructBuffer, TopologyOptimizer, TransientBuffer, VariableBuffer from Deeploy.Targets.Generic.Bindings import BasicAddBindings, BasicBatchNormBindings, BasicConcatBindings, \ - BasicConv1DBindings, BasicConv2DBindings, BasicConvTransposeBindings, BasicDebugPrintBindings, \ - BasicDequantBindings, BasicDivBindings, BasicDWConv1DBinding, BasicDWConv2DBindings, BasicGatherBindings, \ - BasicGELUBindings, BasicGEMMBindings, BasicITAPartialSoftmaxBinding, BasicITASoftmaxBinding, \ + BasicConv1DBindings, BasicConv2DBindings, BasicConvTranspose1DBindings, BasicConvTranspose2DBindings, \ + BasicDebugPrintBindings, BasicDequantBindings, BasicDivBindings, BasicDWConv1DBinding, BasicDWConv2DBindings, \ + BasicGatherBindings, BasicGELUBindings, BasicGEMMBindings, BasicITAPartialSoftmaxBinding, BasicITASoftmaxBinding, \ BasicLayerNormBindings, BasicMatMulBindings, BasicMaxPool1DBindings, BasicMaxPool2DBindings, BasicMulBindings, \ - BasicPad1DBindings, BasicPad2DBindings, BasicPowBindings, BasicQuantBindings, BasicReduceMeanBindings, \ - BasicReduceSumBindings, BasicReluBinding, BasicReshapeBindings, BasicRQIntegerDivBinding, BasicRQSBindings, \ - BasicRQSGELUBinding, BasicSliceBindings, BasicSoftmaxBindings, BasicSqrtBindings, BasicTransposeBindings, \ - DummyBinding + BasicPad1DBindings, BasicPad2DBindings, BasicPowBindings, BasicQuantBindings, BasicReduceLogSumExpBindings, \ + BasicReduceMeanBindings, BasicReduceSumBindings, BasicReluBinding, BasicReshapeBindings, BasicRQIntegerDivBinding, \ + BasicRQSBindings, BasicRQSGELUBinding, BasicSliceBindings, BasicSoftmaxBindings, BasicSqrtBindings, \ + BasicTransposeBindings, DummyBinding from Deeploy.Targets.Generic.Layers import AddLayer, BatchNormalizationLayer, ConcatLayer, ConvLayer, \ ConvTransposeLayer, DebugPrintLayer, DequantLayer, DivLayer, GatherLayer, GELULayer, GEMMLayer, ITAMaxLayer, \ - LayerNormLayer, MatMulLayer, MaxPoolLayer, MulLayer, PadLayer, PowLayer, QuantLayer, ReduceMeanLayer, \ - ReduceSumLayer, ReluLayer, RequantShiftLayer, ReshapeLayer, RQIntegerDivLayer, RQSiGELULayer, SliceLayer, \ - SoftmaxLayer, SqrtLayer, TransposeLayer + LayerNormLayer, MatMulLayer, MaxPoolLayer, MulLayer, PadLayer, PowLayer, QuantLayer, ReduceLogSumExpLayer, \ + ReduceMeanLayer, ReduceSumLayer, ReluLayer, RequantShiftLayer, ReshapeLayer, RQIntegerDivLayer, RQSiGELULayer, \ + SliceLayer, SoftmaxLayer, SqrtLayer, TransposeLayer from Deeploy.Targets.Generic.Parsers import AddParser, BatchNormParser, ConcatParser, ConvTranspose1DParser, \ - DebugParser, DequantParser, DivParser, DummyParser, FlattenParser, GatherParser, GELUParser, GenericConv1DParser, \ - GenericConv2DParser, GenericDWConv1DParser, GenericDWConv2DParser, GenericGEMMParser, GenericMaxPool2DParser, \ - IntegerDivParser, ITAMaxParser, ITAPartialMaxParser, LayerNormParser, MatMulParser, MaxPool1DParser, MulParser, \ - Pad1DParser, Pad2DParser, PowParser, QuantParser, ReduceMeanParser, ReduceSumParser, ReluParser, \ - RequantShiftParser, ReshapeParser, RQIntegerDivParser, RQSiGELUParser, SliceParser, SoftmaxParser, SqrtParser, \ - TransposeParser, UnsqueezeParser, iLayerNormParser, iSoftmaxParser + ConvTranspose2DParser, DebugParser, DequantParser, DivParser, DummyParser, FlattenParser, GatherParser, \ + GELUParser, GenericConv1DParser, GenericConv2DParser, GenericDWConv1DParser, GenericDWConv2DParser, \ + GenericGEMMParser, GenericMaxPool2DParser, IntegerDivParser, ITAMaxParser, ITAPartialMaxParser, LayerNormParser, \ + MatMulParser, MaxPool1DParser, MulParser, Pad1DParser, Pad2DParser, PowParser, QuantParser, ReduceLogSumExpParser, \ + ReduceMeanParser, ReduceSumParser, ReluParser, RequantShiftParser, ReshapeParser, RQIntegerDivParser, \ + RQSiGELUParser, SliceParser, SoftmaxParser, SqrtParser, TransposeParser, UnsqueezeParser, iLayerNormParser, \ + iSoftmaxParser from Deeploy.Targets.Generic.Templates import AllocateTemplate, FreeTemplate from Deeploy.Targets.Generic.TopologyOptimizationPasses.Passes import DequantPatternPass, ExtractPaddingFromConvPass, \ ExtractPaddingFromPoolPass, MatMulAddMergePass, MergeConstAddAndRequantPass, QuantPatternPass, \ @@ -58,6 +59,7 @@ Pad1DMapper = NodeMapper(Pad1DParser(), BasicPad1DBindings) Pad2DMapper = NodeMapper(Pad2DParser(), BasicPad2DBindings) ReduceMeanMapper = NodeMapper(ReduceMeanParser(), BasicReduceMeanBindings) +ReduceLogSumExpMapper = NodeMapper(ReduceLogSumExpParser(), BasicReduceLogSumExpBindings) ReduceSumMapper = NodeMapper(ReduceSumParser(), BasicReduceSumBindings) ReluMapper = NodeMapper(ReluParser(), [BasicReluBinding]) RequantShiftMapper = NodeMapper(RequantShiftParser(), BasicRQSBindings) @@ -71,7 +73,8 @@ QuantMapper = NodeMapper(QuantParser(), BasicQuantBindings) DequantMapper = NodeMapper(DequantParser(), BasicDequantBindings) BatchNormalizationMapper = NodeMapper(BatchNormParser(), BasicBatchNormBindings) -ConvTransposeMapper = NodeMapper(ConvTranspose1DParser(), BasicConvTransposeBindings) +ConvTranspose1DMapper = NodeMapper(ConvTranspose1DParser(), BasicConvTranspose1DBindings) +ConvTranspose2DMapper = NodeMapper(ConvTranspose2DParser(), BasicConvTranspose2DBindings) SliceMapper = NodeMapper(SliceParser(), BasicSliceBindings) # Dummy nodes are intended for development purposes only! @@ -105,6 +108,7 @@ 'Sqrt': SqrtLayer([SqrtMapper]), 'Pad': PadLayer([Pad1DMapper, Pad2DMapper]), 'ReduceMean': ReduceMeanLayer([ReduceMeanMapper]), + 'ReduceLogSumExp': ReduceLogSumExpLayer([ReduceLogSumExpMapper]), 'ReduceSum': ReduceSumLayer([ReduceSumMapper]), 'Relu': ReluLayer([ReluMapper]), 'RequantizediGELU': RQSiGELULayer([RQGELUMapper]), @@ -118,7 +122,7 @@ 'Quant': QuantLayer([QuantMapper]), 'Dequant': DequantLayer([DequantMapper]), 'BatchNormalization': BatchNormalizationLayer([BatchNormalizationMapper]), - 'ConvTranspose': ConvTransposeLayer([ConvTransposeMapper]) + 'ConvTranspose': ConvTransposeLayer([ConvTranspose1DMapper, ConvTranspose2DMapper]) # # For example, you can use the DummpyMapper, in case you want to test # # deployment or optimizations with GlobalAveragePool nodes but did not yet # # implement the corresponding kernel diff --git a/Deeploy/Targets/Generic/Templates/BatchNormalizationTemplate.py b/Deeploy/Targets/Generic/Templates/BatchNormalizationTemplate.py index 5377c91ca0..0233a36c96 100644 --- a/Deeploy/Targets/Generic/Templates/BatchNormalizationTemplate.py +++ b/Deeploy/Targets/Generic/Templates/BatchNormalizationTemplate.py @@ -9,7 +9,7 @@ BEGIN_SINGLE_CORE BatchNorm_fp32( ${data_in}, ${scale}, ${bias}, ${mean}, ${variance}, - ${data_out}, ${batch_size}, ${channel_size}, ${window_size} + ${data_out}, ${batch_size}, ${channel_size}, ${window_size}, ${epsilon}, ${channels_first} ); END_SINGLE_CORE """) diff --git a/Deeploy/Targets/Generic/Templates/ConvTransposeTemplate.py b/Deeploy/Targets/Generic/Templates/ConvTransposeTemplate.py index 9bf864c91f..73bb130f78 100644 --- a/Deeploy/Targets/Generic/Templates/ConvTransposeTemplate.py +++ b/Deeploy/Targets/Generic/Templates/ConvTransposeTemplate.py @@ -4,8 +4,9 @@ from Deeploy.DeeployTypes import NodeTemplate -referenceTemplate = NodeTemplate(""" +reference1DTemplate = NodeTemplate(""" <% +bias_ptr = bias if 'bias' in locals() else "NULL" batchOffsetIn = ch_im_in * dim_im_in_y batchOffsetOut = ch_im_out * dim_im_out_y %> @@ -20,7 +21,7 @@ ref_${data_out}_${data_in}, ${ch_im_in}, ${dim_im_in_y}, ${weight}, ${ch_im_out}, ${dim_kernel_y}, ${stride_y}, - ${bias}, ${has_bias}, + ${bias_ptr}, ${has_bias}, ref_${data_out}_${data_out}, ${dim_im_out_y} ); @@ -29,3 +30,30 @@ } END_SINGLE_CORE """) + +reference2DTemplate = NodeTemplate(""" +<% +bias_ptr = bias if 'bias' in locals() else "NULL" +batchOffsetIn = ch_im_in * dim_im_in_x * dim_im_in_y +batchOffsetOut = ch_im_out * dim_im_out_x * dim_im_out_y +%> + +// 2D Transposed Conv (Name: ${nodeName}, Op: ${nodeOp}) +BEGIN_SINGLE_CORE + ${data_in_type.typeName} ref_${data_out}_${data_in} = ${data_in}; + ${data_out_type.typeName} ref_${data_out}_${data_out} = ${data_out}; + + for (uint32_t n=0; n<${batch}; ++n) { + ConvTranspose2d_fp32( + ref_${data_out}_${data_in}, ${ch_im_in}, ${dim_im_in_x}, ${dim_im_in_y}, + ${weight}, ${ch_im_out}, ${dim_kernel_x}, ${dim_kernel_y}, + ${stride_x}, ${stride_y}, + ${bias_ptr}, ${has_bias}, + ref_${data_out}_${data_out}, ${dim_im_out_x}, ${dim_im_out_y} + ); + + ref_${data_out}_${data_in} += ${batchOffsetIn}; + ref_${data_out}_${data_out} += ${batchOffsetOut}; + } +END_SINGLE_CORE +""") diff --git a/Deeploy/Targets/Generic/Templates/FloatMulTemplate.py b/Deeploy/Targets/Generic/Templates/FloatMulTemplate.py index 3c8c2da501..0df2e901a0 100644 --- a/Deeploy/Targets/Generic/Templates/FloatMulTemplate.py +++ b/Deeploy/Targets/Generic/Templates/FloatMulTemplate.py @@ -8,7 +8,7 @@ // Float Mul (Name: ${nodeName}, Op: ${nodeOp}) BEGIN_SINGLE_CORE for (uint32_t i=0;i<${size};i++){ - ${C}[i] = ${A}[i] * ${B}[0]; + ${C}[i] = ${A}[i] * ${B}[${0 if sizeB == 1 else 'i'}]; } END_SINGLE_CORE """) diff --git a/Deeploy/Targets/Generic/Templates/FloatReduceLogSumExpTemplate.py b/Deeploy/Targets/Generic/Templates/FloatReduceLogSumExpTemplate.py new file mode 100644 index 0000000000..98d04cfaf6 --- /dev/null +++ b/Deeploy/Targets/Generic/Templates/FloatReduceLogSumExpTemplate.py @@ -0,0 +1,10 @@ +# SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from Deeploy.DeeployTypes import NodeTemplate + +referenceTemplate = NodeTemplate(""" +// ReduceLogSumExp (Name: ${nodeName}, Op: ${nodeOp}) +SINGLE_CORE ReduceLogSumExp_fp32_fp32(${data_in}, ${data_out}, ${outerSize}, ${axisLength}, ${innerSize}); +""") diff --git a/Deeploy/Targets/Generic/TileConstraints/UntiledTileConstraint.py b/Deeploy/Targets/Generic/TileConstraints/UntiledTileConstraint.py index 091cb55a41..0e24d51635 100644 --- a/Deeploy/Targets/Generic/TileConstraints/UntiledTileConstraint.py +++ b/Deeploy/Targets/Generic/TileConstraints/UntiledTileConstraint.py @@ -10,11 +10,23 @@ from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint from Deeploy.TilingExtension.TileConstraint import TileConstraint from Deeploy.TilingExtension.TilerModel import TilerModel -from Deeploy.TilingExtension.TilingCodegen import AbsoluteHyperRectangle, TilingSchedule, VariableReplacementScheme +from Deeploy.TilingExtension.TilingCodegen import AbsoluteHyperRectangle, HyperRectangle, TilingSchedule, \ + VariableReplacementScheme class UntiledTileConstraint(TileConstraint): + @staticmethod + def _normalizedShape(shape) -> Tuple[int, ...]: + if isinstance(shape, int): + return (shape,) + + normalized = tuple(shape) + if len(normalized) == 0: + return (1,) + + return normalized + @staticmethod def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: @@ -35,7 +47,8 @@ def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: Netw tilerModel.addTensorDimToModel(ctxt, tensorName) - for idx, shapeDim in enumerate(_buffer.shape): + shape = [_buffer.shape] if isinstance(_buffer.shape, int) else _buffer.shape + for idx, shapeDim in enumerate(shape): tilerModel.addConstraint(tilerModel.getTensorDimVar(tensorName = tensorName, dimIdx = idx) == shapeDim) return tilerModel @@ -53,22 +66,55 @@ def serializeTilingSolution( cls, tilingSolution: NodeMemoryConstraint, absoluteOutputCubes: List[AbsoluteHyperRectangle], targetMemLevel: str, ctxt: NetworkContext, operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, TilingSchedule]: - outputCubes = [cube.rectangle for cube in absoluteOutputCubes] - - schedule = TilingSchedule({}, {}, [], []) repScheme = VariableReplacementScheme({}, {}) + inputLoadSchedule: List[Dict[str, HyperRectangle]] = [{}] + outputLoadSchedule: List[Dict[str, HyperRectangle]] = [{}] + inputBaseOffsets: Dict[str, List[int]] = {} + outputBaseOffsets: Dict[str, List[int]] = {} - for key, value in tilingSolution.tensorMemoryConstraints.items(): + addrNames: List[str] = [] + for key, value in operatorRepresentation.items(): + if not isinstance(value, str): + continue + + if value not in tilingSolution.tensorMemoryConstraints: + continue - assert len(value.memoryConstraints.keys()) == 1, f"{cls} should be untiled, but {value} is tiled!" + _buffer = ctxt.lookup(value) + if isinstance(_buffer, TransientBuffer): + continue - memKey = list(value.memoryConstraints.keys())[0] - memValue = value.memoryConstraints[memKey] + addrNames.append(key) + for key, value in tilingSolution.tensorMemoryConstraints.items(): _buffer = ctxt.lookup(key) if isinstance(_buffer, TransientBuffer): continue - assert memValue.shape == tuple(_buffer.shape) + fullShape = cls._normalizedShape(_buffer.shape) + memoryConstraints = list(value.memoryConstraints.values()) + + # Untiled tensors may still be materialized on multiple memory levels when the same + # full-shape buffer is replicated along the memory path (for example, an untiled Slice + # fed by an L3-backed producer). This is not true tiling and should remain acceptable + # here. We only reject the case where different memory levels carry different shapes. + assert all(cls._normalizedShape(memValue.shape) == fullShape for memValue in memoryConstraints), \ + f"{cls} should be untiled, but {value} carries multiple shapes across memory levels!" + + if len(addrNames) > 0: + inputBaseOffsets, outputBaseOffsets = cls.extractBaseAddr(tilingSolution, targetMemLevel, + operatorRepresentation, addrNames) + + for addrName in inputBaseOffsets: + buffer = ctxt.lookup(operatorRepresentation[addrName]) + shape = cls._normalizedShape(buffer.shape) + inputLoadSchedule[0][addrName] = HyperRectangle((0,) * len(shape), shape) + + for addrName in outputBaseOffsets: + buffer = ctxt.lookup(operatorRepresentation[addrName]) + shape = cls._normalizedShape(buffer.shape) + outputLoadSchedule[0][addrName] = HyperRectangle((0,) * len(shape), shape) + + schedule = TilingSchedule(inputBaseOffsets, outputBaseOffsets, inputLoadSchedule, outputLoadSchedule) return repScheme, schedule diff --git a/Deeploy/Targets/Generic/TopologyOptimizationPasses/Passes.py b/Deeploy/Targets/Generic/TopologyOptimizationPasses/Passes.py index 146bcf699e..4841187abe 100644 --- a/Deeploy/Targets/Generic/TopologyOptimizationPasses/Passes.py +++ b/Deeploy/Targets/Generic/TopologyOptimizationPasses/Passes.py @@ -478,7 +478,12 @@ def _merge_matmul_add_fun(graph: gs.Graph, match: Match, name: str): matched_nodes = [m for k, m in match.nodes_map.items()] gemm = matched_nodes[0] add = matched_nodes[1] - _bias = add.inputs[0] if isinstance(add.inputs[0], gs.Constant) else add.inputs[1] + + constant_inputs = [inp for inp in add.inputs if isinstance(inp, gs.Constant)] + if len(constant_inputs) != 1: + return graph + + _bias = constant_inputs[0] _inputs = gemm.inputs + [_bias] _outputs = add.outputs diff --git a/Deeploy/Targets/Generic/TypeCheckers.py b/Deeploy/Targets/Generic/TypeCheckers.py index c2c8d436f8..6834ed8975 100644 --- a/Deeploy/Targets/Generic/TypeCheckers.py +++ b/Deeploy/Targets/Generic/TypeCheckers.py @@ -341,6 +341,20 @@ def _inferSignedness(self, inputs: List[VariableBuffer], return [False] +class ReduceLogSumExpChecker(SignPropTypeChecker): + + def __init__(self, input_types: Sequence[Type[Pointer]], output_types: Sequence[Type[Pointer]]): + super().__init__(input_types, output_types) + + def _inferNumLevels(self, inputs: List[VariableBuffer], + operatorRepresentation: OperatorRepresentation) -> List[int]: + return [2**(self.input_types[0].referencedType.typeWidth)] + + def _inferSignedness(self, inputs: List[VariableBuffer], + operatorRepresentation: OperatorRepresentation) -> List[bool]: + return [True] + + class ReluChecker(SignPropTypeChecker): def __init__(self, input_types: Sequence[Type[Pointer]], output_types: Sequence[Type[Pointer]]): diff --git a/Deeploy/Targets/Neureka/Engine.py b/Deeploy/Targets/Neureka/Engine.py index 2585b1a688..d7745bd11c 100644 --- a/Deeploy/Targets/Neureka/Engine.py +++ b/Deeploy/Targets/Neureka/Engine.py @@ -2,8 +2,9 @@ # # SPDX-License-Identifier: Apache-2.0 -from typing import List +from typing import Any, List +import numpy as np import onnx_graphsurgeon as gs from Deeploy.DeeployTypes import DeploymentEngine, NodeMapper @@ -53,30 +54,95 @@ def __init__(self, self.enable3x3 = enable3x3 self.enableStrides = enableStrides - def isDenseConv(self, node) -> bool: + @staticmethod + def _isSupportedConvNode(node: gs.Node) -> bool: + # Common N-EUREKA preconditions for all convolution flavors. Keep this + # structural: engine coloring runs before Deeploy has reliable type info. return node.op in ["Conv", "RequantizedConv"] and \ + len(node.inputs) > 1 and \ isinstance(node.inputs[1], gs.Constant) and \ - node.attrs['kernel_shape'] == [3, 3] and \ - node.attrs['dilations'] == [1, 1] and \ - node.attrs['group'] == 1 and \ - (node.attrs['strides'] == [1, 1] or self.enableStrides) + node.attrs.get('dilations') == [1, 1] + + def _hasSupportedStrides(self, node: gs.Node) -> bool: + # Strided convolutions are opt-in because not every N-EUREKA setup enables + # them, while unit strides are always supported. + return node.attrs.get('strides') == [1, 1] or self.enableStrides + + @staticmethod + def _isIntegerDtype(dtype: Any) -> bool: + # ONNX-GraphSurgeon may expose either numpy dtypes/classes or plain ONNX + # enum integers depending on the graph loading path. Treat unknown or + # missing dtypes as inconclusive instead of rejecting the node outright. + if dtype is None: + return False + + try: + return np.issubdtype(dtype, np.integer) + except TypeError: + return dtype in { + 2, # UINT8 + 3, # INT8 + 4, # UINT16 + 5, # INT16 + 6, # INT32 + 7, # INT64 + 12, # UINT32 + 13, # UINT64 + } + + @classmethod + def _hasIntegerTensorType(cls, node: gs.Node) -> bool: + # Prefer real tensor metadata when it is available. This catches already + # integer-typed ONNX graphs without depending on exporter-specific names. + return any(cls._isIntegerDtype(getattr(tensor, "dtype", None)) for tensor in [*node.inputs, *node.outputs]) + + @staticmethod + def _hasQuantizedProvenance(node: gs.Node) -> bool: + # Some quantized Deeploy/PACT graphs still carry FLOAT ONNX annotations + # before type inference. In that case the stable signal is the provenance + # left by quantization/integerization passes on tensors or attributes. + quantizedMarkers = ("INTEGERIZE", "QUANT", "REQUANT", "PACT") + names = [getattr(tensor, "name", "") for tensor in [*node.inputs, *node.outputs]] + attrNames = [str(name) for name in node.attrs.keys()] + return any(marker in name.upper() for name in [*names, *attrNames] for marker in quantizedMarkers) + + @classmethod + def _hasNeurekaCompatibleSemantics(cls, node: gs.Node) -> bool: + # RequantizedConv is produced only after a quantized convolution/requant + # pattern was merged, so its op already carries the integer semantics that + # N-EUREKA expects. Plain Conv needs an additional signal; otherwise FP32 + # convolutions from mixed models would be colored for N-EUREKA and fail + # later in parsing/binding. + if node.op == "RequantizedConv": + return True + + return cls._hasIntegerTensorType(node) or cls._hasQuantizedProvenance(node) + + def isDenseConv(self, node) -> bool: + return self._isSupportedConvNode(node) and \ + self._hasNeurekaCompatibleSemantics(node) and \ + node.attrs.get('kernel_shape') == [3, 3] and \ + node.attrs.get('group', 1) == 1 and \ + self._hasSupportedStrides(node) def isPWConv(self, node) -> bool: - return node.op in ["Conv", "RequantizedConv"] and \ - isinstance(node.inputs[1], gs.Constant) and \ - node.attrs['kernel_shape'] == [1, 1] and \ - node.attrs['dilations'] == [1, 1] and \ - (node.attrs['strides'] == [1, 1] or self.enableStrides) + return self._isSupportedConvNode(node) and \ + self._hasNeurekaCompatibleSemantics(node) and \ + node.attrs.get('kernel_shape') == [1, 1] and \ + self._hasSupportedStrides(node) def isDWConv(self, node) -> bool: - return node.op in ["Conv", "RequantizedConv"] and \ - isinstance(node.inputs[1], gs.Constant) and \ - node.attrs['kernel_shape'] == [3, 3] and \ - node.attrs['dilations'] == [1, 1] and \ - node.attrs['group'] != 1 and \ - (node.attrs['strides'] == [1, 1] or self.enableStrides) + return self._isSupportedConvNode(node) and \ + self._hasNeurekaCompatibleSemantics(node) and \ + node.attrs.get('kernel_shape') == [3, 3] and \ + node.attrs.get('group', 1) != 1 and \ + self._hasSupportedStrides(node) def canExecute(self, node: gs.Node) -> bool: + # Engine coloring runs before Deeploy type inference, and ONNX dtype + # annotations are not reliable for every quantized graph. Still, N-EUREKA + # is an integer accelerator, so the coloring must avoid pure FP Conv + # nodes and let the fallback engine handle them. if self.enable3x3: return self.isPWConv(node) or self.isDWConv(node) or self.isDenseConv(node) else: diff --git a/Deeploy/Targets/Neureka/Platform.py b/Deeploy/Targets/Neureka/Platform.py index e83f7f20f4..af8762c595 100644 --- a/Deeploy/Targets/Neureka/Platform.py +++ b/Deeploy/Targets/Neureka/Platform.py @@ -6,6 +6,7 @@ import onnx_graphsurgeon as gs +from Deeploy.AbstractDataTypes import PointerClass, VoidType from Deeploy.CommonExtensions.OptimizationPasses.TopologyOptimizationPasses.LoweringOptimizationPasses import \ RequantizedGemmToPwPass from Deeploy.DeeployTypes import ConstantBuffer, NetworkContext, NodeTemplate, TopologyOptimizer @@ -27,6 +28,12 @@ class NeurekaConstantBuffer(ConstantBuffer): allocTemplate = NodeTemplate("") deallocTemplate = NodeTemplate("") + def __init__(self, name: str = '', shape = [1], values = [0]): + super().__init__(name, shape, values) + # Some Neureka lowering paths inspect global constants before type inference + # has populated them with the final pointer type. + self._type = PointerClass(VoidType) + def _bufferRepresentation(self): operatorRepresentation = super()._bufferRepresentation() operatorRepresentation["_memoryLevel"] = getattr(self, "_memoryLevel", None) diff --git a/Deeploy/Targets/PULPOpen/Bindings.py b/Deeploy/Targets/PULPOpen/Bindings.py index 2c78978e23..9882efe568 100644 --- a/Deeploy/Targets/PULPOpen/Bindings.py +++ b/Deeploy/Targets/PULPOpen/Bindings.py @@ -15,12 +15,14 @@ from Deeploy.FutureExtension.Bindings.AutoFutureBinding import AutoFutureBinding from Deeploy.FutureExtension.CodeTransformationPasses.FutureCodeTransformation import FutureGeneration from Deeploy.MemoryLevelExtension.CodeTransformationPasses.Closure import MemoryAwareClosureGeneration -from Deeploy.Targets.Generic.Templates import AddTemplate, ConcatTemplate, DequantTemplate, FloatReduceSumTemplate, \ - GatherTemplate, QuantTemplate, RQSiGELUTemplate, SliceTemplate, iHardswishTemplate -from Deeploy.Targets.Generic.TypeCheckers import AddChecker, ConcatChecker, ConvChecker, DequantChecker, \ - GatherChecker, GELUChecker, GEMMChecker, HardswishChecker, LayerNormChecker, MatMulChecker, MulChecker, \ - QuantChecker, ReduceMeanChecker, ReluChecker, ReshapeChecker, RQAddChecker, RQHardswishChecker, SGDChecker, \ - SliceChecker, SoftmaxChecker, SoftmaxCrossEntropyLossChecker, TransposeChecker +from Deeploy.Targets.Generic.Templates import AddTemplate, BatchNormalizationTemplate, ConcatTemplate, \ + DequantTemplate, FloatPadTemplate, FloatReduceLogSumExpTemplate, FloatReduceSumTemplate, GatherTemplate, \ + PadTemplate, QuantTemplate, RQSiGELUTemplate, SliceTemplate, iHardswishTemplate +from Deeploy.Targets.Generic.TypeCheckers import AddChecker, BatchNormChecker, ConcatChecker, ConvChecker, \ + DequantChecker, GatherChecker, GELUChecker, GEMMChecker, HardswishChecker, LayerNormChecker, MatMulChecker, \ + MulChecker, PadChecker, QuantChecker, ReduceLogSumExpChecker, ReduceMeanChecker, ReluChecker, ReshapeChecker, \ + RQAddChecker, RQHardswishChecker, SGDChecker, SliceChecker, SoftmaxChecker, SoftmaxCrossEntropyLossChecker, \ + TransposeChecker from Deeploy.Targets.PULPOpen.CodeTransformationPasses.PULPClusterSynch import PULPSynchCoresPass from Deeploy.Targets.PULPOpen.CodeTransformationPasses.PULPClusterTiling import PULPClusterTiling from Deeploy.Targets.PULPOpen.CodeTransformationPasses.PULPL3Tiling import PULPL3Tiling @@ -29,12 +31,13 @@ from Deeploy.Targets.PULPOpen.DataTypes import PULPDMAFuture from Deeploy.Targets.PULPOpen.DMA.L3Dma import l3DmaHack from Deeploy.Targets.PULPOpen.DMA.MchanDma import MchanDma -from Deeploy.Targets.PULPOpen.Templates import ConvTemplate, DMASliceTemplate, FloatAddTemplate, FloatConvTemplate, \ - FloatGELUTemplate, FloatGemmTemplate, FloatLayernormTemplate, FloatMatMulTemplate, FloatMaxPoolTemplate, \ - FloatMulTemplate, FloatReduceMeanTemplate, FloatReluTemplate, FloatSoftmaxTemplate, GEMMTemplate, \ - MatrixVectorTemplate, MaxPoolTemplate, MulTemplate, ReduceMeanTemplate, RequantShiftTemplate, ReshapeTemplate, \ - RQAddTemplate, RQSiHardswishTemplate, SGDTemplate, SoftmaxCrossEntropyLossTemplate, TallGEMMTemplate, \ - TransposeTemplate, UniformRequantShiftTemplate, iRMSNormTemplate, iSoftmaxTemplate +from Deeploy.Targets.PULPOpen.Templates import ConvTemplate, ConvTransposeTemplate, DMASliceTemplate, \ + FloatAddTemplate, FloatConvTemplate, FloatGELUTemplate, FloatGemmTemplate, FloatLayernormTemplate, \ + FloatMatMulTemplate, FloatMaxPoolTemplate, FloatMulTemplate, FloatReduceMeanTemplate, FloatReluTemplate, \ + FloatSoftmaxTemplate, GEMMTemplate, MatrixVectorTemplate, MaxPoolTemplate, MulTemplate, ReduceMeanTemplate, \ + RequantShiftTemplate, ReshapeTemplate, RQAddTemplate, RQSiHardswishTemplate, SGDTemplate, \ + SoftmaxCrossEntropyLossTemplate, TallGEMMTemplate, TransposeTemplate, UniformRequantShiftTemplate, \ + iRMSNormTemplate, iSoftmaxTemplate from Deeploy.Targets.PULPOpen.TypeCheckers import PULPConvChecker, PULPLinearChecker, PULPMaxPoolChecker, \ PULPRequantShiftChecker from Deeploy.TilingExtension.CodeTransformationPasses.TilingVariableReplacement import TilingVariableReplacement, \ @@ -176,6 +179,35 @@ ReshapeTemplate.referenceTemplate, SkipTransformer) for type in IntegerDataTypes + FloatDataTypes ] +PULPBatchNormBindings = [ + NodeBinding( + BatchNormChecker( + [PointerClass(type), + PointerClass(type), + PointerClass(type), + PointerClass(type), + PointerClass(type)], [PointerClass(type)]), BatchNormalizationTemplate.referenceTemplate, ForkTransformer) + for type in FloatDataTypes +] + +PULPPad1DBindings = [ + NodeBinding(PadChecker([PointerClass(type)], [PointerClass(type)]), PadTemplate.reference1DTemplate, + ForkTransformer) for type in IntegerDataTypes +] + [ + NodeBinding(PadChecker([PointerClass(type)], [PointerClass(type)]), FloatPadTemplate.reference1DTemplate, + ForkTransformer) for type in FloatDataTypes +] + +PULPPad2DBindings = [ + NodeBinding(PadChecker([PointerClass(type)], [PointerClass(type)]), PadTemplate.reference2DTemplate, + ForkTransformer) for type in IntegerDataTypes +] + [ + NodeBinding( + PadChecker([PointerClass(float32_t), PointerClass(float32_t), + PointerClass(float32_t)], [PointerClass(float32_t)]), FloatPadTemplate.reference2DTemplate, + ForkTransformer) +] + PULPRQAddBindings = [ NodeBinding(RQAddChecker([PointerClass(_type), PointerClass(_type2)], [PointerClass(_type3)]), RQAddTemplate.referenceTemplate, ForkTransformer) @@ -249,6 +281,15 @@ ForkTransformer) for float_type in FloatDataTypes ] +PULPFloatConvTranspose2DBindings = [ + NodeBinding(ConvChecker( + [PointerClass(type), PointerClass(type), PointerClass(type)], [PointerClass(type)]), + ConvTransposeTemplate.reference2DTemplate, ForkTransformer) for type in FloatDataTypes +] + [ + NodeBinding(ConvChecker([PointerClass(type), PointerClass(type)], [PointerClass(type)]), + ConvTransposeTemplate.reference2DTemplate, ForkTransformer) for type in FloatDataTypes +] + PULPRQSMatrixVecBindings = [ NodeBinding( PULPLinearChecker([PointerClass(type1), @@ -322,6 +363,11 @@ FloatReduceSumTemplate.referenceTemplate, ClusterTransformer) ] +PULPReduceLogSumExpBindings = [ + NodeBinding(ReduceLogSumExpChecker([PointerClass(float32_t)], [PointerClass(float32_t)]), + FloatReduceLogSumExpTemplate.referenceTemplate, ForkTransformer) +] + PULPUniformRQSBindings = [ NodeBinding( PULPRequantShiftChecker([PointerClass(type), PointerClass(int32_t), @@ -382,6 +428,9 @@ PULPConcatBindings = [ NodeBinding(ConcatChecker([PointerClass(type), PointerClass(type)], [PointerClass(type)]), ConcatTemplate.referenceTemplate, ClusterTransformer) for type in IntegerDataTypes +] + [ + NodeBinding(ConcatChecker([PointerClass(float32_t), PointerClass(float32_t)], [PointerClass(float32_t)]), + ConcatTemplate.referenceTemplate, ClusterTransformer) ] PULPiRMSNormBindings = [ diff --git a/Deeploy/Targets/PULPOpen/DMA/L3Dma.py b/Deeploy/Targets/PULPOpen/DMA/L3Dma.py index 6c2aa30811..375ad75958 100644 --- a/Deeploy/Targets/PULPOpen/DMA/L3Dma.py +++ b/Deeploy/Targets/PULPOpen/DMA/L3Dma.py @@ -29,7 +29,7 @@ class L3Dma(AsyncDma): _transferTemplates = { 2: NodeTemplate( - "pi_cl_ram_copy_2d(get_ram_ptr(), ${ext}, ${loc}, ${transfer_size}, ${stride}, ${length}, ${ext2loc}, &${future});" + "pi_cl_ram_copy_2d(get_ram_ptr(), (uint32_t) ${ext}, ${loc}, ${transfer_size}, ${stride}, ${length}, ${ext2loc}, &${future});" ) } _waitingStrategy = PerTensorWaitingStrategy(L3DmaFuture) diff --git a/Deeploy/Targets/PULPOpen/Parsers.py b/Deeploy/Targets/PULPOpen/Parsers.py index 5c5951eaba..b71b0aead6 100644 --- a/Deeploy/Targets/PULPOpen/Parsers.py +++ b/Deeploy/Targets/PULPOpen/Parsers.py @@ -8,8 +8,8 @@ import onnx_graphsurgeon as gs from Deeploy.DeeployTypes import NetworkContext -from Deeploy.Targets.Generic.Parsers import Conv2DParser, GEMMParser, ReduceMeanParser, RQSConv1DParser, \ - RQSConv2DParser, RQSParserInterface +from Deeploy.Targets.Generic.Parsers import Conv2DParser, ConvTranspose2DParser, GEMMParser, ReduceMeanParser, \ + RQSConv1DParser, RQSConv2DParser, RQSParserInterface class PULPConv2DParser(RQSConv2DParser): @@ -90,6 +90,8 @@ def parseNode(self, node: gs.Node) -> (bool): self.operatorRepresentation['padding_x_left'] = int(self.operatorRepresentation['pads'][1]) self.operatorRepresentation['padding_y_bottom'] = int(self.operatorRepresentation['pads'][2]) self.operatorRepresentation['padding_x_right'] = int(self.operatorRepresentation['pads'][3]) + if "n_cores" in node.attrs: + self.operatorRepresentation["n_cores"] = int(node.attrs["n_cores"]) return ret return False @@ -148,6 +150,8 @@ def parseNode(self, node: gs.Node) -> (bool): self.operatorRepresentation['padding_x_left'] = int(self.operatorRepresentation['pads'][1]) self.operatorRepresentation['padding_y_bottom'] = int(self.operatorRepresentation['pads'][2]) self.operatorRepresentation['padding_x_right'] = int(self.operatorRepresentation['pads'][3]) + if "n_cores" in node.attrs: + self.operatorRepresentation["n_cores"] = int(node.attrs["n_cores"]) return ret return False @@ -182,6 +186,51 @@ def parseNodeCtxt(self, return ctxt, False +class PULPConvTranspose2DParser(ConvTranspose2DParser): + + def __init__(self): + super().__init__() + + def parseNode(self, node: gs.Node) -> bool: + wellFormed = super().parseNode(node) + + if wellFormed: + # ConvTranspose kernels on PULP/Siracusa are emitted in CHW layout. + # This must be visible before broadcast() runs, otherwise output shapes + # are reinterpreted as NHWC and the last dimension gets clobbered. + self.operatorRepresentation['channels_first'] = True + self.operatorRepresentation['padding_y_top'] = int(self.operatorRepresentation['pads'][0]) + self.operatorRepresentation['padding_x_left'] = int(self.operatorRepresentation['pads'][1]) + self.operatorRepresentation['padding_y_bottom'] = int(self.operatorRepresentation['pads'][2]) + self.operatorRepresentation['padding_x_right'] = int(self.operatorRepresentation['pads'][3]) + + return wellFormed + + def parseNodeCtxt(self, + ctxt: NetworkContext, + node: gs.Node, + channels_first: bool = True) -> Tuple[NetworkContext, bool]: + if node.attrs.get("channels_first", True) == False: + return ctxt, False + + newCtxt, ret = super().parseNodeCtxt(ctxt, node, True) + + if ret: + self.operatorRepresentation['data_in'] = newCtxt.lookup(node.inputs[0].name).name + self.operatorRepresentation['weight'] = newCtxt.lookup(node.inputs[1].name).name + + if len(node.inputs) == 2: + self.operatorRepresentation["has_bias"] = "false" + self.operatorRepresentation["bias"] = "NULL" + else: + self.operatorRepresentation["has_bias"] = "true" + self.operatorRepresentation["bias"] = newCtxt.lookup(node.inputs[2].name).name + + return newCtxt, True + + return ctxt, False + + class PULPDWConv1DParser(RQSConv1DParser): def __init__(self, noBiasHoisting = True): diff --git a/Deeploy/Targets/PULPOpen/Platform.py b/Deeploy/Targets/PULPOpen/Platform.py index f13e6451fb..fe0fdd737f 100644 --- a/Deeploy/Targets/PULPOpen/Platform.py +++ b/Deeploy/Targets/PULPOpen/Platform.py @@ -11,20 +11,20 @@ NodeTemplate, StructBuffer, TopologyOptimizer, TransientBuffer, VariableBuffer from Deeploy.MemoryLevelExtension.MemoryLevels import MemoryHierarchy, MemoryLevel from Deeploy.MemoryLevelExtension.NetworkDeployers.MemoryLevelDeployer import MemoryPlatform, MemoryPlatformWrapper -from Deeploy.Targets.Generic.Bindings import BasicGEMMBindings, BasicPad1DBindings, BasicPad2DBindings, \ - BasicRQIntegerDivBinding -from Deeploy.Targets.Generic.Layers import AddLayer, ConcatLayer, ConvLayer, GatherLayer, GELUGradLayer, GELULayer, \ - GEMMLayer, LayerNormGradLayer, LayerNormLayer, MatMulLayer, MaxPoolLayer, MulLayer, PadLayer, QuantLayer, \ - ReduceMeanLayer, ReduceSumLayer, ReluLayer, RequantShiftLayer, ReshapeLayer, RQIntegerDivLayer, RQSiGELULayer, \ - RQSiHardswishLayer, SGDLayer, SliceLayer, SoftmaxCrossEntropyLossGradLayer, SoftmaxCrossEntropyLossLayer, \ - SoftmaxGradLayer, SoftmaxLayer, TransposeLayer, iHardswishLayer, iRMSNormLayer -from Deeploy.Targets.Generic.Parsers import AddParser, ConcatParser, DequantParser, FlattenParser, GatherParser, \ - GELUGradParser, GELUParser, GEMMParser, LayerNormGradParser, LayerNormParser, MatMulParser, MaxPool1DParser, \ - MaxPool2DParser, MulParser, Pad1DParser, Pad2DParser, QuantParser, ReduceSumParser, ReluParser, \ - RequantShiftParser, ReshapeParser, RQAddParser, RQIntegerDivParser, RQSiGELUParser, RQSiHardswishParser, \ - SGDParser, SliceParser, SoftmaxCrossEntropyLossGradParser, SoftmaxCrossEntropyLossParser, SoftmaxGradParser, \ - SoftmaxParser, TransposeParser, UniformRequantShiftParser, UnsqueezeParser, iHardswishParser, iRMSNormParser, \ - iSoftmaxParser +from Deeploy.Targets.Generic.Bindings import BasicGEMMBindings, BasicRQIntegerDivBinding +from Deeploy.Targets.Generic.Layers import AddLayer, BatchNormalizationLayer, ConcatLayer, ConvLayer, \ + ConvTransposeLayer, GatherLayer, GELUGradLayer, GELULayer, GEMMLayer, LayerNormGradLayer, LayerNormLayer, \ + MatMulLayer, MaxPoolLayer, MulLayer, PadLayer, QuantLayer, ReduceLogSumExpLayer, ReduceMeanLayer, ReduceSumLayer, \ + ReluLayer, RequantShiftLayer, ReshapeLayer, RQIntegerDivLayer, RQSiGELULayer, RQSiHardswishLayer, SGDLayer, \ + SliceLayer, SoftmaxCrossEntropyLossGradLayer, SoftmaxCrossEntropyLossLayer, SoftmaxGradLayer, SoftmaxLayer, \ + TransposeLayer, iHardswishLayer, iRMSNormLayer +from Deeploy.Targets.Generic.Parsers import AddParser, BatchNormParser, ConcatParser, DequantParser, FlattenParser, \ + GatherParser, GELUGradParser, GELUParser, GEMMParser, LayerNormGradParser, LayerNormParser, MatMulParser, \ + MaxPool1DParser, MaxPool2DParser, MulParser, Pad1DParser, Pad2DParser, QuantParser, ReduceLogSumExpParser, \ + ReduceSumParser, ReluParser, RequantShiftParser, ReshapeParser, RQAddParser, RQIntegerDivParser, RQSiGELUParser, \ + RQSiHardswishParser, SGDParser, SliceParser, SoftmaxCrossEntropyLossGradParser, SoftmaxCrossEntropyLossParser, \ + SoftmaxGradParser, SoftmaxParser, TransposeParser, UniformRequantShiftParser, UnsqueezeParser, iHardswishParser, \ + iRMSNormParser, iSoftmaxParser from Deeploy.Targets.Generic.Templates import AllocateTemplate as BasicAllocateTemplate from Deeploy.Targets.Generic.TopologyOptimizationPasses.Passes import DequantPatternPass, IntegerDivRequantMergePass, \ MergeConstAddAndRequantPass, MergeTrueIntegerDivRequantShiftPass, QuantPatternPass, RQSSplitPass, \ @@ -32,24 +32,25 @@ from Deeploy.Targets.PULPOpen.Bindings import BasicDequantBindings, BasicQuantBindings, PULPDMASliceBindings, \ PULPDWConv1DBinding from Deeploy.Targets.PULPOpen.Layers import PULPRQSConvLayer, PULPRQSGEMMLayer -from Deeploy.Targets.PULPOpen.Parsers import PULPConv1DParser, PULPConv2DParser, PULPDWConv1DParser, \ - PULPDWConv2DParser, PULPFPConv2DParser, PULPFPDWConv2DParser, PULPGEMMParser, PULPMatrixVecParser, \ - PULPReduceMeanParser, PULPTallGEMMParser +from Deeploy.Targets.PULPOpen.Parsers import PULPConv1DParser, PULPConv2DParser, PULPConvTranspose2DParser, \ + PULPDWConv1DParser, PULPDWConv2DParser, PULPFPConv2DParser, PULPFPDWConv2DParser, PULPGEMMParser, \ + PULPMatrixVecParser, PULPReduceMeanParser, PULPTallGEMMParser from Deeploy.Targets.PULPOpen.Templates import AllocateTemplate, FreeTemplate -from Deeploy.Targets.PULPOpen.Tiler import PULPAddTilingReadyBindings, PULPConcatTilingReadyBindings, \ - PULPConv2DTilingReadyBindings, PULPDWConv2DTilingReadyBindings, PULPFlattenTilingReadyBindings, \ - PULPFPGELUGradTilingReadyBindings, PULPFPGELUTilingReadyBindings, PULPFPGEMMTilingReadyBindings, \ - PULPGatherTilingReadyBindings, PULPiHardswishTilingReadyBindings, PULPiRMSNormTilingReadyBindings, \ - PULPiRQSGELUTilingReadyBindings, PULPLayernormGradTilingReadyBindings, PULPLayernormTilingReadyBindings, \ - PULPMatMulTilingReadyBindings, PULPMaxPool1DTilingReadyBindings, PULPMaxPool2DTilingReadyBindings, \ - PULPMulTilingReadyBindings, PULPReduceMeanTilingReadyBindings, PULPReduceSumTilingReadyBindings, \ - PULPReluTilingReadyBindings, PULPRQAddTilingReadyBindings, PULPRQSConv1DTilingReadyBindings, \ - PULPRQSConv2DTilingReadyBindings, PULPRQSDWConv2DTilingReadyBindings, PULPRQSGEMMTilingReadyBindings, \ - PULPRQSiHardswishTilingReadyBindings, PULPRQSMatrixVecTilingReadyBindings, PULPRQSTallGEMMTilingReadyBindings, \ - PULPRQSTilingReadyBindings, PULPSGDTilingReadyBindings, PULPSliceTilingReadyBindings, \ - PULPSoftmaxCrossEntropyGradTilingReadyBindings, PULPSoftmaxCrossEntropyTilingReadyBindings, \ - PULPSoftmaxGradTilingReadyBindings, PULPSoftmaxTilingReadyBindings, PULPTransposeTilingReadyBindings, \ - PULPUniformRQSTilingReadyBindings +from Deeploy.Targets.PULPOpen.Tiler import PULPAddTilingReadyBindings, PULPBatchNormTilingReadyBindings, \ + PULPConcatTilingReadyBindings, PULPConv2DTilingReadyBindings, PULPConvTranspose2DTilingReadyBindings, \ + PULPDWConv2DTilingReadyBindings, PULPFlattenTilingReadyBindings, PULPFPGELUGradTilingReadyBindings, \ + PULPFPGELUTilingReadyBindings, PULPFPGEMMTilingReadyBindings, PULPGatherTilingReadyBindings, \ + PULPiHardswishTilingReadyBindings, PULPiRMSNormTilingReadyBindings, PULPiRQSGELUTilingReadyBindings, \ + PULPLayernormGradTilingReadyBindings, PULPLayernormTilingReadyBindings, PULPMatMulTilingReadyBindings, \ + PULPMaxPool1DTilingReadyBindings, PULPMaxPool2DTilingReadyBindings, PULPMulTilingReadyBindings, \ + PULPPad1DTilingReadyBindings, PULPPad2DTilingReadyBindings, PULPReduceLogSumExpTilingReadyBindings, \ + PULPReduceMeanTilingReadyBindings, PULPReduceSumTilingReadyBindings, PULPReluTilingReadyBindings, \ + PULPRQAddTilingReadyBindings, PULPRQSConv1DTilingReadyBindings, PULPRQSConv2DTilingReadyBindings, \ + PULPRQSDWConv2DTilingReadyBindings, PULPRQSGEMMTilingReadyBindings, PULPRQSiHardswishTilingReadyBindings, \ + PULPRQSMatrixVecTilingReadyBindings, PULPRQSTallGEMMTilingReadyBindings, PULPRQSTilingReadyBindings, \ + PULPSGDTilingReadyBindings, PULPSliceTilingReadyBindings, PULPSoftmaxCrossEntropyGradTilingReadyBindings, \ + PULPSoftmaxCrossEntropyTilingReadyBindings, PULPSoftmaxGradTilingReadyBindings, PULPSoftmaxTilingReadyBindings, \ + PULPTransposeTilingReadyBindings, PULPUniformRQSTilingReadyBindings from Deeploy.Targets.PULPOpen.TopologyOptimizationPasses.Passes import PULPAddRequantMergePass, \ PULPConvRequantMergePass, PULPGEMMRequantMergePass, PULPMatMulRequantMergePass @@ -60,8 +61,8 @@ GELUGradMapper = NodeMapper(GELUGradParser(), PULPFPGELUGradTilingReadyBindings) GatherMapper = NodeMapper(GatherParser(), PULPGatherTilingReadyBindings) MulMapper = NodeMapper(MulParser(), PULPMulTilingReadyBindings) -Pad1DMapper = NodeMapper(Pad1DParser(), BasicPad1DBindings) -Pad2DMapper = NodeMapper(Pad2DParser(), BasicPad2DBindings) +Pad1DMapper = NodeMapper(Pad1DParser(), PULPPad1DTilingReadyBindings) +Pad2DMapper = NodeMapper(Pad2DParser(), PULPPad2DTilingReadyBindings) ReshapeMapper = NodeMapper(ReshapeParser(), PULPFlattenTilingReadyBindings) TransposeMapper = NodeMapper(TransposeParser(), PULPTransposeTilingReadyBindings) UnsqueezeMapper = NodeMapper(UnsqueezeParser(), PULPFlattenTilingReadyBindings) @@ -70,6 +71,7 @@ UniformRequantShiftMapper = NodeMapper(UniformRequantShiftParser(), PULPUniformRQSTilingReadyBindings) ReduceMeanMapper = NodeMapper(PULPReduceMeanParser(), PULPReduceMeanTilingReadyBindings) +ReduceLogSumExpMapper = NodeMapper(ReduceLogSumExpParser(), PULPReduceLogSumExpTilingReadyBindings) ReduceSumMapper = NodeMapper(ReduceSumParser(), PULPReduceSumTilingReadyBindings) MatMulMapper = NodeMapper(MatMulParser(), PULPMatMulTilingReadyBindings) RQIntegerDivMapper = NodeMapper(RQIntegerDivParser(), [BasicRQIntegerDivBinding]) @@ -81,6 +83,7 @@ Conv2DMapper = NodeMapper(PULPConv2DParser(), PULPRQSConv2DTilingReadyBindings) FPDWConv2DMapper = NodeMapper(PULPFPDWConv2DParser(), PULPDWConv2DTilingReadyBindings) DWConv2DMapper = NodeMapper(PULPDWConv2DParser(), PULPRQSDWConv2DTilingReadyBindings) +ConvTranspose2DMapper = NodeMapper(PULPConvTranspose2DParser(), PULPConvTranspose2DTilingReadyBindings) GEMMMapper = NodeMapper(PULPGEMMParser(), PULPRQSGEMMTilingReadyBindings) FloatGEMMMapper = NodeMapper(GEMMParser(), PULPFPGEMMTilingReadyBindings) MatrixVecMapper = NodeMapper(PULPMatrixVecParser(), PULPRQSMatrixVecTilingReadyBindings) @@ -111,8 +114,10 @@ QuantMapper = NodeMapper(QuantParser(), BasicQuantBindings) DequantMapper = NodeMapper(DequantParser(), BasicDequantBindings) GEMMDequantMapper = NodeMapper(PULPGEMMParser(), BasicGEMMBindings) +BatchNormalizationMapper = NodeMapper(BatchNormParser(), PULPBatchNormTilingReadyBindings) PULPMapping = { 'Conv': ConvLayer([FPConv2DMapper, FPDWConv2DMapper]), + 'ConvTranspose': ConvTransposeLayer([ConvTranspose2DMapper]), 'RequantizedConv': PULPRQSConvLayer([Conv2DMapper, DWConv2DMapper, Conv1DMapper, DWConv1DMapper]), 'RequantizedGemm': PULPRQSGEMMLayer([MatrixVecMapper, TallGEMMMapper, GEMMMapper]), 'Gemm': GEMMLayer([FloatGEMMMapper, GEMMDequantMapper]), @@ -128,6 +133,7 @@ 'iSoftmax': SoftmaxLayer([Softmax_int8_Mapper]), 'Softmax': SoftmaxLayer([SoftmaxMapper]), 'ReduceMean': ReduceMeanLayer([ReduceMeanMapper]), + 'ReduceLogSumExp': ReduceLogSumExpLayer([ReduceLogSumExpMapper]), 'ReduceSum': ReduceSumLayer([ReduceSumMapper]), 'RequantShift': RequantShiftLayer([UniformRequantShiftMapper, RequantShiftMapper]), 'Add': AddLayer([AddMapper]), @@ -148,6 +154,7 @@ 'RequantizediHardswish': RQSiHardswishLayer([RQSiHardswishMapper]), 'Quant': QuantLayer([QuantMapper]), 'Dequant': QuantLayer([DequantMapper]), + 'BatchNormalization': BatchNormalizationLayer([BatchNormalizationMapper]), 'SoftmaxGrad': SoftmaxGradLayer([SoftmaxGradMapper]), 'SoftmaxCrossEntropyLoss': SoftmaxCrossEntropyLossLayer([SoftmaxCrossEntropyLossMapper]), 'SoftmaxCrossEntropyLossGrad': SoftmaxCrossEntropyLossGradLayer([SoftmaxCrossEntropyLossGradMapper]), diff --git a/Deeploy/Targets/PULPOpen/Templates/ConvTransposeTemplate.py b/Deeploy/Targets/PULPOpen/Templates/ConvTransposeTemplate.py new file mode 100644 index 0000000000..5ef8d5855e --- /dev/null +++ b/Deeploy/Targets/PULPOpen/Templates/ConvTransposeTemplate.py @@ -0,0 +1,36 @@ +# SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from Deeploy.DeeployTypes import NodeTemplate + +reference2DTemplate = NodeTemplate(""" +<% +batchOffsetIn = ch_im_in * dim_im_in_x * dim_im_in_y +batchOffsetOut = ch_im_out * dim_im_out_x * dim_im_out_y +%> + +// 2D FP ConvTranspose CHW on PULPOpen/Siracusa (Name: ${nodeName}, Op: ${nodeOp}) +${data_in_type.typeName} ref_${data_out}_${data_in} = ${data_in}; +${data_out_type.typeName} ref_${data_out}_${data_out} = ${data_out}; + +for (uint32_t n=0; n<${batch}; ++n) { + PULP_ConvTranspose2d_fp32_fp32_fp32_CHW( + ref_${data_out}_${data_in}, + ${ch_im_in}, ${dim_im_in_x}, ${dim_im_in_y}, + ${weight}, + ${ch_im_out}, ${group}, + ${dim_kernel_x}, ${dim_kernel_y}, + ${stride_x}, ${stride_y}, + ${dilation_x}, ${dilation_y}, + ${padding_y_top}, ${padding_y_bottom}, + ${padding_x_left}, ${padding_x_right}, + ${bias}, ${has_bias}, + ref_${data_out}_${data_out}, + ${dim_im_out_x}, ${dim_im_out_y} + ); + + ref_${data_out}_${data_in} += ${batchOffsetIn}; + ref_${data_out}_${data_out} += ${batchOffsetOut}; +} +""") diff --git a/Deeploy/Targets/PULPOpen/Templates/FloatConvTemplate.py b/Deeploy/Targets/PULPOpen/Templates/FloatConvTemplate.py index bfa893db94..bd4f108ab3 100644 --- a/Deeploy/Targets/PULPOpen/Templates/FloatConvTemplate.py +++ b/Deeploy/Targets/PULPOpen/Templates/FloatConvTemplate.py @@ -18,10 +18,12 @@ def __init__(self, templateStr): def computeTransientBuffersSize( ctxt: NetworkContext, operatorRepresentation: OperatorRepresentation) -> List[Tuple[str, Union[int, IntVar]]]: + n_cores = int(operatorRepresentation["n_cores"]) + # Conservative fallback used during Autoencoder2D debug: + # n_cores = max(int(operatorRepresentation.get("n_cores", 8)), 8) # Memory allocation for the im2col buffer can be dynamic, based on the number of cores. - im2col_dim = (operatorRepresentation["weight_type"].typeWidth // - 8) * operatorRepresentation["n_cores"] * operatorRepresentation[ - 'ch_im_in'] * operatorRepresentation['dim_kernel_x'] * operatorRepresentation['dim_kernel_y'] + im2col_dim = (operatorRepresentation["weight_type"].typeWidth // 8) * n_cores * operatorRepresentation[ + 'ch_im_in'] * operatorRepresentation['dim_kernel_x'] * operatorRepresentation['dim_kernel_y'] im2col_name = operatorRepresentation['nodeName'] + "_buffer" @@ -48,9 +50,12 @@ def computeTransientBuffersSize( ctxt: NetworkContext, operatorRepresentation: OperatorRepresentation) -> List[Tuple[str, Union[int, IntVar]]]: + n_cores = int(operatorRepresentation["n_cores"]) + # Conservative fallback used during Autoencoder2D debug: + # n_cores = max(int(operatorRepresentation.get("n_cores", 8)), 8) # Memory allocation for the im2col buffer can be dynamic, based on the number of cores. - im2col_dim = (operatorRepresentation["weight_type"].typeWidth // 8) * operatorRepresentation[ - "n_cores"] * operatorRepresentation['dim_kernel_x'] * operatorRepresentation['dim_kernel_y'] + im2col_dim = (operatorRepresentation["weight_type"].typeWidth // + 8) * n_cores * operatorRepresentation['dim_kernel_x'] * operatorRepresentation['dim_kernel_y'] im2col_name = operatorRepresentation['nodeName'] + "_buffer" diff --git a/Deeploy/Targets/PULPOpen/Templates/FloatGemmTemplate.py b/Deeploy/Targets/PULPOpen/Templates/FloatGemmTemplate.py index 59499706e5..917e82a822 100644 --- a/Deeploy/Targets/PULPOpen/Templates/FloatGemmTemplate.py +++ b/Deeploy/Targets/PULPOpen/Templates/FloatGemmTemplate.py @@ -4,7 +4,8 @@ from typing import Dict, List, Tuple -from Deeploy.AbstractDataTypes import float32_tPtr +from Deeploy.AbstractDataTypes import PointerClass +from Deeploy.CommonExtensions.DataTypes import float32_t from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation @@ -19,7 +20,7 @@ def alignToContext(self, ctxt: NetworkContext, if 'C' not in operatorRepresentation or operatorRepresentation['C'] is None: # No bias case - set C to NULL and provide a default type operatorRepresentation['C'] = None - operatorRepresentation['C_type'] = float32_tPtr # Default to fp32 type + operatorRepresentation['C_type'] = PointerClass(float32_t) # Default to fp32 type operatorRepresentation['C_batched'] = False return ctxt, operatorRepresentation, [] @@ -76,4 +77,4 @@ def alignToContext(self, ctxt: NetworkContext, ref_${data_out}_${data_out} += ${M} * ${O}; } -""") \ No newline at end of file +""") diff --git a/Deeploy/Targets/PULPOpen/Templates/FloatMulTemplate.py b/Deeploy/Targets/PULPOpen/Templates/FloatMulTemplate.py index ced6c3cbcf..1d1d6a70bc 100644 --- a/Deeploy/Targets/PULPOpen/Templates/FloatMulTemplate.py +++ b/Deeploy/Targets/PULPOpen/Templates/FloatMulTemplate.py @@ -14,18 +14,33 @@ uint32_t ${nodeName}_end = MIN(${nodeName}_start + ${nodeName}_chunk, (uint32_t) ${size}); if (${nodeName}_start < ${nodeName}_end) { +% if sizeB == 1: float32_t ${nodeName}_scalar = ${B}[0]; +% endif uint32_t ${nodeName}_unroll_end = ${nodeName}_start + ((${nodeName}_end - ${nodeName}_start) / 6) * 6; for (uint32_t i = ${nodeName}_start; i < ${nodeName}_unroll_end; i += 6) { +% if sizeB == 1: ${C}[i + 0] = ${A}[i + 0] * ${nodeName}_scalar; ${C}[i + 1] = ${A}[i + 1] * ${nodeName}_scalar; ${C}[i + 2] = ${A}[i + 2] * ${nodeName}_scalar; ${C}[i + 3] = ${A}[i + 3] * ${nodeName}_scalar; ${C}[i + 4] = ${A}[i + 4] * ${nodeName}_scalar; ${C}[i + 5] = ${A}[i + 5] * ${nodeName}_scalar; +% else: + ${C}[i + 0] = ${A}[i + 0] * ${B}[i + 0]; + ${C}[i + 1] = ${A}[i + 1] * ${B}[i + 1]; + ${C}[i + 2] = ${A}[i + 2] * ${B}[i + 2]; + ${C}[i + 3] = ${A}[i + 3] * ${B}[i + 3]; + ${C}[i + 4] = ${A}[i + 4] * ${B}[i + 4]; + ${C}[i + 5] = ${A}[i + 5] * ${B}[i + 5]; +% endif } for (uint32_t i = ${nodeName}_unroll_end; i < ${nodeName}_end; i++) { +% if sizeB == 1: ${C}[i] = ${A}[i] * ${nodeName}_scalar; +% else: + ${C}[i] = ${A}[i] * ${B}[i]; +% endif } } -""") \ No newline at end of file +""") diff --git a/Deeploy/Targets/PULPOpen/TileConstraints/ConvTileConstraint.py b/Deeploy/Targets/PULPOpen/TileConstraints/ConvTileConstraint.py index ae9ca96b3c..bd6e5dc290 100644 --- a/Deeploy/Targets/PULPOpen/TileConstraints/ConvTileConstraint.py +++ b/Deeploy/Targets/PULPOpen/TileConstraints/ConvTileConstraint.py @@ -1060,4 +1060,4 @@ def serializeTilingSolution( tilingSchedule = TilingSchedule(inputBaseOffsets, outputBaseOffsets, inputLoadSchedule, outputLoadSchedule) variableReplacementSchedule = VariableReplacementScheme(replacements, replacementTypes) - return variableReplacementSchedule, tilingSchedule \ No newline at end of file + return variableReplacementSchedule, tilingSchedule diff --git a/Deeploy/Targets/PULPOpen/TileConstraints/ConvTransposeTileConstraint.py b/Deeploy/Targets/PULPOpen/TileConstraints/ConvTransposeTileConstraint.py new file mode 100644 index 0000000000..b3b6756c41 --- /dev/null +++ b/Deeploy/Targets/PULPOpen/TileConstraints/ConvTransposeTileConstraint.py @@ -0,0 +1,189 @@ +# SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from typing import Dict, List, Tuple + +from Deeploy.AbstractDataTypes import PointerClass +from Deeploy.CommonExtensions.DataTypes import uint8_t, uint16_t +from Deeploy.DeeployTypes import NetworkContext, OperatorRepresentation +from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint +from Deeploy.TilingExtension.TileConstraint import TileConstraint +from Deeploy.TilingExtension.TilerModel import TilerModel +from Deeploy.TilingExtension.TilingCodegen import AbsoluteHyperRectangle, HyperRectangle, TilingSchedule, \ + VariableReplacementScheme + + +class ConvTranspose2DTileConstraint(TileConstraint): + + @staticmethod + def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: + # Register all tensors that participate in the tiled operator. The tiler will + # create symbolic variables for each tensor dimension and solve over them. + inputBufferName = parseDict['data_in'] + weightBufferName = parseDict['weight'] + outputBufferName = parseDict['data_out'] + + for bufferName in [inputBufferName, weightBufferName, outputBufferName]: + tilerModel.addTensorDimToModel(ctxt, bufferName) + + # Symbolic dimensions for NCHW input and output, and [Cin, Cout/group, Kh, Kw] weights. + inputBatchVar = tilerModel.getTensorDimVar(tensorName = inputBufferName, dimIdx = 0) + inputChannelVar = tilerModel.getTensorDimVar(tensorName = inputBufferName, dimIdx = 1) + inputHeightVar = tilerModel.getTensorDimVar(tensorName = inputBufferName, dimIdx = 2) + inputWidthVar = tilerModel.getTensorDimVar(tensorName = inputBufferName, dimIdx = 3) + + weightInChannelVar = tilerModel.getTensorDimVar(tensorName = weightBufferName, dimIdx = 0) + weightOutChannelVar = tilerModel.getTensorDimVar(tensorName = weightBufferName, dimIdx = 1) + weightHeightVar = tilerModel.getTensorDimVar(tensorName = weightBufferName, dimIdx = 2) + weightWidthVar = tilerModel.getTensorDimVar(tensorName = weightBufferName, dimIdx = 3) + + outputBatchVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 0) + outputChannelVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 1) + outputHeightVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 2) + outputWidthVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 3) + + # Geometrical constraints: batch is preserved, the weight Cin must match input Cin, + # and in this first implementation H/W are not tiled, so they stay equal to the + # full tensor shapes known from the context. + tilerModel.addConstraint(outputBatchVar == inputBatchVar) + tilerModel.addConstraint(weightInChannelVar == inputChannelVar) + tilerModel.addConstraint(outputHeightVar == ctxt.lookup(outputBufferName).shape[2]) + tilerModel.addConstraint(outputWidthVar == ctxt.lookup(outputBufferName).shape[3]) + tilerModel.addConstraint(inputHeightVar == ctxt.lookup(inputBufferName).shape[2]) + tilerModel.addConstraint(inputWidthVar == ctxt.lookup(inputBufferName).shape[3]) + tilerModel.addConstraint(weightHeightVar == ctxt.lookup(weightBufferName).shape[2]) + tilerModel.addConstraint(weightWidthVar == ctxt.lookup(weightBufferName).shape[3]) + + if parseDict['group'] == 1: + # For the regular case, the only tiled dimension is Cout, so the output tile + # channel count must match the weight tile channel count. + tilerModel.addConstraint(outputChannelVar == weightOutChannelVar) + else: + # Grouped ConvTranspose is kept conservative for now: no real Cout slicing is + # enforced on the weight tensor because the grouped layout needs dedicated handling. + tilerModel.addConstraint(outputChannelVar == ctxt.lookup(outputBufferName).shape[1]) + tilerModel.addConstraint(weightOutChannelVar == ctxt.lookup(weightBufferName).shape[1]) + + if parseDict.get('has_bias', "false") == "true": + # Bias follows the tiled output channels one-to-one. + biasBufferName = parseDict['bias'] + tilerModel.addTensorDimToModel(ctxt, biasBufferName) + biasChannelVar = tilerModel.getTensorDimVar(tensorName = biasBufferName, dimIdx = 0) + tilerModel.addConstraint(biasChannelVar == outputChannelVar) + + return tilerModel + + @staticmethod + def addPolicyConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: + inputBufferName = parseDict['data_in'] + outputBufferName = parseDict['data_out'] + + inputShape = ctxt.lookup(inputBufferName).shape + outputShape = ctxt.lookup(outputBufferName).shape + + inputBatchVar = tilerModel.getTensorDimVar(tensorName = inputBufferName, dimIdx = 0) + inputChannelVar = tilerModel.getTensorDimVar(tensorName = inputBufferName, dimIdx = 1) + outputBatchVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 0) + outputChannelVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 1) + + # Policy constraints: keep the full input resident and tile only along output + # channels. This is the simplest correct strategy for ConvTranspose because it + # avoids the inverse spatial dependency problem on H/W tiles. + tilerModel.addConstraint(inputBatchVar == inputShape[0]) + tilerModel.addConstraint(inputChannelVar == inputShape[1]) + tilerModel.addConstraint(outputBatchVar == outputShape[0]) + + if parseDict['group'] == 1 and parseDict["ch_im_out"] >= 8: + # Do not create tiny channel tiles unless necessary. This reduces overhead + # from DMA/codegen and keeps the kernel granularity reasonable. + tilerModel.addMinTileSizeConstraint(parseDict, 'ch_im_out', outputChannelVar, 8) + + return tilerModel + + @classmethod + def serializeTilingSolution( + cls, tilingSolution: NodeMemoryConstraint, absoluteOutputCubes: List[AbsoluteHyperRectangle], + targetMemLevel: str, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, TilingSchedule]: + # The solver gives us output tiles. For each output tile we must reconstruct the + # exact input/weight/bias tiles that must be transferred into the target memory. + outputCubes = [cube.rectangle for cube in absoluteOutputCubes] + + addrNames = ['data_in', 'weight', 'data_out'] + has_bias = operatorRepresentation.get('has_bias', "false") == "true" + if has_bias: + addrNames.append('bias') + + # Resolve base addresses in the target memory level (typically L1) for all tensors + # used by this tiled operator. + inputBaseOffsets, outputBaseOffsets = cls.extractBaseAddr(tilingSolution, targetMemLevel, + operatorRepresentation, addrNames) + + inputShape = ctxt.lookup(operatorRepresentation['data_in']).shape + weightShape = ctxt.lookup(operatorRepresentation['weight']).shape + + inputInCubes = [] + inputWeightCubes = [] + inputBiasCubes = [] + + replacements = { + "ch_im_out": [], + "batch": [], + } + replacementTypes = { + "ch_im_out": PointerClass(uint16_t), + "batch": PointerClass(uint8_t), + } + + for cube in outputCubes: + # Output tiles are NCHW. Since we only tile along C_out, only the channel + # offset/size are relevant here. + _, cOffset, _, _ = cube.offset + nSize, cSize, _, _ = cube.dims + + # Input is kept whole for every tile in this first implementation. + inputInCubes.append(HyperRectangle((0, 0, 0, 0), tuple(inputShape))) + + if operatorRepresentation['group'] == 1: + # Slice the weight tensor along the output-channel axis so the kernel sees + # only the filters needed for the current output tile. + weightOffset = (0, cOffset, 0, 0) + weightDims = (weightShape[0], cSize, weightShape[2], weightShape[3]) + else: + # Conservative grouped fallback: use full weights until grouped slicing is + # implemented explicitly for ConvTranspose tiling. + weightOffset = (0, 0, 0, 0) + weightDims = tuple(weightShape) + + inputWeightCubes.append(HyperRectangle(weightOffset, weightDims)) + + if has_bias: + if operatorRepresentation['group'] == 1: + # Bias is sliced exactly like the output channels. + inputBiasCubes.append(HyperRectangle((cOffset,), (cSize,))) + else: + inputBiasCubes.append(HyperRectangle((0,), (ctxt.lookup(operatorRepresentation['bias']).shape[0],))) + + # These replacements are injected into the template so each kernel invocation + # uses the tile-local shape instead of the global layer shape. + replacements["ch_im_out"].append(cSize) + replacements["batch"].append(nSize) + + inputLoadSchedule = [] + outputLoadSchedule = [] + + # Build the per-tile DMA/load schedule: which tensor cubes must be present before + # computing each output cube. + if has_bias: + for inCube, weightCube, biasCube in zip(inputInCubes, inputWeightCubes, inputBiasCubes): + inputLoadSchedule.append({"data_in": inCube, "weight": weightCube, "bias": biasCube}) + else: + for inCube, weightCube in zip(inputInCubes, inputWeightCubes): + inputLoadSchedule.append({"data_in": inCube, "weight": weightCube}) + + for out in outputCubes: + outputLoadSchedule.append({"data_out": out}) + + schedule = TilingSchedule(inputBaseOffsets, outputBaseOffsets, inputLoadSchedule, outputLoadSchedule) + return VariableReplacementScheme(replacements, replacementTypes), schedule diff --git a/Deeploy/Targets/PULPOpen/TileConstraints/MatMulTileConstraint.py b/Deeploy/Targets/PULPOpen/TileConstraints/MatMulTileConstraint.py index ee7e448be6..fb55dd536d 100644 --- a/Deeploy/Targets/PULPOpen/TileConstraints/MatMulTileConstraint.py +++ b/Deeploy/Targets/PULPOpen/TileConstraints/MatMulTileConstraint.py @@ -6,7 +6,7 @@ from typing import Dict, List, Tuple from Deeploy.AbstractDataTypes import PointerClass -from Deeploy.CommonExtensions.DataTypes import int8_t +from Deeploy.CommonExtensions.DataTypes import uint16_t from Deeploy.DeeployTypes import NetworkContext, OperatorRepresentation from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint from Deeploy.TilingExtension.TileConstraint import TileConstraint @@ -209,10 +209,10 @@ def serializeTilingSolution( replacements["N"] = [NSize] * len(outputCubes) replacementTypes = { - "M": PointerClass(int8_t), - "N": PointerClass(int8_t), - "O": PointerClass(int8_t), - "batch": PointerClass(int8_t) + "M": PointerClass(uint16_t), + "N": PointerClass(uint16_t), + "O": PointerClass(uint16_t), + "batch": PointerClass(uint16_t) } # Update load schedule lists diff --git a/Deeploy/Targets/PULPOpen/TileConstraints/PadTileConstraint.py b/Deeploy/Targets/PULPOpen/TileConstraints/PadTileConstraint.py new file mode 100644 index 0000000000..c7cce27d06 --- /dev/null +++ b/Deeploy/Targets/PULPOpen/TileConstraints/PadTileConstraint.py @@ -0,0 +1,215 @@ +# SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +from typing import Dict, List, Tuple + +import numpy as np + +from Deeploy.AbstractDataTypes import PointerClass +from Deeploy.CommonExtensions.DataTypes import uint16_t +from Deeploy.DeeployTypes import NetworkContext, OperatorRepresentation +from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint +from Deeploy.TilingExtension.TileConstraint import TileConstraint +from Deeploy.TilingExtension.TilerModel import TilerModel +from Deeploy.TilingExtension.TilingCodegen import AbsoluteHyperRectangle, HyperRectangle, TilingSchedule, \ + VariableReplacementScheme + + +class Pad2DTileConstraint(TileConstraint): + + @staticmethod + def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: + inputBufferName = parseDict['data_in'] + outputBufferName = parseDict['data_out'] + + buffersOfInterest = [inputBufferName, outputBufferName] + if 'pads_tensor' in parseDict: + buffersOfInterest.append(parseDict['pads_tensor']) + if 'value_tensor' in parseDict: + buffersOfInterest.append(parseDict['value_tensor']) + + for bufferName in buffersOfInterest: + tilerModel.addTensorDimToModel(ctxt, bufferName) + + channels_first = bool(parseDict.get('channels_first', 1)) + + if channels_first: + batchDim = 0 + channelDim = 1 + heightDim = 2 + widthDim = 3 + else: + batchDim = 0 + heightDim = 1 + widthDim = 2 + channelDim = 3 + + inputBatchVar = tilerModel.getTensorDimVar(inputBufferName, batchDim) + inputHeightVar = tilerModel.getTensorDimVar(inputBufferName, heightDim) + inputWidthVar = tilerModel.getTensorDimVar(inputBufferName, widthDim) + inputChannelVar = tilerModel.getTensorDimVar(inputBufferName, channelDim) + + outputBatchVar = tilerModel.getTensorDimVar(outputBufferName, batchDim) + outputHeightVar = tilerModel.getTensorDimVar(outputBufferName, heightDim) + outputWidthVar = tilerModel.getTensorDimVar(outputBufferName, widthDim) + outputChannelVar = tilerModel.getTensorDimVar(outputBufferName, channelDim) + + tilerModel.addConstraint(outputBatchVar == inputBatchVar) + tilerModel.addConstraint(outputWidthVar == inputWidthVar + 2 * parseDict['pad_x']) + tilerModel.addConstraint(outputChannelVar == inputChannelVar) + + # Height tiles may include top/bottom padding only at the boundary tiles. + tilerModel.addConstraint(outputHeightVar >= inputHeightVar) + tilerModel.addConstraint(outputHeightVar <= inputHeightVar + 2 * parseDict['pad_y']) + + return tilerModel + + @staticmethod + def addPolicyConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: + inputBufferName = parseDict['data_in'] + outputBufferName = parseDict['data_out'] + + channels_first = bool(parseDict.get('channels_first', 1)) + + if channels_first: + batchDim = 0 + channelDim = 1 + heightDim = 2 + widthDim = 3 + else: + batchDim = 0 + heightDim = 1 + widthDim = 2 + channelDim = 3 + + inputBuffer = ctxt.lookup(inputBufferName) + outputBuffer = ctxt.lookup(outputBufferName) + + # Keep batch, width and channels whole to preserve contiguous NHWC row copies. + tilerModel.addConstraint(tilerModel.getTensorDimVar(inputBufferName, batchDim) == inputBuffer.shape[batchDim]) + tilerModel.addConstraint(tilerModel.getTensorDimVar(outputBufferName, batchDim) == outputBuffer.shape[batchDim]) + tilerModel.addConstraint(tilerModel.getTensorDimVar(inputBufferName, widthDim) == inputBuffer.shape[widthDim]) + tilerModel.addConstraint(tilerModel.getTensorDimVar(outputBufferName, widthDim) == outputBuffer.shape[widthDim]) + tilerModel.addConstraint( + tilerModel.getTensorDimVar(inputBufferName, channelDim) == inputBuffer.shape[channelDim]) + tilerModel.addConstraint( + tilerModel.getTensorDimVar(outputBufferName, channelDim) == outputBuffer.shape[channelDim]) + + # Avoid pure-padding tiles and avoid tail tiles with a single padding row. + outputHeightVar = tilerModel.getTensorDimVar(outputBufferName, heightDim) + tilerModel.addConstraint(outputHeightVar >= (parseDict['pad_y'] + 1)) + + return tilerModel + + @staticmethod + def _computeInputCube(outputCube: HyperRectangle, parseDict: Dict, + outputShape: Tuple[int, ...]) -> Tuple[HyperRectangle, int]: + channels_first = bool(parseDict.get('channels_first', 1)) + + if channels_first: + batchOffset, _, outputHOffset, _ = outputCube.offset + batchSize, _, outputHSize, _ = outputCube.dims + inputChannels = parseDict['dim_im_in_ch'] + inputWidth = parseDict['dim_im_in_y'] + else: + batchOffset, outputHOffset, _, _ = outputCube.offset + batchSize, outputHSize, _, _ = outputCube.dims + inputChannels = parseDict['dim_im_in_ch'] + inputWidth = parseDict['dim_im_in_y'] + + padTop = parseDict['pad_y'] + inputHeightTotal = parseDict['dim_im_in_x'] + outputHeightTotal = outputShape[2] if channels_first else outputShape[1] + + dataStart = padTop + dataEnd = padTop + inputHeightTotal + tileEnd = outputHOffset + outputHSize + + localPadTop = max(dataStart - outputHOffset, 0) + localPadBottom = max(tileEnd - dataEnd, 0) + + inputStart = max(outputHOffset, dataStart) - dataStart + inputEnd = min(tileEnd, dataEnd) - dataStart + inputHeight = max(inputEnd - inputStart, 1) + + if channels_first: + inCube = HyperRectangle((batchOffset, 0, inputStart, 0), + (batchSize, inputChannels, inputHeight, inputWidth)) + else: + inCube = HyperRectangle((batchOffset, inputStart, 0, 0), + (batchSize, inputHeight, inputWidth, inputChannels)) + + _ = localPadBottom # Bottom padding is encoded by the output height itself. + return inCube, localPadTop + + @classmethod + def serializeTilingSolution( + cls, tilingSolution: NodeMemoryConstraint, absoluteOutputCubes: List[AbsoluteHyperRectangle], + targetMemLevel: str, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, TilingSchedule]: + outputCubes = [cube.rectangle for cube in absoluteOutputCubes] + + addrNames = ['data_in', 'data_out'] + inputBaseOffsets, outputBaseOffsets = cls.extractBaseAddr(tilingSolution, targetMemLevel, + operatorRepresentation, addrNames) + + replacements: Dict[str, List[int]] = { + "batch": [], + "dim_im_in_x": [], + "dim_im_in_y": [], + "dim_im_in_ch": [], + "dim_im_out_x": [], + "dim_im_out_y": [], + "dim_im_out_ch": [], + "pad_y": [], + "data_out_size": [], + } + + replacementTypes = { + "batch": PointerClass(uint16_t), + "dim_im_in_x": PointerClass(uint16_t), + "dim_im_in_y": PointerClass(uint16_t), + "dim_im_in_ch": PointerClass(uint16_t), + "dim_im_out_x": PointerClass(uint16_t), + "dim_im_out_y": PointerClass(uint16_t), + "dim_im_out_ch": PointerClass(uint16_t), + "pad_y": PointerClass(uint16_t), + "data_out_size": PointerClass(uint16_t), + } + + outputShape = tuple(ctxt.lookup(operatorRepresentation['data_out']).shape) + channels_first = bool(operatorRepresentation.get('channels_first', 1)) + + inputLoadSchedule = [] + outputLoadSchedule = [] + + for outCube in outputCubes: + inCube, localPadTop = cls._computeInputCube(outCube, operatorRepresentation, outputShape) + + replacements["batch"].append(outCube.dims[0]) + replacements["pad_y"].append(localPadTop) + replacements["data_out_size"].append(int(np.prod(outCube.dims))) + + if channels_first: + replacements["dim_im_in_ch"].append(inCube.dims[1]) + replacements["dim_im_in_x"].append(inCube.dims[2]) + replacements["dim_im_in_y"].append(inCube.dims[3]) + replacements["dim_im_out_ch"].append(outCube.dims[1]) + replacements["dim_im_out_x"].append(outCube.dims[2]) + replacements["dim_im_out_y"].append(outCube.dims[3]) + else: + replacements["dim_im_in_x"].append(inCube.dims[1]) + replacements["dim_im_in_y"].append(inCube.dims[2]) + replacements["dim_im_in_ch"].append(inCube.dims[3]) + replacements["dim_im_out_x"].append(outCube.dims[1]) + replacements["dim_im_out_y"].append(outCube.dims[2]) + replacements["dim_im_out_ch"].append(outCube.dims[3]) + + inputLoadSchedule.append({"data_in": inCube}) + outputLoadSchedule.append({"data_out": outCube}) + + tilingSchedule = TilingSchedule(inputBaseOffsets, outputBaseOffsets, inputLoadSchedule, outputLoadSchedule) + variableReplacementSchedule = VariableReplacementScheme(replacements, replacementTypes) + + return variableReplacementSchedule, tilingSchedule diff --git a/Deeploy/Targets/PULPOpen/TileConstraints/SliceConstraint.py b/Deeploy/Targets/PULPOpen/TileConstraints/SliceConstraint.py index 5309300659..4a1993c45e 100644 --- a/Deeploy/Targets/PULPOpen/TileConstraints/SliceConstraint.py +++ b/Deeploy/Targets/PULPOpen/TileConstraints/SliceConstraint.py @@ -19,6 +19,12 @@ class SliceTileConstraint(TileConstraint): + @staticmethod + def _resolveImmediate(value, ctxt: NetworkContext): + if isinstance(value, str): + return np.asarray(ctxt.lookup(value).values).astype(np.int64).reshape(-1) + return np.asarray(value).astype(np.int64).reshape(-1) + @staticmethod def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: @@ -31,8 +37,8 @@ def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: Netw inputShape = parseDict['data_in_shape'] # Get other necessary information - sliceAxes = parseDict['axes'] - sliceSteps = parseDict['steps'] + sliceAxes = SliceTileConstraint._resolveImmediate(parseDict['axes'], ctxt) + sliceSteps = SliceTileConstraint._resolveImmediate(parseDict['steps'], ctxt) # ===== ADD I/O DIMENSIONS TO THE MODEL AS VARIABLES ===== for bufferName in [inputBufferName, outputBufferName]: @@ -83,10 +89,14 @@ def computeInputCubeFromOutputCube(outputCube: AbsoluteHyperRectangle, parseDict in_cube_offset = list(outputCube.offset).copy() # Iterate through the sliced axes - for idx, ax in enumerate(parseDict['axes']): + sliceAxes = np.asarray(parseDict['axes']).astype(np.int64).reshape(-1) + sliceStarts = np.asarray(parseDict['starts']).astype(np.int64).reshape(-1) + sliceSteps = np.asarray(parseDict['steps']).astype(np.int64).reshape(-1) + + for idx, ax in enumerate(sliceAxes): # Get current sliced ax parameters - start = parseDict['starts'][idx] - step = parseDict['steps'][idx] + start = sliceStarts[idx] + step = sliceSteps[idx] # Compute input cube parameters for the current axis in_cube_dims[ax] = (outputCube.dims[ax] - 1) * step + 1 @@ -149,12 +159,18 @@ def serializeTilingSolution( outputLoadSchedule = [] for out_cube in outputCubes: + immediateOperatorRepresentation = operatorRepresentation.copy() + immediateOperatorRepresentation['axes'] = cls._resolveImmediate(operatorRepresentation['axes'], ctxt) + immediateOperatorRepresentation['starts'] = cls._resolveImmediate(operatorRepresentation['starts'], ctxt) + immediateOperatorRepresentation['steps'] = cls._resolveImmediate(operatorRepresentation['steps'], ctxt) + # Compute input cube - in_cube = SliceTileConstraint.computeInputCubeFromOutputCube(out_cube, parseDict = operatorRepresentation) + in_cube = SliceTileConstraint.computeInputCubeFromOutputCube(out_cube, + parseDict = immediateOperatorRepresentation) # Compute new ends for replacement new_ends = list() - for ax in operatorRepresentation['axes']: + for ax in immediateOperatorRepresentation['axes']: new_ends.append(in_cube.offset[ax] + in_cube.dims[ax]) # Append replacement elements diff --git a/Deeploy/Targets/PULPOpen/Tiler.py b/Deeploy/Targets/PULPOpen/Tiler.py index 901106459e..75856b0314 100644 --- a/Deeploy/Targets/PULPOpen/Tiler.py +++ b/Deeploy/Targets/PULPOpen/Tiler.py @@ -14,10 +14,12 @@ from Deeploy.Targets.Generic.TileConstraints.RQSiHardswishTileConstraint import RQSiHardswishTileConstraint from Deeploy.Targets.Generic.TileConstraints.TransposeTileConstraint import TransposeTileConstraint from Deeploy.Targets.Generic.TileConstraints.UnaryTileConstraint import UnaryTileConstraint -from Deeploy.Targets.PULPOpen.Bindings import PULPAddBindings, PULPConcatBindings, PULPFloatConv2DBindings, \ - PULPFloatDWConv2DBindings, PULPFloatGELUBinding, PULPFloatGELUGradBinding, PULPFloatGEMMBindings, \ - PULPGatherBindings, PULPiHardswishBindings, PULPiRMSNormBindings, PULPiRQSGELUBindings, PULPLayernormBinding, \ - PULPLayernormGradBinding, PULPMatMulBindings, PULPMaxPool1DBindings, PULPMaxPool2DBindings, PULPMulBindings, \ +from Deeploy.Targets.Generic.TileConstraints.UntiledTileConstraint import UntiledTileConstraint +from Deeploy.Targets.PULPOpen.Bindings import PULPAddBindings, PULPBatchNormBindings, PULPConcatBindings, \ + PULPFloatConv2DBindings, PULPFloatConvTranspose2DBindings, PULPFloatDWConv2DBindings, PULPFloatGELUBinding, \ + PULPFloatGELUGradBinding, PULPFloatGEMMBindings, PULPGatherBindings, PULPiHardswishBindings, PULPiRMSNormBindings, \ + PULPiRQSGELUBindings, PULPLayernormBinding, PULPLayernormGradBinding, PULPMatMulBindings, PULPMaxPool1DBindings, \ + PULPMaxPool2DBindings, PULPMulBindings, PULPPad1DBindings, PULPPad2DBindings, PULPReduceLogSumExpBindings, \ PULPReduceMeanBindings, PULPReduceSumBindings, PULPReluBinding, PULPReshapeBindings, PULPRQAddBindings, \ PULPRQSBindings, PULPRQSConv1DBindings, PULPRQSConv2DBindings, PULPRQSDWConv2DBindings, PULPRQSGEMMBindings, \ PULPRQSiHardswishBindings, PULPRQSMatrixVecBindings, PULPRQSTallGEMMBindings, PULPSGDBindings, PULPSliceBindings, \ @@ -25,6 +27,7 @@ PULPSoftmaxGradBindings, PULPTransposeBindings, PULPUniformRQSBindings from Deeploy.Targets.PULPOpen.TileConstraints.ConvTileConstraint import Conv2DTileConstraint, RQConv1DTileConstraint, \ RQConv2DTileConstraint +from Deeploy.Targets.PULPOpen.TileConstraints.ConvTransposeTileConstraint import ConvTranspose2DTileConstraint from Deeploy.Targets.PULPOpen.TileConstraints.DWConvTileConstraint import DWConv2DTileConstraint, \ RQDWConv2DTileConstraint from Deeploy.Targets.PULPOpen.TileConstraints.GatherTileConstraint import GatherTileConstraint @@ -40,7 +43,6 @@ from Deeploy.Targets.PULPOpen.TileConstraints.ReduceSumTileConstraint import ReduceSumTileConstraint from Deeploy.Targets.PULPOpen.TileConstraints.RequantShiftTileConstraint import RequantShiftTileConstraint from Deeploy.Targets.PULPOpen.TileConstraints.SGDTileConstraint import SGDTileConstraint -from Deeploy.Targets.PULPOpen.TileConstraints.SliceConstraint import SliceTileConstraint from Deeploy.Targets.PULPOpen.TileConstraints.SoftmaxCrossEntropyTileConstraint import \ SoftmaxCrossEntropyGradTileConstraint, SoftmaxCrossEntropyTileConstraint from Deeploy.TilingExtension.TilerExtension import TilingReadyNodeBindings @@ -57,6 +59,12 @@ PULPConv2DTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPFloatConv2DBindings, tileConstraint = Conv2DTileConstraint()) +PULPConv2DUntiledTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPFloatConv2DBindings, + tileConstraint = UntiledTileConstraint()) + +PULPConvTranspose2DTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPFloatConvTranspose2DBindings, + tileConstraint = ConvTranspose2DTileConstraint()) + PULPDWConv2DTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPFloatDWConv2DBindings, tileConstraint = DWConv2DTileConstraint()) @@ -95,6 +103,9 @@ PULPMaxPool2DTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPMaxPool2DBindings, tileConstraint = MaxPoolCTileConstraint()) +PULPMaxPool2DUntiledTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPMaxPool2DBindings, + tileConstraint = UntiledTileConstraint()) + PULPRQSTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPRQSBindings, tileConstraint = RequantShiftTileConstraint()) @@ -152,11 +163,25 @@ PULPReduceSumTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPReduceSumBindings, tileConstraint = ReduceSumTileConstraint()) +_PULPReduceLogSumExpBindings = copy.deepcopy(PULPReduceLogSumExpBindings) + +PULPReduceLogSumExpTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = _PULPReduceLogSumExpBindings, + tileConstraint = UntiledTileConstraint()) + PULPSGDTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPSGDBindings, tileConstraint = SGDTileConstraint()) PULPSliceTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPSliceBindings, - tileConstraint = SliceTileConstraint()) + tileConstraint = UntiledTileConstraint()) PULPReduceMeanTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPReduceMeanBindings, tileConstraint = ReduceMeanTileConstraint()) + +PULPBatchNormTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPBatchNormBindings, + tileConstraint = UntiledTileConstraint()) + +PULPPad1DTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPPad1DBindings, + tileConstraint = UntiledTileConstraint()) + +PULPPad2DTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = PULPPad2DBindings, + tileConstraint = UntiledTileConstraint()) diff --git a/Deeploy/TilingExtension/CodeTransformationPasses/TilingCodeGeneration.py b/Deeploy/TilingExtension/CodeTransformationPasses/TilingCodeGeneration.py index 51f87534ea..1cbd64bbe7 100644 --- a/Deeploy/TilingExtension/CodeTransformationPasses/TilingCodeGeneration.py +++ b/Deeploy/TilingExtension/CodeTransformationPasses/TilingCodeGeneration.py @@ -187,6 +187,7 @@ def _legalizeTransfers(self, transfers: List[HyperRectangle], outerShape: Tuple[ if isFinalMemoryLevel: minimizedTransfers = [] + fallbackToSharedOuterShape = False for rect in transfers: paddedRect = HyperRectangle(padOffset(rect.offset, commonRank), padShape(rect.dims, commonRank)) minRect, newMinOuterShape = minimizeRectangle(paddedRect, outerShape) @@ -194,13 +195,26 @@ def _legalizeTransfers(self, transfers: List[HyperRectangle], outerShape: Tuple[ minOuterShape = newMinOuterShape else: if minOuterShape != newMinOuterShape: - rectStr = "\n".join(str(trans) for trans in transfers[:transfers.index(rect)]) - raise RuntimeError(f"""Currently support a single minimal outer shape. -Old minOuterShape: {minOuterShape} vs. new minOuterShape {newMinOuterShape}. -New minOuterShape produced by outerDims: {outerShape} and rect: {rect}. -Old minOuterShape produced by outerDims: {outerShape} and rects: -{rectStr}""") + # Keep a shared outer shape when different tiles of the same external tensor + # minimize differently. This shows up for L3->L2 transposed DMA on residual + # tiles: a full-width tile may collapse to 1D, while the tail tile keeps a + # higher-rank shape. Falling back to the common padded shape is conservative + # but preserves the existing tiling structure and avoids introducing + # layer-specific hacks here. + # + # A better long-term fix would be earlier in tiling propagation: if an + # intermediate layer such as a Transpose fits entirely in L2, keep it whole + # there instead of inheriting a downstream residual tiling split. That would + # minimize L3 traffic rather than only making DMA legalization robust. + fallbackToSharedOuterShape = True + break minimizedTransfers.append(minRect) + if fallbackToSharedOuterShape: + minimizedTransfers = [ + HyperRectangle(padOffset(rect.offset, commonRank), padShape(rect.dims, commonRank)) + for rect in transfers + ] + minOuterShape = outerShape else: minimizedTransfers = [HyperRectangle((0,), (int(np.prod(rect.dims)),)) for rect in transfers] minOuterShape = (int(np.prod(outerShape)),) diff --git a/Deeploy/TilingExtension/CodeTransformationPasses/TilingHoistingMixIn.py b/Deeploy/TilingExtension/CodeTransformationPasses/TilingHoistingMixIn.py index 8a0c1b9b54..bbcbc512cc 100644 --- a/Deeploy/TilingExtension/CodeTransformationPasses/TilingHoistingMixIn.py +++ b/Deeploy/TilingExtension/CodeTransformationPasses/TilingHoistingMixIn.py @@ -70,7 +70,21 @@ def _hoistReference(self, shape: Tuple[int, ...] = (1,), offset: Union[int, str, VariableBuffer] = 0, override_type: Optional[Type[BaseType]] = None) -> _ReferenceBuffer: - ref = ctxt.hoistReference(self.prefix + name, reference, shape, offset, override_type) + refName = self.prefix + name + if ctxt.is_local(refName): + ref = ctxt.lookup(refName) + assert isinstance(ref, _ReferenceBuffer) + assert ref._referenceName == reference.name + assert tuple(ref.shape) == tuple(shape) + expectedOffset = offset.name if isinstance(offset, VariableBuffer) else offset + assert ref._offset == expectedOffset + if override_type is not None: + assert ref._type == PointerClass(override_type) + else: + assert ref._type == reference._type + return ref + + ref = ctxt.hoistReference(refName, reference, shape, offset, override_type) ref._memoryLevel = self.memory return ref diff --git a/DeeployTest/Platforms/Generic/main.c b/DeeployTest/Platforms/Generic/main.c index e2b0449fb5..20fb980510 100644 --- a/DeeployTest/Platforms/Generic/main.c +++ b/DeeployTest/Platforms/Generic/main.c @@ -25,6 +25,23 @@ int main() { printf("Running network...\r\n"); RunNetwork(0, 1); + const char *dump_outputs = getenv("DEEPLOY_DUMP_OUTPUTS"); + if (dump_outputs && dump_outputs[0] != '\0') { + for (uint32_t buf = 0; buf < DeeployNetwork_num_outputs; buf++) { + uint32_t count = DeeployNetwork_outputs_bytes[buf] / sizeof(OUTPUTTYPE); + printf("OUTPUT %u %u\r\n", buf, count); + for (uint32_t i = 0; i < count; i++) { + OUTPUTTYPE actual = ((OUTPUTTYPE *)DeeployNetwork_outputs[buf])[i]; +#if ISOUTPUTFLOAT == 1 + printf("%.9g\r\n", (double)actual); +#else + printf("%d\r\n", actual); +#endif + } + } + return 0; + } + int32_t tot_err = 0; uint32_t tot = 0; OUTPUTTYPE diff; @@ -62,4 +79,4 @@ int main() { printf("Errors: %d out of %d \r\n", tot_err, tot); return tot_err; -} \ No newline at end of file +} diff --git a/DeeployTest/Platforms/PULPOpen/src/deeploytest.c b/DeeployTest/Platforms/PULPOpen/src/deeploytest.c index 11d889e48d..1252edd518 100644 --- a/DeeployTest/Platforms/PULPOpen/src/deeploytest.c +++ b/DeeployTest/Platforms/PULPOpen/src/deeploytest.c @@ -4,6 +4,8 @@ * SPDX-License-Identifier: Apache-2.0 */ +#include + #include "CycleCounter.h" #include "Network.h" #include "dory_mem.h" @@ -13,6 +15,8 @@ #define MAINSTACKSIZE 8000 #define SLAVESTACKSIZE 3800 +#define FLOAT_ABS_TOL 1e-4f +#define FLOAT_REL_TOL 1e-5f struct pi_device cluster_dev; @@ -40,8 +44,15 @@ void CompareFloatOnCluster(void *args) { float expected_val = expected[i]; float actual_val = actual[i]; float diff = expected_val - actual_val; + float abs_diff = fabsf(diff); + float scale = fabsf(expected_val); + float abs_actual = fabsf(actual_val); + if (abs_actual > scale) { + scale = abs_actual; + } + float tolerance = FLOAT_ABS_TOL + FLOAT_REL_TOL * scale; - if ((diff < -1e-4) || (diff > 1e-4) || isnan(diff)) { + if ((abs_diff > tolerance) || isnan(diff)) { local_err_count += 1; printf("Expected: %10.6f ", expected_val); @@ -125,39 +136,38 @@ int main(void) { compbuf = DeeployNetwork_outputs[buf]; } - if (ISOUTPUTFLOAT) { - float_error_count = 0; - float_compare_args.expected = testOutputVector[buf]; - float_compare_args.actual = compbuf; - float_compare_args.num_elements = - DeeployNetwork_outputs_bytes[buf] / sizeof(float); - float_compare_args.output_buf_index = buf; - float_compare_args.err_count = &float_error_count; - - pi_cluster_task(&cluster_task, CompareFloatOnCluster, - &float_compare_args); - cluster_task.stack_size = MAINSTACKSIZE; - cluster_task.slave_stack_size = SLAVESTACKSIZE; - pi_cluster_send_task_to_cl(&cluster_dev, &cluster_task); - - tot_err += float_error_count; - } else { - - for (uint32_t i = 0; - i < DeeployNetwork_outputs_bytes[buf] / sizeof(OUTPUTTYPE); i++) { - OUTPUTTYPE expected = ((OUTPUTTYPE *)testOutputVector[buf])[i]; - OUTPUTTYPE actual = ((OUTPUTTYPE *)compbuf)[i]; - int32_t error = expected - actual; - OUTPUTTYPE diff = (OUTPUTTYPE)(error < 0 ? -error : error); - - if (diff) { - tot_err += 1; - printf("Expected: %4d ", expected); - printf("Actual: %4d ", actual); - printf("Diff: %4d at Index %12u in Output %u\r\n", diff, i, buf); - } +#if ISOUTPUTFLOAT == 1 + float_error_count = 0; + float_compare_args.expected = testOutputVector[buf]; + float_compare_args.actual = compbuf; + float_compare_args.num_elements = + DeeployNetwork_outputs_bytes[buf] / sizeof(float); + float_compare_args.output_buf_index = buf; + float_compare_args.err_count = &float_error_count; + + pi_cluster_task(&cluster_task, CompareFloatOnCluster, &float_compare_args); + cluster_task.stack_size = MAINSTACKSIZE; + cluster_task.slave_stack_size = SLAVESTACKSIZE; + pi_cluster_send_task_to_cl(&cluster_dev, &cluster_task); + + tot_err += float_error_count; +#else + + for (uint32_t i = 0; + i < DeeployNetwork_outputs_bytes[buf] / sizeof(OUTPUTTYPE); i++) { + OUTPUTTYPE expected = ((OUTPUTTYPE *)testOutputVector[buf])[i]; + OUTPUTTYPE actual = ((OUTPUTTYPE *)compbuf)[i]; + int32_t error = expected - actual; + OUTPUTTYPE diff = (OUTPUTTYPE)(error < 0 ? -error : error); + + if (diff) { + tot_err += 1; + printf("Expected: %4d ", expected); + printf("Actual: %4d ", actual); + printf("Diff: %4d at Index %12u in Output %u\r\n", diff, i, buf); } } +#endif if ((uint32_t)DeeployNetwork_outputs[buf] < 0x1000000) { pi_l2_free(compbuf, (int)DeeployNetwork_outputs_bytes[buf]); } @@ -167,4 +177,4 @@ int main(void) { printf("Errors: %u out of %u \r\n", tot_err, tot_tested); return (int)tot_err; -} \ No newline at end of file +} diff --git a/DeeployTest/Platforms/Siracusa/src/deeploytest.c b/DeeployTest/Platforms/Siracusa/src/deeploytest.c index 11d889e48d..1252edd518 100644 --- a/DeeployTest/Platforms/Siracusa/src/deeploytest.c +++ b/DeeployTest/Platforms/Siracusa/src/deeploytest.c @@ -4,6 +4,8 @@ * SPDX-License-Identifier: Apache-2.0 */ +#include + #include "CycleCounter.h" #include "Network.h" #include "dory_mem.h" @@ -13,6 +15,8 @@ #define MAINSTACKSIZE 8000 #define SLAVESTACKSIZE 3800 +#define FLOAT_ABS_TOL 1e-4f +#define FLOAT_REL_TOL 1e-5f struct pi_device cluster_dev; @@ -40,8 +44,15 @@ void CompareFloatOnCluster(void *args) { float expected_val = expected[i]; float actual_val = actual[i]; float diff = expected_val - actual_val; + float abs_diff = fabsf(diff); + float scale = fabsf(expected_val); + float abs_actual = fabsf(actual_val); + if (abs_actual > scale) { + scale = abs_actual; + } + float tolerance = FLOAT_ABS_TOL + FLOAT_REL_TOL * scale; - if ((diff < -1e-4) || (diff > 1e-4) || isnan(diff)) { + if ((abs_diff > tolerance) || isnan(diff)) { local_err_count += 1; printf("Expected: %10.6f ", expected_val); @@ -125,39 +136,38 @@ int main(void) { compbuf = DeeployNetwork_outputs[buf]; } - if (ISOUTPUTFLOAT) { - float_error_count = 0; - float_compare_args.expected = testOutputVector[buf]; - float_compare_args.actual = compbuf; - float_compare_args.num_elements = - DeeployNetwork_outputs_bytes[buf] / sizeof(float); - float_compare_args.output_buf_index = buf; - float_compare_args.err_count = &float_error_count; - - pi_cluster_task(&cluster_task, CompareFloatOnCluster, - &float_compare_args); - cluster_task.stack_size = MAINSTACKSIZE; - cluster_task.slave_stack_size = SLAVESTACKSIZE; - pi_cluster_send_task_to_cl(&cluster_dev, &cluster_task); - - tot_err += float_error_count; - } else { - - for (uint32_t i = 0; - i < DeeployNetwork_outputs_bytes[buf] / sizeof(OUTPUTTYPE); i++) { - OUTPUTTYPE expected = ((OUTPUTTYPE *)testOutputVector[buf])[i]; - OUTPUTTYPE actual = ((OUTPUTTYPE *)compbuf)[i]; - int32_t error = expected - actual; - OUTPUTTYPE diff = (OUTPUTTYPE)(error < 0 ? -error : error); - - if (diff) { - tot_err += 1; - printf("Expected: %4d ", expected); - printf("Actual: %4d ", actual); - printf("Diff: %4d at Index %12u in Output %u\r\n", diff, i, buf); - } +#if ISOUTPUTFLOAT == 1 + float_error_count = 0; + float_compare_args.expected = testOutputVector[buf]; + float_compare_args.actual = compbuf; + float_compare_args.num_elements = + DeeployNetwork_outputs_bytes[buf] / sizeof(float); + float_compare_args.output_buf_index = buf; + float_compare_args.err_count = &float_error_count; + + pi_cluster_task(&cluster_task, CompareFloatOnCluster, &float_compare_args); + cluster_task.stack_size = MAINSTACKSIZE; + cluster_task.slave_stack_size = SLAVESTACKSIZE; + pi_cluster_send_task_to_cl(&cluster_dev, &cluster_task); + + tot_err += float_error_count; +#else + + for (uint32_t i = 0; + i < DeeployNetwork_outputs_bytes[buf] / sizeof(OUTPUTTYPE); i++) { + OUTPUTTYPE expected = ((OUTPUTTYPE *)testOutputVector[buf])[i]; + OUTPUTTYPE actual = ((OUTPUTTYPE *)compbuf)[i]; + int32_t error = expected - actual; + OUTPUTTYPE diff = (OUTPUTTYPE)(error < 0 ? -error : error); + + if (diff) { + tot_err += 1; + printf("Expected: %4d ", expected); + printf("Actual: %4d ", actual); + printf("Diff: %4d at Index %12u in Output %u\r\n", diff, i, buf); } } +#endif if ((uint32_t)DeeployNetwork_outputs[buf] < 0x1000000) { pi_l2_free(compbuf, (int)DeeployNetwork_outputs_bytes[buf]); } @@ -167,4 +177,4 @@ int main(void) { printf("Errors: %u out of %u \r\n", tot_err, tot_tested); return (int)tot_err; -} \ No newline at end of file +} diff --git a/DeeployTest/Tests/Kernels/FP32/Conv2D/inputs.npz b/DeeployTest/Tests/Kernels/FP32/Conv2D/inputs.npz new file mode 100644 index 0000000000..54a2d380c8 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/Conv2D/inputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/Conv2D/network.onnx b/DeeployTest/Tests/Kernels/FP32/Conv2D/network.onnx new file mode 100644 index 0000000000..87846a20cd --- /dev/null +++ b/DeeployTest/Tests/Kernels/FP32/Conv2D/network.onnx @@ -0,0 +1,26 @@ +pytorch2.5.1:¿ +— +input + conv.weight + conv.biasoutput +/conv/Conv"Conv* + dilations@@ * +group * + kernel_shape@@ * +pads@@@@ * +strides@@  +main_graph*=B conv.weightJ$V#»(7>dwŒ¾V3{¾Nw¾æ·=«iØ»ßQ‡>–Tò¼*B conv.biasJf¤´=Z +input + + + + + +b +output + + + + + +B \ No newline at end of file diff --git a/DeeployTest/Tests/Kernels/FP32/Conv2D/outputs.npz b/DeeployTest/Tests/Kernels/FP32/Conv2D/outputs.npz new file mode 100644 index 0000000000..af275c1945 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/Conv2D/outputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/ConvTranspose2D/inputs.npz b/DeeployTest/Tests/Kernels/FP32/ConvTranspose2D/inputs.npz new file mode 100644 index 0000000000..ec6380bb46 Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/ConvTranspose2D/inputs.npz differ diff --git a/DeeployTest/Tests/Kernels/FP32/ConvTranspose2D/network.onnx b/DeeployTest/Tests/Kernels/FP32/ConvTranspose2D/network.onnx new file mode 100644 index 0000000000..3c7e61866d Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/ConvTranspose2D/network.onnx differ diff --git a/DeeployTest/Tests/Kernels/FP32/ConvTranspose2D/outputs.npz b/DeeployTest/Tests/Kernels/FP32/ConvTranspose2D/outputs.npz new file mode 100644 index 0000000000..229a14c1ca Binary files /dev/null and b/DeeployTest/Tests/Kernels/FP32/ConvTranspose2D/outputs.npz differ diff --git a/DeeployTest/Tests/Models/Autoencoder2D/GMM/inputs.npz b/DeeployTest/Tests/Models/Autoencoder2D/GMM/inputs.npz new file mode 100644 index 0000000000..40e2d5e9f2 Binary files /dev/null and b/DeeployTest/Tests/Models/Autoencoder2D/GMM/inputs.npz differ diff --git a/DeeployTest/Tests/Models/Autoencoder2D/GMM/network.onnx b/DeeployTest/Tests/Models/Autoencoder2D/GMM/network.onnx new file mode 100644 index 0000000000..12800274dd Binary files /dev/null and b/DeeployTest/Tests/Models/Autoencoder2D/GMM/network.onnx differ diff --git a/DeeployTest/Tests/Models/Autoencoder2D/GMM/outputs.npz b/DeeployTest/Tests/Models/Autoencoder2D/GMM/outputs.npz new file mode 100644 index 0000000000..a774bae2e7 Binary files /dev/null and b/DeeployTest/Tests/Models/Autoencoder2D/GMM/outputs.npz differ diff --git a/DeeployTest/Tests/Models/Autoencoder2D/MicroBlocks/EncoderPCMR/inputs.npz b/DeeployTest/Tests/Models/Autoencoder2D/MicroBlocks/EncoderPCMR/inputs.npz new file mode 100644 index 0000000000..feccff239c Binary files /dev/null and b/DeeployTest/Tests/Models/Autoencoder2D/MicroBlocks/EncoderPCMR/inputs.npz differ diff --git a/DeeployTest/Tests/Models/Autoencoder2D/MicroBlocks/EncoderPCMR/network.onnx b/DeeployTest/Tests/Models/Autoencoder2D/MicroBlocks/EncoderPCMR/network.onnx new file mode 100644 index 0000000000..1841e24570 Binary files /dev/null and b/DeeployTest/Tests/Models/Autoencoder2D/MicroBlocks/EncoderPCMR/network.onnx differ diff --git a/DeeployTest/Tests/Models/Autoencoder2D/MicroBlocks/EncoderPCMR/outputs.npz b/DeeployTest/Tests/Models/Autoencoder2D/MicroBlocks/EncoderPCMR/outputs.npz new file mode 100644 index 0000000000..281bb9c67f Binary files /dev/null and b/DeeployTest/Tests/Models/Autoencoder2D/MicroBlocks/EncoderPCMR/outputs.npz differ diff --git a/DeeployTest/Tests/Models/Autoencoder2D/MicroBlocks/EncoderPCMRLRes/inputs.npz b/DeeployTest/Tests/Models/Autoencoder2D/MicroBlocks/EncoderPCMRLRes/inputs.npz new file mode 100644 index 0000000000..bf9d7a9807 Binary files /dev/null and b/DeeployTest/Tests/Models/Autoencoder2D/MicroBlocks/EncoderPCMRLRes/inputs.npz differ diff --git a/DeeployTest/Tests/Models/Autoencoder2D/MicroBlocks/EncoderPCMRLRes/network.onnx b/DeeployTest/Tests/Models/Autoencoder2D/MicroBlocks/EncoderPCMRLRes/network.onnx new file mode 100644 index 0000000000..bc5a018acc Binary files /dev/null and b/DeeployTest/Tests/Models/Autoencoder2D/MicroBlocks/EncoderPCMRLRes/network.onnx differ diff --git a/DeeployTest/Tests/Models/Autoencoder2D/MicroBlocks/EncoderPCMRLRes/outputs.npz b/DeeployTest/Tests/Models/Autoencoder2D/MicroBlocks/EncoderPCMRLRes/outputs.npz new file mode 100644 index 0000000000..34a6da1215 Binary files /dev/null and b/DeeployTest/Tests/Models/Autoencoder2D/MicroBlocks/EncoderPCMRLRes/outputs.npz differ diff --git a/DeeployTest/Tests/Models/Autoencoder2D/MicroBlocks/EncoderPCMRLTra/inputs.npz b/DeeployTest/Tests/Models/Autoencoder2D/MicroBlocks/EncoderPCMRLTra/inputs.npz new file mode 100644 index 0000000000..fc5c4f1431 Binary files /dev/null and b/DeeployTest/Tests/Models/Autoencoder2D/MicroBlocks/EncoderPCMRLTra/inputs.npz differ diff --git a/DeeployTest/Tests/Models/Autoencoder2D/MicroBlocks/EncoderPCMRLTra/network.onnx b/DeeployTest/Tests/Models/Autoencoder2D/MicroBlocks/EncoderPCMRLTra/network.onnx new file mode 100644 index 0000000000..44db2d394f Binary files /dev/null and b/DeeployTest/Tests/Models/Autoencoder2D/MicroBlocks/EncoderPCMRLTra/network.onnx differ diff --git a/DeeployTest/Tests/Models/Autoencoder2D/MicroBlocks/EncoderPCMRLTra/outputs.npz b/DeeployTest/Tests/Models/Autoencoder2D/MicroBlocks/EncoderPCMRLTra/outputs.npz new file mode 100644 index 0000000000..5e144188df Binary files /dev/null and b/DeeployTest/Tests/Models/Autoencoder2D/MicroBlocks/EncoderPCMRLTra/outputs.npz differ diff --git a/DeeployTest/Tests/Models/Autoencoder2D/MicroBlocks/EncoderPCMRLin/inputs.npz b/DeeployTest/Tests/Models/Autoencoder2D/MicroBlocks/EncoderPCMRLin/inputs.npz new file mode 100644 index 0000000000..e35f7271e5 Binary files /dev/null and b/DeeployTest/Tests/Models/Autoencoder2D/MicroBlocks/EncoderPCMRLin/inputs.npz differ diff --git a/DeeployTest/Tests/Models/Autoencoder2D/MicroBlocks/EncoderPCMRLin/network.onnx b/DeeployTest/Tests/Models/Autoencoder2D/MicroBlocks/EncoderPCMRLin/network.onnx new file mode 100644 index 0000000000..edc86b2c6a Binary files /dev/null and b/DeeployTest/Tests/Models/Autoencoder2D/MicroBlocks/EncoderPCMRLin/network.onnx differ diff --git a/DeeployTest/Tests/Models/Autoencoder2D/MicroBlocks/EncoderPCMRLin/outputs.npz b/DeeployTest/Tests/Models/Autoencoder2D/MicroBlocks/EncoderPCMRLin/outputs.npz new file mode 100644 index 0000000000..00b8a73177 Binary files /dev/null and b/DeeployTest/Tests/Models/Autoencoder2D/MicroBlocks/EncoderPCMRLin/outputs.npz differ diff --git a/DeeployTest/Tests/Models/Autoencoder2D/MicroBlocks/Encoder_mini/inputs.npz b/DeeployTest/Tests/Models/Autoencoder2D/MicroBlocks/Encoder_mini/inputs.npz new file mode 100644 index 0000000000..2eaecc5698 Binary files /dev/null and b/DeeployTest/Tests/Models/Autoencoder2D/MicroBlocks/Encoder_mini/inputs.npz differ diff --git a/DeeployTest/Tests/Models/Autoencoder2D/MicroBlocks/Encoder_mini/network.onnx b/DeeployTest/Tests/Models/Autoencoder2D/MicroBlocks/Encoder_mini/network.onnx new file mode 100644 index 0000000000..5cef598d23 Binary files /dev/null and b/DeeployTest/Tests/Models/Autoencoder2D/MicroBlocks/Encoder_mini/network.onnx differ diff --git a/DeeployTest/Tests/Models/Autoencoder2D/MicroBlocks/Encoder_mini/outputs.npz b/DeeployTest/Tests/Models/Autoencoder2D/MicroBlocks/Encoder_mini/outputs.npz new file mode 100644 index 0000000000..74e3b6eccb Binary files /dev/null and b/DeeployTest/Tests/Models/Autoencoder2D/MicroBlocks/Encoder_mini/outputs.npz differ diff --git a/DeeployTest/Tests/Models/Autoencoder2D/MicroBlocks/EnoderCMR/inputs.npz b/DeeployTest/Tests/Models/Autoencoder2D/MicroBlocks/EnoderCMR/inputs.npz new file mode 100644 index 0000000000..54a2d380c8 Binary files /dev/null and b/DeeployTest/Tests/Models/Autoencoder2D/MicroBlocks/EnoderCMR/inputs.npz differ diff --git a/DeeployTest/Tests/Models/Autoencoder2D/MicroBlocks/EnoderCMR/network.onnx b/DeeployTest/Tests/Models/Autoencoder2D/MicroBlocks/EnoderCMR/network.onnx new file mode 100644 index 0000000000..3b485d2c21 Binary files /dev/null and b/DeeployTest/Tests/Models/Autoencoder2D/MicroBlocks/EnoderCMR/network.onnx differ diff --git a/DeeployTest/Tests/Models/Autoencoder2D/MicroBlocks/EnoderCMR/outputs.npz b/DeeployTest/Tests/Models/Autoencoder2D/MicroBlocks/EnoderCMR/outputs.npz new file mode 100644 index 0000000000..77bcc55404 Binary files /dev/null and b/DeeployTest/Tests/Models/Autoencoder2D/MicroBlocks/EnoderCMR/outputs.npz differ diff --git a/DeeployTest/Tests/Models/Autoencoder2D/STD/inputs.npz b/DeeployTest/Tests/Models/Autoencoder2D/STD/inputs.npz new file mode 100644 index 0000000000..fe562600e8 Binary files /dev/null and b/DeeployTest/Tests/Models/Autoencoder2D/STD/inputs.npz differ diff --git a/DeeployTest/Tests/Models/Autoencoder2D/STD/network.onnx b/DeeployTest/Tests/Models/Autoencoder2D/STD/network.onnx new file mode 100644 index 0000000000..2d5cc658d2 Binary files /dev/null and b/DeeployTest/Tests/Models/Autoencoder2D/STD/network.onnx differ diff --git a/DeeployTest/Tests/Models/Autoencoder2D/STD/outputs.npz b/DeeployTest/Tests/Models/Autoencoder2D/STD/outputs.npz new file mode 100644 index 0000000000..79dcb5faeb Binary files /dev/null and b/DeeployTest/Tests/Models/Autoencoder2D/STD/outputs.npz differ diff --git a/DeeployTest/Tests/Models/GMM/inputs.npz b/DeeployTest/Tests/Models/GMM/inputs.npz new file mode 100644 index 0000000000..170ef9d86e Binary files /dev/null and b/DeeployTest/Tests/Models/GMM/inputs.npz differ diff --git a/DeeployTest/Tests/Models/GMM/network.onnx b/DeeployTest/Tests/Models/GMM/network.onnx new file mode 100644 index 0000000000..83b97d257b Binary files /dev/null and b/DeeployTest/Tests/Models/GMM/network.onnx differ diff --git a/DeeployTest/Tests/Models/GMM/outputs.npz b/DeeployTest/Tests/Models/GMM/outputs.npz new file mode 100644 index 0000000000..8793bae03e Binary files /dev/null and b/DeeployTest/Tests/Models/GMM/outputs.npz differ diff --git a/DeeployTest/testUtils/deeployRunner.py b/DeeployTest/testUtils/deeployRunner.py index 71b056e9df..b7d2488608 100644 --- a/DeeployTest/testUtils/deeployRunner.py +++ b/DeeployTest/testUtils/deeployRunner.py @@ -220,6 +220,10 @@ def create_config_from_args(args: argparse.Namespace, gen_args_list.extend(args.input_offset_map) if tiling: + if hasattr(args, 'cores'): + gen_args_list.append(f"--cores={args.cores}") + elif hasattr(args, 'num_cores'): + gen_args_list.append(f"--cores={args.num_cores}") if hasattr(args, 'defaultMemLevel') and args.defaultMemLevel: gen_args_list.append(f"--defaultMemLevel={args.defaultMemLevel}") if hasattr(args, 'doublebuffer') and args.doublebuffer: diff --git a/DeeployTest/test_siracusa_config.py b/DeeployTest/test_siracusa_config.py index 8fa105d9f4..ca3a78aa16 100644 --- a/DeeployTest/test_siracusa_config.py +++ b/DeeployTest/test_siracusa_config.py @@ -19,6 +19,7 @@ "Kernels/FP32/Conv/Regular_2D_Bias", "Kernels/FP32/Conv/Regular_2D_NoBias", "Kernels/FP32/Conv/Regular_2D_ZeroValuedBias", + "Kernels/FP32/ConvTranspose2D", "Kernels/FP32/GEMM/Regular", "Kernels/FP32/MatMul", "Kernels/FP32/MaxPool/Regular_2D", diff --git a/DeeployTest/test_siracusa_tiled_config.py b/DeeployTest/test_siracusa_tiled_config.py index a687d9a489..f85cd888bc 100644 --- a/DeeployTest/test_siracusa_tiled_config.py +++ b/DeeployTest/test_siracusa_tiled_config.py @@ -19,6 +19,7 @@ "Kernels/FP32/Conv/Regular_2D_Bias": [6600], "Kernels/FP32/Conv/Regular_2D_NoBias": [1600], "Kernels/FP32/Conv/Regular_2D_ZeroValuedBias": [6600], + "Kernels/FP32/ConvTranspose2D": [2000], "Kernels/FP32/GEMM/Regular": [8000], "Kernels/FP32/MatMul": [2000], "Kernels/FP32/MaxPool/Regular_2D": [2000], @@ -66,6 +67,7 @@ "Kernels/FP32/Conv/Regular_2D_Bias": [8800], "Kernels/FP32/Conv/Regular_2D_NoBias": [2000], "Kernels/FP32/Conv/Regular_2D_ZeroValuedBias": [8800], + "Kernels/FP32/ConvTranspose2D": [4000], "Kernels/FP32/GEMM/Regular": [8000], "Kernels/FP32/MatMul": [5000], "Kernels/FP32/MaxPool/Regular_2D": [5000], diff --git a/TargetLibraries/Generic/inc/DeeployBasicMath.h b/TargetLibraries/Generic/inc/DeeployBasicMath.h index 22081701a3..2d8aff54c0 100644 --- a/TargetLibraries/Generic/inc/DeeployBasicMath.h +++ b/TargetLibraries/Generic/inc/DeeployBasicMath.h @@ -34,6 +34,7 @@ #include "kernel/BatchNorm.h" #include "kernel/ConvTranspose1d_fp32.h" +#include "kernel/ConvTranspose2d_fp32.h" #include "kernel/Convolution.h" #include "kernel/DWConvolution.h" #include "kernel/Div.h" @@ -48,6 +49,7 @@ #include "kernel/RQDiv.h" #include "kernel/RQGELU.h" #include "kernel/RQHardswish.h" +#include "kernel/ReduceLogSumExp.h" #include "kernel/Relu.h" #include "kernel/RequantShift.h" #include "kernel/Softmax.h" diff --git a/TargetLibraries/Generic/inc/kernel/BatchNorm.h b/TargetLibraries/Generic/inc/kernel/BatchNorm.h index 72703f5fe2..e7af458f96 100644 --- a/TargetLibraries/Generic/inc/kernel/BatchNorm.h +++ b/TargetLibraries/Generic/inc/kernel/BatchNorm.h @@ -11,6 +11,6 @@ void BatchNorm_fp32(const float32_t *input, const float32_t *gamma, const float32_t *beta, const float32_t *mean, const float32_t *var, float32_t *output, int N, int C, - int L); + int L, float epsilon, int channels_first); #endif // BATCHNORM_H diff --git a/TargetLibraries/Generic/inc/kernel/ConvTranspose2d_fp32.h b/TargetLibraries/Generic/inc/kernel/ConvTranspose2d_fp32.h new file mode 100644 index 0000000000..67a62ef4e5 --- /dev/null +++ b/TargetLibraries/Generic/inc/kernel/ConvTranspose2d_fp32.h @@ -0,0 +1,18 @@ +// SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna +// +// SPDX-License-Identifier: Apache-2.0 + +#ifndef CONV_TRANSPOSE2D_FP32_H +#define CONV_TRANSPOSE2D_FP32_H + +#include +#include + +void ConvTranspose2d_fp32(const float32_t *input, uint32_t C_in, uint32_t H_in, + uint32_t W_in, const float32_t *weight, + uint32_t C_out, uint32_t K_h, uint32_t K_w, + uint32_t stride_h, uint32_t stride_w, + const float32_t *bias, bool has_bias, + float32_t *output, uint32_t H_out, uint32_t W_out); + +#endif // CONV_TRANSPOSE2D_FP32_H diff --git a/TargetLibraries/Generic/inc/kernel/ReduceLogSumExp.h b/TargetLibraries/Generic/inc/kernel/ReduceLogSumExp.h new file mode 100644 index 0000000000..3e40d5cd2c --- /dev/null +++ b/TargetLibraries/Generic/inc/kernel/ReduceLogSumExp.h @@ -0,0 +1,16 @@ +/* + * SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#ifndef __DEEPLOY_BASIC_MATH_REDUCELOGSUMEXP_KERNEL_HEADER_ +#define __DEEPLOY_BASIC_MATH_REDUCELOGSUMEXP_KERNEL_HEADER_ + +#include "DeeployBasicMath.h" + +void ReduceLogSumExp_fp32_fp32(float32_t *input, float32_t *output, + uint32_t outer_size, uint32_t axis_length, + uint32_t inner_size); + +#endif // __DEEPLOY_BASIC_MATH_REDUCELOGSUMEXP_KERNEL_HEADER_ diff --git a/TargetLibraries/Generic/src/BatchNorm_fp32.c b/TargetLibraries/Generic/src/BatchNorm_fp32.c index 1e94d63dbb..fd0e40e79d 100644 --- a/TargetLibraries/Generic/src/BatchNorm_fp32.c +++ b/TargetLibraries/Generic/src/BatchNorm_fp32.c @@ -8,8 +8,7 @@ void BatchNorm_fp32(const float32_t *input, const float32_t *gamma, const float32_t *beta, const float32_t *mean, const float32_t *var, float32_t *output, int N, int C, - int L) { - const float epsilon = 1e-5f; + int L, float epsilon, int channels_first) { #pragma omp parallel for for (int c = 0; c < C; ++c) { float32_t c_mean = mean[c]; @@ -19,7 +18,12 @@ void BatchNorm_fp32(const float32_t *input, const float32_t *gamma, float32_t denom = sqrtf(c_var + epsilon); for (int n = 0; n < N; ++n) { for (int l = 0; l < L; ++l) { - int index = n * C * L + c * L + l; + int index; + if (channels_first) { + index = n * C * L + c * L + l; + } else { + index = n * C * L + l * C + c; + } float32_t x = input[index]; float32_t norm = (x - c_mean) / denom; output[index] = c_gamma * norm + c_beta; diff --git a/TargetLibraries/Generic/src/ConvTranspose2d_fp32.c b/TargetLibraries/Generic/src/ConvTranspose2d_fp32.c new file mode 100644 index 0000000000..51efc078c7 --- /dev/null +++ b/TargetLibraries/Generic/src/ConvTranspose2d_fp32.c @@ -0,0 +1,60 @@ +// SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna +// +// SPDX-License-Identifier: Apache-2.0 + +#include "DeeployBasicMath.h" + +void ConvTranspose2d_fp32(const float32_t *input, uint32_t C_in, uint32_t H_in, + uint32_t W_in, const float32_t *weight, + uint32_t C_out, uint32_t K_h, uint32_t K_w, + uint32_t stride_h, uint32_t stride_w, + const float32_t *bias, bool has_bias, + float32_t *output, uint32_t H_out, uint32_t W_out) { + /* + input: [C_in, H_in, W_in] + weight: [C_in, C_out, K_h, K_w] + output: [C_out, H_out, W_out] + bias: [C_out] optionally + */ + + for (uint32_t c = 0; c < C_out; ++c) { + for (uint32_t h = 0; h < H_out; ++h) { + for (uint32_t w = 0; w < W_out; ++w) { + output[(c * H_out + h) * W_out + w] = 0.0f; + } + } + } + + for (uint32_t cout = 0; cout < C_out; ++cout) { + for (uint32_t cin = 0; cin < C_in; ++cin) { + for (uint32_t h_in = 0; h_in < H_in; ++h_in) { + for (uint32_t w_in = 0; w_in < W_in; ++w_in) { + float32_t val = input[(cin * H_in + h_in) * W_in + w_in]; + for (uint32_t kh = 0; kh < K_h; ++kh) { + uint32_t h_out = h_in * stride_h + kh; + if (h_out >= H_out) { + continue; + } + for (uint32_t kw = 0; kw < K_w; ++kw) { + uint32_t w_out = w_in * stride_w + kw; + if (w_out >= W_out) { + continue; + } + float32_t wgt = + weight[((cin * C_out + cout) * K_h + kh) * K_w + kw]; + output[(cout * H_out + h_out) * W_out + w_out] += val * wgt; + } + } + } + } + } + + if (has_bias) { + for (uint32_t h = 0; h < H_out; ++h) { + for (uint32_t w = 0; w < W_out; ++w) { + output[(cout * H_out + h) * W_out + w] += bias[cout]; + } + } + } + } +} diff --git a/TargetLibraries/Generic/src/ReduceLogSumExp_fp32.c b/TargetLibraries/Generic/src/ReduceLogSumExp_fp32.c new file mode 100644 index 0000000000..85ae40a60d --- /dev/null +++ b/TargetLibraries/Generic/src/ReduceLogSumExp_fp32.c @@ -0,0 +1,36 @@ +/* + * SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "DeeployBasicMath.h" +#include + +void ReduceLogSumExp_fp32_fp32(float32_t *input, float32_t *output, + uint32_t outer_size, uint32_t axis_length, + uint32_t inner_size) { + + for (uint32_t outer_idx = 0; outer_idx < outer_size; ++outer_idx) { + for (uint32_t inner_idx = 0; inner_idx < inner_size; ++inner_idx) { + float32_t max_val = -INFINITY; + + for (uint32_t axis_idx = 0; axis_idx < axis_length; ++axis_idx) { + uint32_t input_idx = + (outer_idx * axis_length + axis_idx) * inner_size + inner_idx; + if (input[input_idx] > max_val) { + max_val = input[input_idx]; + } + } + + float32_t sum_exp = 0.0f; + for (uint32_t axis_idx = 0; axis_idx < axis_length; ++axis_idx) { + uint32_t input_idx = + (outer_idx * axis_length + axis_idx) * inner_size + inner_idx; + sum_exp += expf(input[input_idx] - max_val); + } + + output[outer_idx * inner_size + inner_idx] = logf(sum_exp) + max_val; + } + } +} diff --git a/TargetLibraries/PULPOpen/inc/DeeployPULPMath.h b/TargetLibraries/PULPOpen/inc/DeeployPULPMath.h index f6e8308c97..4800e6912d 100644 --- a/TargetLibraries/PULPOpen/inc/DeeployPULPMath.h +++ b/TargetLibraries/PULPOpen/inc/DeeployPULPMath.h @@ -29,9 +29,11 @@ #include "kernel/Matmul.h" #include "kernel/MaxPool.h" #include "kernel/RQiHardswish.h" +#include "kernel/Relu.h" #include "kernel/RequantShift.h" #include "kernel/Softmax.h" #include "kernel/UniformRequantShift.h" +#include "kernel/gemm.h" #include "kernel/gemv.h" #include "kernel/iRMSnorm.h" diff --git a/TargetLibraries/PULPOpen/inc/kernel/Conv.h b/TargetLibraries/PULPOpen/inc/kernel/Conv.h index 3ebab54a0b..b5f0e57ea8 100644 --- a/TargetLibraries/PULPOpen/inc/kernel/Conv.h +++ b/TargetLibraries/PULPOpen/inc/kernel/Conv.h @@ -35,4 +35,14 @@ void PULP_DW_Conv2d_Im2Col_fp32_fp32_fp32_HWC( uint32_t pad_left, uint32_t pad_right, float32_t *__restrict__ pContextBuffer); -#endif // __DEEPLOY_MATH_CONV_KERNEL_HEADER_ \ No newline at end of file +void PULP_ConvTranspose2d_fp32_fp32_fp32_CHW( + const float32_t *__restrict__ pSrcA, uint32_t C_in, uint32_t H_in, + uint32_t W_in, const float32_t *__restrict__ pSrcB, uint32_t C_out, + uint32_t groups, uint32_t K_h, uint32_t K_w, uint32_t stride_h, + uint32_t stride_w, uint32_t dilation_h, uint32_t dilation_w, + uint32_t pad_top, uint32_t pad_bottom, uint32_t pad_left, + uint32_t pad_right, const float32_t *__restrict__ pSrcBias, + const bool has_bias, float32_t *__restrict__ pDstC, uint32_t H_out, + uint32_t W_out); + +#endif // __DEEPLOY_MATH_CONV_KERNEL_HEADER_ diff --git a/TargetLibraries/PULPOpen/inc/kernel/MaxPool.h b/TargetLibraries/PULPOpen/inc/kernel/MaxPool.h index b37487439f..021cb5f9b9 100644 --- a/TargetLibraries/PULPOpen/inc/kernel/MaxPool.h +++ b/TargetLibraries/PULPOpen/inc/kernel/MaxPool.h @@ -10,10 +10,10 @@ #include "DeeployPULPMath.h" void PULP_MaxPool2d_fp32_fp32_HWC(const float32_t *__restrict__ pSrcA, - uint32_t W, uint32_t H, uint32_t C, + uint32_t H, uint32_t W, uint32_t C, uint32_t Q, uint32_t P, uint32_t SQ, uint32_t SP, float32_t *__restrict__ pDstC, uint32_t pad_top, uint32_t pad_bottom, uint32_t pad_left, uint32_t pad_right); -#endif // __DEEPLOY_MATH_MAXPOOL_KERNEL_HEADER_ \ No newline at end of file +#endif // __DEEPLOY_MATH_MAXPOOL_KERNEL_HEADER_ diff --git a/TargetLibraries/PULPOpen/src/ConvTranspose_fp32.c b/TargetLibraries/PULPOpen/src/ConvTranspose_fp32.c new file mode 100644 index 0000000000..8e46708273 --- /dev/null +++ b/TargetLibraries/PULPOpen/src/ConvTranspose_fp32.c @@ -0,0 +1,91 @@ +/* + * SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna + * + * SPDX-License-Identifier: Apache-2.0 + */ + +#include "DeeployPULPMath.h" +#include "pmsis.h" + +__attribute__((noinline, optnone)) void PULP_ConvTranspose2d_fp32_fp32_fp32_CHW( + const float32_t *__restrict__ pSrcA, uint32_t C_in, uint32_t H_in, + uint32_t W_in, const float32_t *__restrict__ pSrcB, uint32_t C_out, + uint32_t groups, uint32_t K_h, uint32_t K_w, uint32_t stride_h, + uint32_t stride_w, uint32_t dilation_h, uint32_t dilation_w, + uint32_t pad_top, uint32_t pad_bottom, uint32_t pad_left, + uint32_t pad_right, const float32_t *__restrict__ pSrcBias, + const bool has_bias, float32_t *__restrict__ pDstC, uint32_t H_out, + uint32_t W_out) { + + (void)pad_bottom; + (void)pad_right; + + int8_t core_id = pi_core_id(); + int8_t log2Core = LOG2(NUM_CORES); + + uint16_t ch_out_chunk = + (C_out >> log2Core) + ((C_out & (NUM_CORES - 1)) != 0); + uint16_t ch_out_start = MIN(ch_out_chunk * core_id, C_out); + uint16_t ch_out_stop = MIN(ch_out_start + ch_out_chunk, C_out); + uint16_t ch_out_count = ch_out_stop - ch_out_start; + + if (ch_out_count == 0) { + return; + } + + uint32_t output_plane = H_out * W_out; + + for (uint32_t cout = ch_out_start; cout < ch_out_stop; ++cout) { + float32_t init = has_bias ? pSrcBias[cout] : 0.0f; + float32_t *out_ptr = pDstC + cout * output_plane; + for (uint32_t idx = 0; idx < output_plane; ++idx) { + out_ptr[idx] = init; + } + } + + uint32_t ch_in_per_group = C_in / groups; + uint32_t ch_out_per_group = C_out / groups; + + for (uint32_t cout = ch_out_start; cout < ch_out_stop; ++cout) { + uint32_t group_idx = cout / ch_out_per_group; + uint32_t cout_in_group = cout % ch_out_per_group; + + for (uint32_t cin_in_group = 0; cin_in_group < ch_in_per_group; + ++cin_in_group) { + uint32_t cin = group_idx * ch_in_per_group + cin_in_group; + + for (uint32_t h_in = 0; h_in < H_in; ++h_in) { + for (uint32_t w_in = 0; w_in < W_in; ++w_in) { + float32_t val = pSrcA[(cin * H_in + h_in) * W_in + w_in]; + + for (uint32_t kh = 0; kh < K_h; ++kh) { + int32_t h_out = + (int32_t)(h_in * stride_h + kh * dilation_h) - (int32_t)pad_top; + + if (h_out < 0 || h_out >= (int32_t)H_out) { + continue; + } + + for (uint32_t kw = 0; kw < K_w; ++kw) { + int32_t w_out = (int32_t)(w_in * stride_w + kw * dilation_w) - + (int32_t)pad_left; + + if (w_out < 0 || w_out >= (int32_t)W_out) { + continue; + } + + uint32_t weight_idx = + (((cin * ch_out_per_group + cout_in_group) * K_h + kh) * + K_w) + + kw; + uint32_t out_idx = + (cout * H_out + (uint32_t)h_out) * W_out + (uint32_t)w_out; + + pDstC[out_idx] += val * pSrcB[weight_idx]; + } + } + } + } + } + } +} diff --git a/TargetLibraries/PULPOpen/src/MaxPool.c b/TargetLibraries/PULPOpen/src/MaxPool.c index 3b630b97cc..5ae8a2adb9 100644 --- a/TargetLibraries/PULPOpen/src/MaxPool.c +++ b/TargetLibraries/PULPOpen/src/MaxPool.c @@ -8,7 +8,7 @@ #include "pmsis.h" void PULP_MaxPool2d_fp32_fp32_HWC(const float32_t *__restrict__ pSrcA, - uint32_t W, uint32_t H, uint32_t C, + uint32_t H, uint32_t W, uint32_t C, uint32_t Q, uint32_t P, uint32_t SQ, uint32_t SP, float32_t *__restrict__ pDstC, uint32_t pad_top, uint32_t pad_bottom, @@ -60,4 +60,4 @@ void PULP_MaxPool2d_fp32_fp32_HWC(const float32_t *__restrict__ pSrcA, } } } -} \ No newline at end of file +}