diff --git a/.github/workflows/_runner-siracusa-redmule-tiled.yml b/.github/workflows/_runner-siracusa-redmule-tiled.yml new file mode 100644 index 0000000000..73f0c9e0a6 --- /dev/null +++ b/.github/workflows/_runner-siracusa-redmule-tiled.yml @@ -0,0 +1,45 @@ +# SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +--- +name: _runner-siracusa-redmule-tiled-sequential + +"on": + workflow_call: + inputs: + runner: + required: true + type: string + docker-image: + required: true + type: string + pytest-marker: + required: true + type: string + +jobs: + test-runner-siracusa-redmule-tiled: + runs-on: ${{ inputs.runner }} + container: + image: ${{ inputs.docker-image }} + credentials: + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + steps: + - name: Mark workspace as safe + run: git config --global --add safe.directory '*' + - name: Checkout Repo + uses: actions/checkout@v4 + with: + submodules: recursive + - name: Build Deeploy + shell: bash + run: pip install -e . + - name: Run Test + run: | + cd DeeployTest + mkdir -p /app/.ccache + export CCACHE_DIR=/app/.ccache + pytest test_platforms.py -v -n 4 -m "siracusa_redmule_tiled and ${{ inputs.pytest-marker }}" + shell: bash diff --git a/.github/workflows/ci-platform-gap9-tiled.yml b/.github/workflows/ci-platform-gap9-tiled.yml index 61cab4ea70..bd6ea822a9 100644 --- a/.github/workflows/ci-platform-gap9-tiled.yml +++ b/.github/workflows/ci-platform-gap9-tiled.yml @@ -24,12 +24,16 @@ concurrency: cancel-in-progress: true jobs: + # GAP9 CI requires access to the private ghcr.io/pulp-platform/deeploy-gap9 + # image; gate on upstream org so forks skip cleanly. 
select-env: + if: github.repository_owner == 'pulp-platform' uses: ./.github/workflows/_select-env.yml with: docker_image_deeploy: ${{ github.event.inputs.docker_image_deeploy || 'ghcr.io/pulp-platform/deeploy-gap9:devel' }} gap9-kernels-tiled-singlebuffer-L2: + if: github.repository_owner == 'pulp-platform' needs: select-env uses: ./.github/workflows/_runner-gap9-tiled.yml with: @@ -38,6 +42,7 @@ jobs: pytest-markers: "gap9_tiled and kernels and singlebuffer and l2" gap9-kernels-tiled-doublebuffer-L2: + if: github.repository_owner == 'pulp-platform' needs: select-env uses: ./.github/workflows/_runner-gap9-tiled.yml with: @@ -46,6 +51,7 @@ jobs: pytest-markers: "gap9_tiled and kernels and doublebuffer and l2" gap9-models-tiled-singlebuffer-L2: + if: github.repository_owner == 'pulp-platform' needs: select-env uses: ./.github/workflows/_runner-gap9-tiled.yml with: @@ -54,6 +60,7 @@ jobs: pytest-markers: "gap9_tiled and models and singlebuffer and l2" gap9-models-tiled-doublebuffer-L2: + if: github.repository_owner == 'pulp-platform' needs: select-env uses: ./.github/workflows/_runner-gap9-tiled.yml with: diff --git a/.github/workflows/ci-platform-gap9.yml b/.github/workflows/ci-platform-gap9.yml index 014828d6ce..fbf5398485 100644 --- a/.github/workflows/ci-platform-gap9.yml +++ b/.github/workflows/ci-platform-gap9.yml @@ -25,12 +25,16 @@ concurrency: cancel-in-progress: true jobs: + # GAP9 CI requires access to the private ghcr.io/pulp-platform/deeploy-gap9 + # image; gate on upstream org so forks skip cleanly. 
select-env: + if: github.repository_owner == 'pulp-platform' uses: ./.github/workflows/_select-env.yml with: docker_image_deeploy: ${{ github.event.inputs.docker_image_deeploy || 'ghcr.io/pulp-platform/deeploy-gap9:devel' }} gap9-kernels: + if: github.repository_owner == 'pulp-platform' needs: select-env uses: ./.github/workflows/_runner-gap9.yml with: @@ -39,6 +43,7 @@ jobs: pytest-marker: "kernels" gap9-models: + if: github.repository_owner == 'pulp-platform' needs: select-env uses: ./.github/workflows/_runner-gap9.yml with: diff --git a/.github/workflows/ci-platform-siracusa-redmule-tiled.yml b/.github/workflows/ci-platform-siracusa-redmule-tiled.yml new file mode 100644 index 0000000000..3fe8b8cb4f --- /dev/null +++ b/.github/workflows/ci-platform-siracusa-redmule-tiled.yml @@ -0,0 +1,50 @@ +# SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +--- +name: CI • Siracusa + RedMulE (Tiled) + +"on": + push: + branches: + - "**" + tags: + - "v*.*.*" + pull_request: + workflow_dispatch: + inputs: + docker_image_deeploy: + description: "Deeploy Image to use" + required: false + default: "ghcr.io/runwangdl/deeploy:redmule" + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + select-env: + uses: ./.github/workflows/_select-env.yml + with: + # RedMulE CI needs the fork's custom Docker image that bundles a + # GVSoC build with the light_redmule model. Fall back to + # runwangdl/deeploy:redmule on push/PR events (when no input is + # provided) rather than the upstream devel image. 
+ docker_image_deeploy: ${{ inputs.docker_image_deeploy || 'ghcr.io/runwangdl/deeploy:redmule' }} + + siracusa-redmule-kernels-tiled-singlebuffer-L2: + needs: select-env + uses: ./.github/workflows/_runner-siracusa-redmule-tiled.yml + with: + runner: ${{ needs.select-env.outputs.runner }} + docker-image: ${{ needs.select-env.outputs.image }} + pytest-marker: "kernels and singlebuffer and l2" + + siracusa-redmule-kernels-tiled-doublebuffer-L2: + needs: select-env + uses: ./.github/workflows/_runner-siracusa-redmule-tiled.yml + with: + runner: ${{ needs.select-env.outputs.runner }} + docker-image: ${{ needs.select-env.outputs.image }} + pytest-marker: "kernels and doublebuffer and l2" diff --git a/.github/workflows/infra-generate-documentation.yml b/.github/workflows/infra-generate-documentation.yml index de42648f63..1394805c70 100644 --- a/.github/workflows/infra-generate-documentation.yml +++ b/.github/workflows/infra-generate-documentation.yml @@ -32,12 +32,12 @@ jobs: sphinx-build docs _build - name: Prepare Multipages uses: xeratec/gh-pages-multibranch@pr/support_tags - if: ${{ github.event_name == 'push' || github.event_name == 'workflow_dispatch' }} + if: ${{ (github.event_name == 'push' || github.event_name == 'workflow_dispatch') && github.repository_owner == 'pulp-platform' }} with: directory: _build - name: Deploy to GitHub Pages uses: peaceiris/actions-gh-pages@v3 - if: ${{ github.event_name == 'push' || github.event_name == 'workflow_dispatch'}} + if: ${{ (github.event_name == 'push' || github.event_name == 'workflow_dispatch') && github.repository_owner == 'pulp-platform' }} with: publish_branch: gh-pages github_token: ${{ secrets.GITHUB_TOKEN }} diff --git a/CMakeLists.txt b/CMakeLists.txt index 0e07d64a9e..3bafd2257c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -20,8 +20,8 @@ if(TOOLCHAIN STREQUAL GCC) set(CMAKE_INTERPROCEDURAL_OPTIMIZATION TRUE) endif() -set(platform MemPool CACHE STRING "Platform (MemPool, SoftHier, QEMU, Siracusa, 
Siracusa_w_neureka, PULP-Open, GAP9, Generic, Snitch)") -set_property(CACHE platform PROPERTY STRINGS MemPool SoftHier QEMU Siracusa Siracusa_w_neureka PULP-Open GAP9 Generic Snitch) +set(platform MemPool CACHE STRING "Platform (MemPool, SoftHier, QEMU, Siracusa, Siracusa_w_neureka, Siracusa_w_redmule, PULP-Open, GAP9, Generic, Snitch)") +set_property(CACHE platform PROPERTY STRINGS MemPool SoftHier QEMU Siracusa Siracusa_w_neureka Siracusa_w_redmule PULP-Open GAP9 Generic Snitch) if(platform STREQUAL MemPool) message(STATUS "Building for platform 'MemPool'") @@ -31,6 +31,8 @@ elseif(platform STREQUAL Siracusa) message(STATUS "Building for platform 'Siracusa'") elseif(platform STREQUAL Siracusa_w_neureka) message(STATUS "Building for platform 'Siracusa_w_neureka'") +elseif(platform STREQUAL Siracusa_w_redmule) + message(STATUS "Building for platform 'Siracusa_w_redmule'") elseif(platform STREQUAL PULPOpen) message(STATUS "Building for platform 'PULP-Open'") elseif(platform STREQUAL GAP9) @@ -196,7 +198,7 @@ if(platform STREQUAL QEMU-ARM) endif() -if(platform STREQUAL Siracusa OR platform STREQUAL Siracusa_w_neureka OR platform STREQUAL PULPOpen) +if(platform STREQUAL Siracusa OR platform STREQUAL Siracusa_w_neureka OR platform STREQUAL Siracusa_w_redmule OR platform STREQUAL PULPOpen) if(TOOLCHAIN STREQUAL LLVM) set(CMAKE_TOOLCHAIN_FILE ${CMAKE_CURRENT_LIST_DIR}/cmake/pulp/toolchain_llvm.cmake) @@ -206,7 +208,7 @@ if(platform STREQUAL Siracusa OR platform STREQUAL Siracusa_w_neureka OR platfor include(${CMAKE_CURRENT_LIST_DIR}/cmake/pulp/pulp.cmake) - if(platform STREQUAL Siracusa OR platform STREQUAL Siracusa_w_neureka) + if(platform STREQUAL Siracusa OR platform STREQUAL Siracusa_w_neureka OR platform STREQUAL Siracusa_w_redmule) include(${CMAKE_CURRENT_LIST_DIR}/cmake/pulp/siracusa/siracusa.cmake) elseif(platform STREQUAL PULPOpen) include(${CMAKE_CURRENT_LIST_DIR}/cmake/pulp/pulp-open/pulp-open.cmake) diff --git a/Deeploy/Targets/Chimera/__init__.py 
b/Deeploy/Targets/Chimera/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/Deeploy/Targets/PULPOpen/Templates/FloatGemmTemplate.py b/Deeploy/Targets/PULPOpen/Templates/FloatGemmTemplate.py index 59499706e5..ec7df7d326 100644 --- a/Deeploy/Targets/PULPOpen/Templates/FloatGemmTemplate.py +++ b/Deeploy/Targets/PULPOpen/Templates/FloatGemmTemplate.py @@ -4,9 +4,12 @@ from typing import Dict, List, Tuple -from Deeploy.AbstractDataTypes import float32_tPtr +from Deeploy.AbstractDataTypes import PointerClass +from Deeploy.CommonExtensions.DataTypes import float32_t from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation +float32_tPtr = PointerClass(float32_t) + class PULPFloatGEMMTemplate(NodeTemplate): diff --git a/Deeploy/Targets/Redmule/Bindings.py b/Deeploy/Targets/Redmule/Bindings.py new file mode 100644 index 0000000000..fa06d789d7 --- /dev/null +++ b/Deeploy/Targets/Redmule/Bindings.py @@ -0,0 +1,52 @@ +# ---------------------------------------------------------------------- +# +# File: Bindings.py +# +# Last edited: 10.07.2024 +# +# Copyright (C) 2024, ETH Zurich and University of Bologna. +# +# Author: +# Luka Macan, University of Bologna +# Moritz Scherer, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from Deeploy.AbstractDataTypes import PointerClass +from Deeploy.CommonExtensions.DataTypes import float32_t +from Deeploy.DeeployTypes import NodeBinding +from Deeploy.Targets.Generic.TypeCheckers import ConvChecker, GEMMChecker, MatMulChecker +from Deeploy.Targets.PULPOpen.Bindings import ForkTransformer +from Deeploy.Targets.Redmule.Templates import ConvTemplate, GEMMTemplate, MatmulTemplate + +RedmuleMatmulBindings = [ + NodeBinding(MatMulChecker([PointerClass(float32_t), PointerClass(float32_t)], [PointerClass(float32_t)]), + MatmulTemplate.referenceTemplate, ForkTransformer) +] + +RedmuleConv2DBindings = [ + NodeBinding( + ConvChecker([PointerClass(float32_t), PointerClass(float32_t), + PointerClass(float32_t)], [PointerClass(float32_t)]), ConvTemplate.reference2DIm2ColTemplate, + ForkTransformer) +] + +RedmuleGEMMBindings = [ + NodeBinding( + GEMMChecker([PointerClass(float32_t), PointerClass(float32_t), + PointerClass(float32_t)], [PointerClass(float32_t)]), GEMMTemplate.referenceTemplate, + ForkTransformer) +] diff --git a/Deeploy/Targets/Redmule/Deployer.py b/Deeploy/Targets/Redmule/Deployer.py new file mode 100644 index 0000000000..7344f63a89 --- /dev/null +++ b/Deeploy/Targets/Redmule/Deployer.py @@ -0,0 +1,55 @@ +# ---------------------------------------------------------------------- +# +# File: Deployer.py +# +# Last edited: 08.05.2025 +# +# Copyright (C) 2024, ETH Zurich and University of Bologna. +# +# Author: Run Wang, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Callable, Dict, Type + +import onnx_graphsurgeon as gs + +from Deeploy.AbstractDataTypes import Pointer +from Deeploy.DeeployTypes import DeploymentPlatform, TopologyOptimizer +from Deeploy.Targets.PULPOpen.Deployer import PULPDeployer +from Deeploy.Targets.Redmule.TopologyOptimizationPasses.Passes import RedMuleAdjustWeightMemoryLayoutPass, \ + RedMuleGEMMTransposePass + + +class RedmuleDeployer(PULPDeployer): + + def __init__(self, + graph: gs.Graph, + deploymentPlatform: DeploymentPlatform, + inputTypes: Dict[str, Type[Pointer]], + loweringOptimizer: TopologyOptimizer, + scheduler: Callable = lambda graph: list(graph.nodes), + name: str = 'DeeployNetwork', + default_channels_first = False, + deeployStateDir: str = "DeeployStateDir", + inputOffsets = {}): + super().__init__(graph, deploymentPlatform, inputTypes, loweringOptimizer, scheduler, name, + default_channels_first, deeployStateDir, inputOffsets) + + self.loweringOptimizer.passes += [ + RedMuleAdjustWeightMemoryLayoutPass("Redmule"), + RedMuleGEMMTransposePass("Redmule") + ] diff --git a/Deeploy/Targets/Redmule/Engine.py b/Deeploy/Targets/Redmule/Engine.py new file mode 100644 index 0000000000..bff02fc3d1 --- /dev/null +++ b/Deeploy/Targets/Redmule/Engine.py @@ -0,0 +1,60 @@ +# ---------------------------------------------------------------------- +# +# File: Engine.py +# +# Last edited: 26.07.2024 +# +# Copyright (C) 2024, ETH Zurich and University of Bologna. 
+# +# Author: Moritz Scherer, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List + +from Deeploy.DeeployTypes import DeploymentEngine, NodeMapper +from Deeploy.Targets.Generic.Layers import ConvLayer, GEMMLayer, MatMulLayer +from Deeploy.Targets.Generic.Parsers import MatMulParser +from Deeploy.Targets.PULPOpen.Parsers import PULPFPConv2DParser +from Deeploy.Targets.Redmule.Parsers import GEMMRedmuleParser +from Deeploy.Targets.Redmule.Tiler import RedmuleConvTilingReadyBindings, RedmuleGEMMTilingReadyBindings, \ + RedmuleMatMulTilingReadyBindings + +MatMulRedmuleMapper = NodeMapper(MatMulParser(), RedmuleMatMulTilingReadyBindings) +Conv2DRedmuleMapper = NodeMapper(PULPFPConv2DParser(), RedmuleConvTilingReadyBindings) +GEMMMRedmuleMapper = NodeMapper(GEMMRedmuleParser(), RedmuleGEMMTilingReadyBindings) + +RedmuleMapping = { + 'MatMul': MatMulLayer([MatMulRedmuleMapper]), + 'Conv': ConvLayer([Conv2DRedmuleMapper]), + 'Gemm': GEMMLayer([GEMMMRedmuleMapper]), +} + +_includeList = [] + +_redmuleInitCode = r""" +// Redmule engine initialization +""" + + +class RedmuleEngine(DeploymentEngine): + + def __init__(self, + name: str, + Mapping = RedmuleMapping, + initCode: str = _redmuleInitCode, + includeList: List[str] = _includeList) -> None: + super().__init__(name, Mapping, initCode, includeList) diff --git 
a/Deeploy/Targets/Redmule/Parsers.py b/Deeploy/Targets/Redmule/Parsers.py new file mode 100644 index 0000000000..ca3bc897cf --- /dev/null +++ b/Deeploy/Targets/Redmule/Parsers.py @@ -0,0 +1,95 @@ +# ---------------------------------------------------------------------- +# +# File: Parsers.py +# +# Last edited: 15.12.2021 +# +# Copyright (C) 2021, ETH Zurich and University of Bologna. +# +# Authors: +# - Moritz Scherer, ETH Zurich +# - Victor Jung, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from typing import Tuple + +import numpy as np +import onnx_graphsurgeon as gs + +from Deeploy.DeeployTypes import NetworkContext +from Deeploy.Targets.Generic.Parsers import MatMulParser + + +class GEMMRedmuleParser(MatMulParser): + + def __init__(self, noBiasHoisting = True): + self.noBiasHoisting = noBiasHoisting + super().__init__() + + def parseNode(self, node: gs.Node) -> (bool): + + ret = all([len(node.inputs) >= 2, len(node.outputs) == 1, node.attrs['alpha'] == 1]) + + if ret: + if 'transA' in node.attrs: + self.operatorRepresentation['transA'] = node.attrs['transA'] + else: + self.operatorRepresentation['transA'] = 0 + + if 'transB' in node.attrs: + self.operatorRepresentation['transB'] = node.attrs['transB'] + else: + self.operatorRepresentation['transB'] = 0 + if 'alpha' in node.attrs: + self.operatorRepresentation['alpha'] = node.attrs['alpha'] + else: + self.operatorRepresentation['alpha'] = 1 + if 'beta' in node.attrs: + self.operatorRepresentation['beta'] = node.attrs['beta'] + else: + self.operatorRepresentation['beta'] = 1 + + return ret + + def parseNodeCtxt(self, + ctxt: NetworkContext, + node: gs.Node, + channels_first: bool = True) -> Tuple[NetworkContext, bool]: + + newCtxt, ret = super().parseNodeCtxt(ctxt, node, channels_first) + + if ret: + inputs = ['A', 'B'] + outputs = ['data_out'] + + for idx, inputNode in enumerate(node.inputs): + if idx < len(inputs): + self.operatorRepresentation[inputs[idx]] = newCtxt.lookup(inputNode.name).name + for idx, outputNode in enumerate(node.outputs): + self.operatorRepresentation[outputs[idx]] = newCtxt.lookup(outputNode.name).name + + if len(node.inputs) == 3: + self.operatorRepresentation['C'] = newCtxt.lookup(node.inputs[2].name).name + elif not self.noBiasHoisting: + values = np.zeros((1)) + zeroTensor = gs.Constant(f'{node.name}_C_Tensor', values = values) + newCtxt.hoistConstant(zeroTensor) + self.operatorRepresentation['C'] = f'{node.name}_C_Tensor' + + self.operatorRepresentation['size'] = 
np.prod(newCtxt.lookup(node.inputs[0].name).shape) + + return newCtxt, ret diff --git a/Deeploy/Targets/Redmule/Platform.py b/Deeploy/Targets/Redmule/Platform.py new file mode 100644 index 0000000000..54dfa50ba0 --- /dev/null +++ b/Deeploy/Targets/Redmule/Platform.py @@ -0,0 +1,42 @@ +# ---------------------------------------------------------------------- +# +# File: Platform.py +# +# Last edited: 08.05.2025 +# +# Copyright (C) 2024, ETH Zurich and University of Bologna. +# +# Author: Run Wang, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from Deeploy.DeeployTypes import TopologyOptimizer +from Deeploy.Targets.PULPOpen.Platform import PULPClusterEngine, PULPConstantBuffer, PULPOptimizer, PULPPlatform, \ + PULPStructBuffer, PULPTransientBuffer, PULPVariableBuffer +from Deeploy.Targets.Redmule.Engine import RedmuleEngine + +RedmuleOptimizer = TopologyOptimizer([*PULPOptimizer.passes]) + + +class RedmulePlatform(PULPPlatform): + + def __init__(self, + engines = [RedmuleEngine("Redmule"), PULPClusterEngine("PULPCluster")], + variableBuffer = PULPVariableBuffer, + constantBuffer = PULPConstantBuffer, + structBuffer = PULPStructBuffer, + transientBuffer = PULPTransientBuffer) -> None: + super().__init__(engines, variableBuffer, constantBuffer, structBuffer, transientBuffer) diff --git a/Deeploy/Targets/Redmule/Templates/ConvTemplate.py b/Deeploy/Targets/Redmule/Templates/ConvTemplate.py new file mode 100644 index 0000000000..ba344655b6 --- /dev/null +++ b/Deeploy/Targets/Redmule/Templates/ConvTemplate.py @@ -0,0 +1,87 @@ +# ---------------------------------------------------------------------- +# +# File: ConvTemplate.py +# +# Last edited: 09.05.2025 +# +# Copyright (C) 2023, ETH Zurich and University of Bologna. +# +# Author: Run Wang, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from typing import Dict, List, Tuple, Union + +from ortools.constraint_solver.pywrapcp import IntVar + +from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation + + +class RedmuleFloatConvIm2ColTemplate(NodeTemplate): + + def __init__(self, templateStr): + super().__init__(templateStr) + + @staticmethod + def computeTransientBuffersSize( + ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> List[Tuple[str, Union[int, IntVar]]]: + im2col_dim = 4 * 8 * (operatorRepresentation['ch_im_in'] * operatorRepresentation['dim_kernel_x'] * + operatorRepresentation['dim_kernel_y']) + im2col_name = operatorRepresentation['nodeName'] + "_buffer" + return [(im2col_name, im2col_dim)] + + def hoistTransientBuffers(self, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]: + im2col_name, im2col_dim = RedmuleFloatConvIm2ColTemplate.computeTransientBuffersSize( + ctxt, operatorRepresentation)[0] + ctxt.hoistTransientBuffer(im2col_name, im2col_dim) + + operatorRepresentation['ctxtBuffer'] = im2col_name + operatorRepresentation['ctxtBufferSize'] = im2col_dim + return ctxt, operatorRepresentation, [im2col_name] + + +reference2DIm2ColTemplate = RedmuleFloatConvIm2ColTemplate(""" +// 2D FP Conv HWC Parallel with Im2Col (Name: ${nodeName}, Op: ${nodeOp}) +${data_in_type.typeName} ref_${data_out}_${data_in} = ${data_in}; +${data_out_type.typeName} ref_${data_out}_${data_out} = ${data_out}; + +for (uint32_t n=0; n<${batch}; ++n) { + + Conv2d_Im2Col_fp${data_in_type.referencedType.typeWidth}_fp${weight_type.referencedType.typeWidth}_fp${data_out_type.referencedType.typeWidth}_HWC_8_Redmule( + ref_${data_out}_${data_in}, + ${dim_im_in_y}, + ${dim_im_in_x}, + ${ch_im_in}, + ${weight}, + ${dim_kernel_y}, + ${dim_kernel_x}, + ${stride_y}, + ${stride_x}, + ref_${data_out}_${data_out}, + ${ch_im_out}, + ${padding_y_top}, + ${padding_y_bottom}, + ${padding_x_left}, + ${padding_x_right}, 
+ ${ctxtBuffer} + ); + + ref_${data_out}_${data_in} += ${ch_im_in} * ${dim_im_in_x} * ${dim_im_in_y}; + ref_${data_out}_${data_out} += ${ch_im_out} * ${dim_im_out_x} * ${dim_im_out_y}; +} +""") diff --git a/Deeploy/Targets/Redmule/Templates/GEMMTemplate.py b/Deeploy/Targets/Redmule/Templates/GEMMTemplate.py new file mode 100644 index 0000000000..ba41ab7693 --- /dev/null +++ b/Deeploy/Targets/Redmule/Templates/GEMMTemplate.py @@ -0,0 +1,61 @@ +# ---------------------------------------------------------------------- +# +# File: GEMMTemplate.py +# +# Last edited: 27.01.2025 +# +# Copyright (C) 2023, ETH Zurich and University of Bologna. +# +# Author: Run Wang, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +from Deeploy.DeeployTypes import NodeTemplate + +referenceTemplate = NodeTemplate(""" +// GEMM using RedMule hardware accelerator (Name: ${nodeName}, Op: ${nodeOp}) + +int8_t ${nodeName}_core_id = pi_core_id(); + +if (${nodeName}_core_id == 0) { + for(uint32_t b=0; b<${batch}; b++) { + ${A_type.typeName} batch_A = ${A} + b * ${M} * ${N}; + ${B_type.typeName} batch_B = ${B} + b * ${N} * ${O}; + ${C_type.typeName} batch_C = ${C} + b * ${M} * ${O}; + ${data_out_type.typeName} batch_out = ${data_out} + b * ${M} * ${O}; + + % if beta == 0: + MatMul_fp${A_type.referencedType.typeWidth}_fp${B_type.referencedType.typeWidth}_fp${B_type.referencedType.typeWidth}_Redmule( + (const float32_t *) batch_A, + (const float32_t *) batch_B, + (float32_t *) batch_out, + ${M}, + ${N}, + ${O} + ); + % else: + Gemm_fp${A_type.referencedType.typeWidth}_fp${B_type.referencedType.typeWidth}_fp${B_type.referencedType.typeWidth}_fp${B_type.referencedType.typeWidth}_Redmule( + (const float32_t *) batch_A, + (const float32_t *) batch_B, + (const float32_t *) batch_C, + (float32_t *) batch_out, + ${M}, + ${N}, + ${O} + ); + % endif + } +} +""") diff --git a/Deeploy/Targets/Redmule/Templates/MatmulTemplate.py b/Deeploy/Targets/Redmule/Templates/MatmulTemplate.py new file mode 100644 index 0000000000..cb077ca897 --- /dev/null +++ b/Deeploy/Targets/Redmule/Templates/MatmulTemplate.py @@ -0,0 +1,49 @@ +# ---------------------------------------------------------------------- +# +# File: MatmulTemplate.py +# +# Last edited: 27.01.2025 +# +# Copyright (C) 2023, ETH Zurich and University of Bologna. +# +# Author: Run Wang, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from Deeploy.DeeployTypes import NodeTemplate + +referenceTemplate = NodeTemplate(""" +// Matmul using RedMule hardware accelerator (Name: ${nodeName}, Op: ${nodeOp}) + +int8_t ${nodeName}_core_id = pi_core_id(); +int8_t ${nodeName}_num_cores = NUM_CORES; + +if (${nodeName}_core_id == 0) { + for(uint32_t b=0; b<${batch}; b++) { + ${A_type.typeName} batch_A = ${A} + b * ${M} * ${N}; + ${B_type.typeName} batch_B = ${B} + b * ${N} * ${O}; + ${data_out_type.typeName} batch_out = ${data_out} + b * ${M} * ${O}; + + MatMul_fp32_fp32_fp32_Redmule( + (const float32_t *) batch_A, + (const float32_t *) batch_B, + (float32_t *) batch_out, + ${M}, + ${N}, + ${O} + ); + } +} +""") \ No newline at end of file diff --git a/Deeploy/Targets/Redmule/Templates/__init__.py b/Deeploy/Targets/Redmule/Templates/__init__.py new file mode 100644 index 0000000000..a73187ca8f --- /dev/null +++ b/Deeploy/Targets/Redmule/Templates/__init__.py @@ -0,0 +1,26 @@ +# ---------------------------------------------------------------------- +# +# File: __init__.py +# +# Last edited: 08.05.2025 +# +# Copyright (C) 2024, ETH Zurich and University of Bologna. +# +# Author: Run Wang, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . import * diff --git a/Deeploy/Targets/Redmule/TileConstraints/ConvTileConstraint.py b/Deeploy/Targets/Redmule/TileConstraints/ConvTileConstraint.py new file mode 100644 index 0000000000..1b3a93f600 --- /dev/null +++ b/Deeploy/Targets/Redmule/TileConstraints/ConvTileConstraint.py @@ -0,0 +1,279 @@ +# ---------------------------------------------------------------------- +# +# File: ConvTileConstraint.py +# +# Last edited: 09.05.2025 +# +# Copyright (C) 2023, ETH Zurich and University of Bologna. +# +# Author: +# - Run Wang, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from typing import Dict, List, Tuple, Union + +from ortools.constraint_solver.pywrapcp import IntVar + +from Deeploy.AbstractDataTypes import PointerClass +from Deeploy.CommonExtensions.DataTypes import uint8_t, uint16_t +from Deeploy.DeeployTypes import NetworkContext, OperatorRepresentation +from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint +from Deeploy.TilingExtension.TileConstraint import TileConstraint +from Deeploy.TilingExtension.TilerModel import PerformanceHint, TilerModel +from Deeploy.TilingExtension.TilingCodegen import AbsoluteHyperRectangle, HyperRectangle, TilingSchedule, \ + VariableReplacementScheme + + +class RedmuleConv2DTileConstraint(TileConstraint): + + @staticmethod + def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: + + # Get to-be-tiled tensor's buffers + inputBufferName = parseDict['data_in'] + weightBufferName = parseDict['weight'] + outputBufferName = parseDict['data_out'] + + strides = parseDict["strides"] + padding = parseDict["pads"] + dilation = parseDict["dilations"] + + # Add I/O dimensions to the model as variables + for bufferName in [inputBufferName, weightBufferName, outputBufferName]: + tilerModel.addTensorDimToModel(ctxt, bufferName) + + inputBatchVar = tilerModel.getTensorDimVar(tensorName = inputBufferName, dimIdx = 0) + inputHeightVar = tilerModel.getTensorDimVar(tensorName = inputBufferName, dimIdx = 1) + inputWidthVar = tilerModel.getTensorDimVar(tensorName = inputBufferName, dimIdx = 2) + inputChannelVar = tilerModel.getTensorDimVar(tensorName = inputBufferName, dimIdx = 3) + + # Updated dimension indexes for (H, W, Cin, Cout) format + weightHeightVar = tilerModel.getTensorDimVar(tensorName = weightBufferName, dimIdx = 0) + weightWidthVar = tilerModel.getTensorDimVar(tensorName = weightBufferName, dimIdx = 1) + weightInChannelVar = tilerModel.getTensorDimVar(tensorName = weightBufferName, dimIdx = 2) + weightOutChannelVar = 
tilerModel.getTensorDimVar(tensorName = weightBufferName, dimIdx = 3) + + outputBatchVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 0) + outputHeightVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 1) + outputWidthVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 2) + outputChannelVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 3) + + # Map output dims to inputs dims + tilerModel.addConstraint(outputBatchVar == inputBatchVar) # Batch + tilerModel.addConstraint(outputChannelVar == weightOutChannelVar) # Output Channel (now at index 3) + + inputBuffer = ctxt.lookup(inputBufferName) + + effectiveHeight = inputHeightVar + ((padding[0] + padding[2]) * (inputHeightVar == inputBuffer.shape[1])) + effectiveWidth = inputWidthVar + ((padding[1] + padding[3]) * (inputWidthVar == inputBuffer.shape[2])) + + tilerModel.addConstraint((outputHeightVar == (effectiveHeight - (weightHeightVar - 1) - 1) // strides[0] + 1)) + tilerModel.addConstraint((outputWidthVar == (effectiveWidth - (weightWidthVar - 1) - 1) // strides[1] + 1)) + + return tilerModel + + @staticmethod + def addPolicyConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: + + # Get to-be-tiled tensor's buffers + inputBuffer = ctxt.lookup(name = parseDict['data_in']) + weightBuffer = ctxt.lookup(name = parseDict['weight']) + + inputHeightVar = tilerModel.getTensorDimVar(tensorName = inputBuffer.name, dimIdx = 1) + inputWidthVar = tilerModel.getTensorDimVar(tensorName = inputBuffer.name, dimIdx = 2) + inputChannelVar = tilerModel.getTensorDimVar(tensorName = inputBuffer.name, dimIdx = 3) + + # Updated dimension indexes for (H, W, Cin, Cout) format + weightHeightVar = tilerModel.getTensorDimVar(tensorName = weightBuffer.name, dimIdx = 0) + weightWidthVar = tilerModel.getTensorDimVar(tensorName = weightBuffer.name, dimIdx = 1) + weightInChannelVar = tilerModel.getTensorDimVar(tensorName = 
weightBuffer.name, dimIdx = 2) + weightOutChannelVar = tilerModel.getTensorDimVar(tensorName = weightBuffer.name, dimIdx = 3) + + strides = parseDict["strides"] + padding = parseDict["pads"] + + tilerModel.addConstraint(inputChannelVar == parseDict['ch_im_in']) + # RW: Conv only tiled on outchannel + tilerModel.addConstraint(inputHeightVar == parseDict['dim_im_in_x']) + tilerModel.addConstraint(inputWidthVar == parseDict['dim_im_in_y']) + tilerModel.addConstraint(inputChannelVar == parseDict['ch_im_in']) + + tilerModel.addConstraint(weightHeightVar == parseDict['dim_kernel_x']) + tilerModel.addConstraint(weightWidthVar == parseDict['dim_kernel_y']) + tilerModel.addConstraint(weightInChannelVar == parseDict['ch_im_in']) + + outChannel = parseDict["ch_im_out"] + if outChannel >= 12: + tilerModel.addTileSizeDivisibleConstraint(parseDict, + "ch_im_out", + weightOutChannelVar, + 12, + strategy = PerformanceHint(priority = 1)) + else: + tilerModel.addConstraint(weightOutChannelVar == weightOutChannelVar.Max(), strategy = PerformanceHint(1)) + + return tilerModel + + @staticmethod + def constructSymbolicNodeRep(tilerModel: TilerModel, parseDict: Dict, + ctxt: NetworkContext) -> Dict[str, Union[int, IntVar]]: + + inputBuffer = ctxt.lookup(name = parseDict['data_in']) + weightBuffer = ctxt.lookup(name = parseDict['weight']) + + symbolicParseDict = parseDict.copy() + symbolicParseDict['dim_im_in_x'] = tilerModel.getTensorDimVar(inputBuffer.name, 1) + # Using updated dimension indexes for kernel dimensions + symbolicParseDict['dim_kernel_x'] = tilerModel.getTensorDimVar(weightBuffer.name, 0) + symbolicParseDict['dim_kernel_y'] = tilerModel.getTensorDimVar(weightBuffer.name, 1) + + return symbolicParseDict + + @staticmethod + def computeMargins(kernelShape: Tuple[int, ...]) -> Tuple[int, ...]: + if kernelShape[1] % 2 == 0: + leftMargin = 0 + rightMargin = 0 + else: + leftMargin = ((kernelShape[1]) // 2) + rightMargin = ((kernelShape[1]) // 2) + + if kernelShape[0] % 2 == 0: + 
topMargin = 0 + bottomMargin = 0 + else: + topMargin = ((kernelShape[0]) // 2) + bottomMargin = ((kernelShape[0]) // 2) + + return leftMargin, rightMargin, topMargin, bottomMargin + + @staticmethod + def computeInputCube(kernelShape: Tuple[int, ...], pads: Tuple[int, ...], strides: Tuple[int, ...], + weightChannels: int, outputCube: HyperRectangle, + outputDims: Tuple[int, ...]) -> Tuple[HyperRectangle, Tuple[int, ...]]: + + (BatchOffset, HOffset, WOffset, COffset) = outputCube.offset + (BatchSize, HSize, WSize, CSize) = outputCube.dims + + leftMargin, rightMargin, topMargin, bottomMargin = RedmuleConv2DTileConstraint.computeMargins(kernelShape) + + padding_top = (HOffset == 0) * pads[0] + padding_bottom = (HOffset + HSize == outputDims[1]) * pads[2] + + padding_left = (WOffset == 0) * pads[1] + padding_right = (WOffset + WSize == outputDims[2]) * pads[3] + + inputHOffset = HOffset * strides[0] - topMargin * (HOffset != 0) + inputWOffset = WOffset * strides[1] - leftMargin * (WOffset != 0) + + inputHSize = HSize * strides[0] + (topMargin + bottomMargin) - (padding_top + padding_bottom) + inputWSize = WSize * strides[1] + (leftMargin + rightMargin) - (padding_left + padding_right) + + InCube = HyperRectangle((BatchOffset, inputHOffset, inputWOffset, 0), + (BatchSize, inputHSize, inputWSize, weightChannels)) + + return InCube, (padding_left, padding_right, padding_top, padding_bottom) + + @classmethod + def serializeTilingSolution( + cls, tilingSolution: NodeMemoryConstraint, absoluteOutputCubes: List[AbsoluteHyperRectangle], + targetMemLevel: str, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, TilingSchedule]: + outputCubes = [cube.rectangle for cube in absoluteOutputCubes] + + addrNames = ['data_in', 'weight', 'data_out'] + inputBaseOffsets, outputBaseOffsets = cls.extractBaseAddr(tilingSolution, targetMemLevel, + operatorRepresentation, addrNames) + + varWeight = operatorRepresentation['weight'] + varOut 
= operatorRepresentation['data_out'] + + inputInCubes = [] + inputWeightCubes = [] + replacements: Dict[str, List[int]] = { + "dim_im_in_x": [], + "dim_im_in_y": [], + "dim_im_out_x": [], + "dim_im_out_y": [], + "ch_im_out": [], + "padding_y_top": [], + "padding_y_bottom": [], + "padding_x_left": [], + "padding_x_right": [] + } + + replacementTypes = { + "dim_im_in_x": PointerClass(uint16_t), + "dim_im_in_y": PointerClass(uint16_t), + "dim_im_out_x": PointerClass(uint16_t), + "dim_im_out_y": PointerClass(uint16_t), + "ch_im_out": PointerClass(uint16_t), + "padding_y_top": PointerClass(uint8_t), + "padding_y_bottom": PointerClass(uint8_t), + "padding_x_left": PointerClass(uint8_t), + "padding_x_right": PointerClass(uint8_t) + } + + # Updated dimension indexes for (H, W, Cin, Cout) format + weightH = ctxt.lookup(varWeight).shape[0] # Now index 0 + weightW = ctxt.lookup(varWeight).shape[1] # Now index 1 + weightC = ctxt.lookup(varWeight).shape[2] # Now index 2 (Cin) + + pads = operatorRepresentation['pads'] + strides = operatorRepresentation['strides'] + + for cube in outputCubes: + (BatchOffset, HOffset, WOffset, COffset) = cube.offset + (BatchSize, HSize, WSize, CSize) = cube.dims + + InCube, padding_tuple = RedmuleConv2DTileConstraint.computeInputCube((weightH, weightW), pads, strides, + weightC, cube, + ctxt.lookup(varOut).shape) + + padding_left, padding_right, padding_top, padding_bottom = padding_tuple + + replacements['dim_im_in_x'].append(InCube.dims[1]) + replacements['dim_im_in_y'].append(InCube.dims[2]) + replacements['dim_im_out_x'].append(HSize) + replacements['dim_im_out_y'].append(WSize) + replacements['ch_im_out'].append(CSize) + + replacements['padding_y_top'].append(padding_top) + replacements['padding_y_bottom'].append(padding_bottom) + replacements['padding_x_left'].append(padding_left) + replacements['padding_x_right'].append(padding_right) + + inputInCubes.append(InCube) + + # Updated WeightCube for (H, W, Cin, Cout) format + # COffset is now 
applied to dimension 3 (Cout)
+            WeightCube = HyperRectangle((0, 0, 0, COffset), (weightH, weightW, weightC, CSize))
+
+            inputWeightCubes.append(WeightCube)
+
+        inputLoadSchedule = []
+        outputLoadSchedule = []
+
+        for a, b in zip(inputInCubes, inputWeightCubes):
+            inputLoadSchedule.append({"data_in": a, "weight": b})
+
+        for out in outputCubes:
+            outputLoadSchedule.append({"data_out": out})
+
+        tilingSchedule = TilingSchedule(inputBaseOffsets, outputBaseOffsets, inputLoadSchedule, outputLoadSchedule)
+        variableReplacementSchedule = VariableReplacementScheme(replacements, replacementTypes)
+
+        return variableReplacementSchedule, tilingSchedule
diff --git a/Deeploy/Targets/Redmule/TileConstraints/GEMMTileConstraint.py b/Deeploy/Targets/Redmule/TileConstraints/GEMMTileConstraint.py
new file mode 100644
index 0000000000..fbae4824f8
--- /dev/null
+++ b/Deeploy/Targets/Redmule/TileConstraints/GEMMTileConstraint.py
@@ -0,0 +1,198 @@
+# ----------------------------------------------------------------------
+#
+# File: GEMMTileConstraint.py
+#
+# Last edited: 02.06.2023
+#
+# Copyright (C) 2023, ETH Zurich and University of Bologna.
+#
+# Author:
+# - Victor Jung, jungvi@iis.ee.ethz.ch, ETH Zurich
+# - Moritz Scherer, scheremo@iis.ee.ethz.ch, ETH Zurich
+#
+# ----------------------------------------------------------------------
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+ +from typing import Dict, List, Tuple + +from Deeploy.AbstractDataTypes import PointerClass +from Deeploy.CommonExtensions.DataTypes import uint8_t, uint16_t +from Deeploy.DeeployTypes import NetworkContext, OperatorRepresentation +from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint +from Deeploy.TilingExtension.TileConstraint import TileConstraint +from Deeploy.TilingExtension.TilerModel import TilerModel +from Deeploy.TilingExtension.TilingCodegen import AbsoluteHyperRectangle, HyperRectangle, TilingSchedule, \ + VariableReplacementScheme + + +class RedmuleGEMMTileConstraint(TileConstraint): + + @staticmethod + def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: + + # Get to-be-tiled tensor's buffers + bufferA = ctxt.lookup(name = parseDict['A']) + bufferB = ctxt.lookup(name = parseDict['B']) + bufferC = ctxt.lookup(name = parseDict['C']) + outputBuffer = ctxt.lookup(name = parseDict['data_out']) + + # Add I/O dimensions to the model as variables + for bufferName in [bufferA.name, bufferB.name, bufferC.name, outputBuffer.name]: + tilerModel.addTensorDimToModel(ctxt, bufferName) + + dimOffsetA = len(bufferA.shape) - 2 + dimOffsetB = len(bufferB.shape) - 2 + dimOffsetC = len(bufferC.shape) - 2 + dimOffsetOut = len(outputBuffer.shape) - 2 + + AFirstDimVar = tilerModel.getTensorDimVar(tensorName = bufferA.name, dimIdx = dimOffsetA + parseDict['transA']) + ASecondDimVar = tilerModel.getTensorDimVar(tensorName = bufferA.name, + dimIdx = dimOffsetA + 1 - parseDict['transA']) + BFirstDimVar = tilerModel.getTensorDimVar(tensorName = bufferB.name, dimIdx = dimOffsetB + parseDict['transB']) + BSecondDimVar = tilerModel.getTensorDimVar(tensorName = bufferB.name, + dimIdx = dimOffsetB + 1 - parseDict['transB']) + outputFirstDimVar = tilerModel.getTensorDimVar(tensorName = outputBuffer.name, dimIdx = dimOffsetOut) + outputSecondDimVar = tilerModel.getTensorDimVar(tensorName = outputBuffer.name, 
dimIdx = dimOffsetOut + 1) + + # Map output dims to inputs dims + tilerModel.addConstraint(outputFirstDimVar == AFirstDimVar) + tilerModel.addConstraint(outputSecondDimVar == BSecondDimVar) + + # Add GEMM Geometrical constraints + tilerModel.addConstraint(ASecondDimVar == BFirstDimVar) + + addDimVar_1 = tilerModel.getTensorDimVar(tensorName = bufferC.name, dimIdx = dimOffsetC) + addDimVar_2 = tilerModel.getTensorDimVar(tensorName = bufferC.name, dimIdx = dimOffsetC + 1) + tilerModel.addConstraint(outputFirstDimVar == addDimVar_1) + tilerModel.addConstraint(outputSecondDimVar == addDimVar_2) + + return tilerModel + + @staticmethod + def addPolicyConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: + from Deeploy.TilingExtension.TilerModel import PerformanceHint + + bufferA = ctxt.lookup(name = parseDict['A']) + bufferB = ctxt.lookup(name = parseDict['B']) + + tensorsShapeLen = min(len(bufferA.shape), len(bufferB.shape)) + + dimOffsetA = len(bufferA.shape) - 2 + dimOffsetB = len(bufferB.shape) - 2 + + AFirstDimVar = tilerModel.getTensorDimVar(tensorName = bufferA.name, dimIdx = dimOffsetA + parseDict['transA']) + ASecondDimVar = tilerModel.getTensorDimVar(tensorName = bufferA.name, + dimIdx = dimOffsetA + 1 - parseDict['transA']) + BFirstDimVar = tilerModel.getTensorDimVar(tensorName = bufferB.name, dimIdx = dimOffsetB + parseDict['transB']) + BSecondDimVar = tilerModel.getTensorDimVar(tensorName = bufferB.name, + dimIdx = dimOffsetB + 1 - parseDict['transB']) + + # VIC: We don't want to deal with intermediate results between kernel calls + tilerModel.addConstraint(ASecondDimVar == parseDict['N']) + tilerModel.addConstraint(BFirstDimVar == parseDict['N']) + + tilerModel.addConstraint(AFirstDimVar == AFirstDimVar.Max(), strategy = PerformanceHint(1)) + tilerModel.addConstraint(BSecondDimVar == BSecondDimVar.Max(), strategy = PerformanceHint(1)) + + return tilerModel + + @classmethod + def serializeTilingSolution( + cls, 
tilingSolution: NodeMemoryConstraint, absoluteOutputCubes: List[AbsoluteHyperRectangle], + targetMemLevel: str, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, TilingSchedule]: + outputCubes = [cube.rectangle for cube in absoluteOutputCubes] + + addrNames = ['A', 'B', 'C', 'data_out'] + inputBaseOffsets, outputBaseOffsets = cls.extractBaseAddr(tilingSolution, targetMemLevel, + operatorRepresentation, addrNames) + + transA = operatorRepresentation['transA'] + transB = operatorRepresentation['transB'] + + varA = operatorRepresentation['A'] + varB = operatorRepresentation['B'] + + if transA == 0: + NSize = ctxt.lookup(varA).shape[-1] + else: + NSize = ctxt.lookup(varA).shape[-2] + + NOffset = 0 + + inputACubes = [] + inputBCubes = [] + inputAddCubes = [] + + replacements = {"M": [], "O": [], "batch": []} + + # Every output is constructed by a pair of inputs. Reconstruct this pair. + for cube in outputCubes: + + BSize = 1 + BOffset = 0 + BatchSize = 1 + BatchOffset = 0 + + if len(cube.offset) == 2: + (MOffset, OOffset) = cube.offset + (MSize, OSize) = cube.dims + elif len(cube.offset) == 3: + (BatchOffset, MOffset, OOffset) = cube.offset + (BatchSize, MSize, OSize) = cube.dims + else: + (BatchOffset, BOffset, MOffset, OOffset) = cube.offset + (BatchSize, BSize, MSize, OSize) = cube.dims + + replacements["M"].append(MSize) + replacements["O"].append(OSize) + replacements["batch"].append(BSize) + + if transA == 0: + ACube = HyperRectangle((BatchOffset, BOffset, MOffset, NOffset), (BatchSize, BSize, MSize, NSize)) + else: + ACube = HyperRectangle((BatchOffset, BOffset, NOffset, MOffset), (BatchSize, BSize, NSize, MSize)) + + if transB == 0: + BCube = HyperRectangle((BatchOffset, BOffset, NOffset, OOffset), (BatchSize, BSize, NSize, OSize)) + else: + BCube = HyperRectangle((BatchOffset, BOffset, OOffset, NOffset), (BatchSize, BSize, OSize, NSize)) + + CCube = HyperRectangle(cube.offset, cube.dims) + + 
inputACubes.append(ACube)
+            inputBCubes.append(BCube)
+            inputAddCubes.append(CCube)
+
+        inputLoadSchedule = []
+        outputLoadSchedule = []
+
+        replacements["N"] = [NSize] * len(outputCubes)
+
+        replacementTypes = {
+            "M": PointerClass(uint16_t),
+            "N": PointerClass(uint16_t),
+            "O": PointerClass(uint16_t),
+            "batch": PointerClass(uint8_t)
+        }
+
+        for a, b, c in zip(inputACubes, inputBCubes, inputAddCubes):
+            inputLoadSchedule.append({"A": a, "B": b, "C": c})
+
+        for out in outputCubes:
+            outputLoadSchedule.append({"data_out": out})
+
+        schedule = TilingSchedule(inputBaseOffsets, outputBaseOffsets, inputLoadSchedule, outputLoadSchedule)
+
+        return VariableReplacementScheme(replacements, replacementTypes), schedule
diff --git a/Deeploy/Targets/Redmule/TileConstraints/MatmulTileConstraint.py b/Deeploy/Targets/Redmule/TileConstraints/MatmulTileConstraint.py
new file mode 100644
index 0000000000..1b14ccc464
--- /dev/null
+++ b/Deeploy/Targets/Redmule/TileConstraints/MatmulTileConstraint.py
@@ -0,0 +1,197 @@
+# ----------------------------------------------------------------------
+#
+# File: MatmulTileConstraint.py
+#
+# Last edited: 28.04.2025
+#
+# Copyright (C) 2025, ETH Zurich and University of Bologna.
+#
+# Author: Run Wang, ETH Zurich
+#
+# ----------------------------------------------------------------------
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+ +from typing import Dict, List, Tuple + +from Deeploy.AbstractDataTypes import PointerClass +from Deeploy.CommonExtensions.DataTypes import int8_t +from Deeploy.DeeployTypes import NetworkContext, OperatorRepresentation +from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint +from Deeploy.TilingExtension.TileConstraint import TileConstraint +from Deeploy.TilingExtension.TilerModel import PerformanceHint, TilerModel +from Deeploy.TilingExtension.TilingCodegen import AbsoluteHyperRectangle, HyperRectangle, TilingSchedule, \ + VariableReplacementScheme + + +class RedmuleMatmulTileConstraint(TileConstraint): + + @staticmethod + def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: + + # Get to-be-tiled tensor's buffers + bufferA = ctxt.lookup(name = parseDict['A']) + bufferB = ctxt.lookup(name = parseDict['B']) + outputBuffer = ctxt.lookup(name = parseDict['data_out']) + + # Add I/O dimensions to the model as variables + for _buffer in [bufferA, bufferB, outputBuffer]: + tilerModel.addTensorDimToModel(ctxt, _buffer.name) + + tensorsShapeLen = len(bufferA.shape) + + AFirstDimVar = tilerModel.getTensorDimVar(tensorName = bufferA.name, + dimIdx = (tensorsShapeLen - 2) + parseDict['transA']) + ASecondDimVar = tilerModel.getTensorDimVar(tensorName = bufferA.name, + dimIdx = (tensorsShapeLen - 1) - parseDict['transA']) + BFirstDimVar = tilerModel.getTensorDimVar(tensorName = bufferB.name, + dimIdx = (tensorsShapeLen - 2) + parseDict['transB']) + BSecondDimVar = tilerModel.getTensorDimVar(tensorName = bufferB.name, + dimIdx = (tensorsShapeLen - 1) - parseDict['transB']) + outputFirstDimVar = tilerModel.getTensorDimVar(tensorName = outputBuffer.name, dimIdx = (tensorsShapeLen - 2)) + outputSecondDimVar = tilerModel.getTensorDimVar(tensorName = outputBuffer.name, dimIdx = (tensorsShapeLen - 1)) + + # Map output dims to inputs dims + for idx in range(tensorsShapeLen - 2): + tilerModel.addConstraint( + 
tilerModel.getTensorDimVar(tensorName = outputBuffer.name, dimIdx = idx) == tilerModel.getTensorDimVar( + tensorName = bufferA.name, dimIdx = idx)) + tilerModel.addConstraint( + tilerModel.getTensorDimVar(tensorName = outputBuffer.name, dimIdx = idx) == tilerModel.getTensorDimVar( + tensorName = bufferB.name, dimIdx = idx)) + + tilerModel.addConstraint(outputFirstDimVar == AFirstDimVar) + tilerModel.addConstraint(outputSecondDimVar == BSecondDimVar) + + # Add GEMM Geometrical constraints + tilerModel.addConstraint(ASecondDimVar == BFirstDimVar) + + return tilerModel + + @staticmethod + def addPolicyConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: + + bufferA = ctxt.lookup(name = parseDict['A']) + bufferB = ctxt.lookup(name = parseDict['B']) + + tensorsShapeLen = len(bufferA.shape) + + AFirstDimVar = tilerModel.getTensorDimVar(tensorName = bufferA.name, + dimIdx = (tensorsShapeLen - 2) + parseDict['transA']) + ASecondDimVar = tilerModel.getTensorDimVar(tensorName = bufferA.name, + dimIdx = (tensorsShapeLen - 1) - parseDict['transA']) + BFirstDimVar = tilerModel.getTensorDimVar(tensorName = bufferB.name, + dimIdx = (tensorsShapeLen - 2) + parseDict['transB']) + BSecondDimVar = tilerModel.getTensorDimVar(tensorName = bufferB.name, + dimIdx = (tensorsShapeLen - 1) - parseDict['transB']) + + # VIC: We don't want to deal with intermediate results between kernel calls + tilerModel.addConstraint(ASecondDimVar == parseDict['N']) + tilerModel.addConstraint(BFirstDimVar == parseDict['N']) + + # Hardware-specific constraints for 4x12 accelerator + tilerModel.addConstraint(BSecondDimVar == BSecondDimVar.Max(), strategy = PerformanceHint(1)) + + M_full_size = ctxt.lookup(bufferA.name).shape[(tensorsShapeLen - 2) + parseDict['transA']] + if M_full_size >= 16: + tilerModel.addTileSizeDivisibleConstraint(parseDict, + "M", + AFirstDimVar, + 16, + strategy = PerformanceHint(priority = 1)) + else: + tilerModel.addConstraint(AFirstDimVar == 
AFirstDimVar.Max(), strategy = PerformanceHint(1)) + + N_full_size = ctxt.lookup(bufferB.name).shape[(tensorsShapeLen - 2) + parseDict['transB']] + if N_full_size >= 12: + tilerModel.addTileSizeDivisibleConstraint(parseDict, + "O", + BSecondDimVar, + 12, + strategy = PerformanceHint(priority = 1)) + else: + tilerModel.addConstraint(BSecondDimVar == BSecondDimVar.Max(), strategy = PerformanceHint(1)) + + return tilerModel + + @classmethod + def serializeTilingSolution( + cls, tilingSolution: NodeMemoryConstraint, absoluteOutputCubes: List[AbsoluteHyperRectangle], + targetMemLevel: str, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, TilingSchedule]: + outputCubes = [cube.rectangle for cube in absoluteOutputCubes] + + addrNames = ['A', 'B', 'data_out'] + inputBaseOffsets, outputBaseOffsets = cls.extractBaseAddr(tilingSolution, targetMemLevel, + operatorRepresentation, addrNames) + + varA = operatorRepresentation['A'] + + NSize = ctxt.lookup(varA).shape[-1] + NOffset = 0 + + inputACubes = [] + inputBCubes = [] + + replacements = {"M": [], "O": [], "batch": []} + + # Every output is constructed by a pair of inputs. Reconstruct this pair. 
+ for cube in outputCubes: + + BSize = 1 + BOffset = 0 + BatchSize = 1 + BatchOffset = 0 + + if len(cube.offset) == 2: + (MOffset, OOffset) = cube.offset + (MSize, OSize) = cube.dims + elif len(cube.offset) == 3: + (BatchOffset, MOffset, OOffset) = cube.offset + (BatchSize, MSize, OSize) = cube.dims + else: + (BatchOffset, BOffset, MOffset, OOffset) = cube.offset + (BatchSize, BSize, MSize, OSize) = cube.dims + + replacements["M"].append(MSize) + replacements["O"].append(OSize) + replacements["batch"].append(BSize) + + ACube = HyperRectangle((BatchOffset, BOffset, MOffset, NOffset), (BatchSize, BSize, MSize, NSize)) + BCube = HyperRectangle((BatchOffset, BOffset, NOffset, OOffset), (BatchSize, BSize, NSize, OSize)) + + inputACubes.append(ACube) + inputBCubes.append(BCube) + + inputLoadSchedule = [] + outputLoadSchedule = [] + + replacements["N"] = [NSize] * len(outputCubes) + + replacementTypes = { + "M": PointerClass(int8_t), + "N": PointerClass(int8_t), + "O": PointerClass(int8_t), + "batch": PointerClass(int8_t) + } + + for a, b in zip(inputACubes, inputBCubes): + inputLoadSchedule.append({"A": a, "B": b}) + + for out in outputCubes: + outputLoadSchedule.append({"data_out": out}) + + schedule = TilingSchedule(inputBaseOffsets, outputBaseOffsets, inputLoadSchedule, outputLoadSchedule) + + return VariableReplacementScheme(replacements, replacementTypes), schedule diff --git a/Deeploy/Targets/Redmule/TileConstraints/__init__.py b/Deeploy/Targets/Redmule/TileConstraints/__init__.py new file mode 100644 index 0000000000..a73187ca8f --- /dev/null +++ b/Deeploy/Targets/Redmule/TileConstraints/__init__.py @@ -0,0 +1,26 @@ +# ---------------------------------------------------------------------- +# +# File: __init__.py +# +# Last edited: 08.05.2025 +# +# Copyright (C) 2024, ETH Zurich and University of Bologna. 
+# +# Author: Run Wang, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . import * diff --git a/Deeploy/Targets/Redmule/Tiler.py b/Deeploy/Targets/Redmule/Tiler.py new file mode 100644 index 0000000000..1db49cb4a3 --- /dev/null +++ b/Deeploy/Targets/Redmule/Tiler.py @@ -0,0 +1,37 @@ +# ---------------------------------------------------------------------- +# +# File: Tiler.py +# +# Last edited: 26.07.2024 +# +# Copyright (C) 2024, ETH Zurich and University of Bologna. +# +# Author: Moritz Scherer, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+from Deeploy.Targets.Redmule.Bindings import RedmuleConv2DBindings, RedmuleGEMMBindings, RedmuleMatmulBindings
+from Deeploy.Targets.Redmule.TileConstraints.ConvTileConstraint import RedmuleConv2DTileConstraint
+from Deeploy.Targets.Redmule.TileConstraints.GEMMTileConstraint import RedmuleGEMMTileConstraint
+from Deeploy.Targets.Redmule.TileConstraints.MatmulTileConstraint import RedmuleMatmulTileConstraint
+from Deeploy.TilingExtension.TilerExtension import TilingReadyNodeBindings
+
+RedmuleMatMulTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = RedmuleMatmulBindings,
+                                                           tileConstraint = RedmuleMatmulTileConstraint())
+RedmuleConvTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = RedmuleConv2DBindings,
+                                                         tileConstraint = RedmuleConv2DTileConstraint())
+RedmuleGEMMTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = RedmuleGEMMBindings,
+                                                         tileConstraint = RedmuleGEMMTileConstraint())
diff --git a/Deeploy/Targets/Redmule/TopologyOptimizationPasses/Passes.py b/Deeploy/Targets/Redmule/TopologyOptimizationPasses/Passes.py
new file mode 100644
index 0000000000..f9d3d95b57
--- /dev/null
+++ b/Deeploy/Targets/Redmule/TopologyOptimizationPasses/Passes.py
@@ -0,0 +1,150 @@
+# ----------------------------------------------------------------------
+#
+# File: Passes.py
+#
+# Last edited: 09.05.2025
+#
+# Copyright (C) 2025, ETH Zurich and University of Bologna.
+#
+# Author: Run Wang, ETH Zurich
+#
+# ----------------------------------------------------------------------
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import onnx_graphsurgeon as gs + +from Deeploy.CommonExtensions.OptimizationPasses.Matchers import Match +from Deeploy.CommonExtensions.OptimizationPasses.PassClasses import ReplaceSequentialPatternPass, contextagnostic +from Deeploy.CommonExtensions.OptimizationPasses.TopologyOptimizationPasses.LoweringOptimizationPasses import \ + _appendTranspose + + +def _redmule_weight_layout_fun(graph: gs.Graph, match: Match, name: str): + """Convert Conv weights from [cout, h, w, cin] to [h,w,cin, cout] for RedMule accelerator""" + node = list(match.nodes_map.values())[0] + + weightTensor = node.inputs[1] + if isinstance(weightTensor, gs.Constant): + weightTensor.values = np.transpose(weightTensor.values, (1, 2, 3, 0)) + + return graph + + +@contextagnostic +class RedMuleAdjustWeightMemoryLayoutPass(ReplaceSequentialPatternPass): + """Pass to convert Conv weights from [cout, h, w, cin] to [hwcin, cout] for RedMule accelerator""" + + def __init__(self, redmuleEngineName: str): + graph = gs.Graph() + _input = gs.Variable(name = 'input_1') + output = graph.layer(inputs = [_input], outputs = ['convOut'], op = 'Conv', name = 'conv') + graph.outputs.append(output) + graph.inputs.append(_input) + + super().__init__(graph, _redmule_weight_layout_fun, "_REDMULE_ADJUST_WEIGHT_MEMORY_LAYOUT_PASS") + + +def _redmule_gemm_transpose_fun(graph: gs.Graph, match: Match, name: str): + """ + Handle GEMM transA and transB attributes for RedMule accelerator + + Properly handles tensors of any dimensionality, ensuring only the last two + dimensions are transposed when needed. 
+ """ + matched_nodes = [m for k, m in match.nodes_map.items()] + gemm_node = matched_nodes[0] + + if 'transA' not in gemm_node.attrs: + gemm_node.attrs['transA'] = 0 + if 'transB' not in gemm_node.attrs: + gemm_node.attrs['transB'] = 0 + if 'alpha' not in gemm_node.attrs: + gemm_node.attrs['alpha'] = 1.0 + if 'beta' not in gemm_node.attrs: + gemm_node.attrs['beta'] = 1.0 + + inputA = gemm_node.inputs[0] + inputB = gemm_node.inputs[1] + + if gemm_node.attrs['transA'] != 0: + if isinstance(inputA, gs.Constant): + print(f"Physical transpose for constant A: {inputA.name}") + + if len(inputA.values.shape) > 2: + perm = list(range(len(inputA.values.shape))) + perm[-1], perm[-2] = perm[-2], perm[-1] + inputA.values = np.transpose(inputA.values, perm) + else: + inputA.values = np.transpose(inputA.values) + + gemm_node.attrs['transA'] = 0 + else: + + perm = list(range(len(inputA.shape))) + perm[-1], perm[-2] = perm[-2], perm[-1] + + anchorTransposeNode = _appendTranspose(inputA, gemm_node, perm) + gemm_node.attrs['transA'] = 0 + graph.nodes.append(anchorTransposeNode) + + if gemm_node.attrs['transB'] != 0: + if isinstance(inputB, gs.Constant): + + if len(inputB.values.shape) > 2: + + perm = list(range(len(inputB.values.shape))) + perm[-1], perm[-2] = perm[-2], perm[-1] + + inputB.values = np.transpose(inputB.values, perm) + else: + inputB.values = np.transpose(inputB.values) + + gemm_node.attrs['transB'] = 0 + else: + print(f"Adding transpose node for variable B: {inputB.name}") + + perm = list(range(len(inputB.shape))) + perm[-1], perm[-2] = perm[-2], perm[-1] + + anchorTransposeNode = _appendTranspose(inputB, gemm_node, perm) + gemm_node.attrs['transB'] = 0 + graph.nodes.append(anchorTransposeNode) + + return graph + + +@contextagnostic +class RedMuleGEMMTransposePass(ReplaceSequentialPatternPass): + """Pass to handle GEMM transA and transB attributes for RedMule accelerator""" + + def __init__(self, redmuleEngineName: str): + + pattern = gs.Graph() + + input_a = 
gs.Variable(name = "input_a") + input_b = gs.Variable(name = "input_b") + + gemm_output = pattern.layer(op = "Gemm", + name = "gemm_node", + inputs = [input_a, input_b], + outputs = ["gemm_output"]) + + pattern.inputs = [input_a, input_b] + pattern.outputs = [gemm_output] + + super().__init__(pattern = pattern, + replacement_fn = _redmule_gemm_transpose_fun, + name = "_REDMULE_GEMM_TRANSPOSE_PASS") diff --git a/Deeploy/Targets/Redmule/TopologyOptimizationPasses/__init__.py b/Deeploy/Targets/Redmule/TopologyOptimizationPasses/__init__.py new file mode 100644 index 0000000000..63063b6066 --- /dev/null +++ b/Deeploy/Targets/Redmule/TopologyOptimizationPasses/__init__.py @@ -0,0 +1,26 @@ +# ---------------------------------------------------------------------- +# +# File: __init__.py +# +# Last edited: 09.05.2025 +# +# Copyright (C) 2024, ETH Zurich and University of Bologna. +# +# Author: Run Wang, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . 
import * diff --git a/Deeploy/Targets/Redmule/__init__.py b/Deeploy/Targets/Redmule/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/Deeploy/Targets/SoftHier/__init__.py b/Deeploy/Targets/SoftHier/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/DeeployTest/CMakeLists.txt b/DeeployTest/CMakeLists.txt index b7f3535790..d94bf1af07 100644 --- a/DeeployTest/CMakeLists.txt +++ b/DeeployTest/CMakeLists.txt @@ -41,7 +41,7 @@ elseif(DEEPLOY_ARCH STREQUAL PULP) -Wno-pointer-sign ) - if(platform STREQUAL Siracusa OR platform STREQUAL Siracusa_w_neureka) + if(platform STREQUAL Siracusa OR platform STREQUAL Siracusa_w_neureka OR platform STREQUAL Siracusa_w_redmule) add_subdirectory(Platforms/Siracusa) elseif(platform STREQUAL PULPOpen) add_subdirectory(Platforms/PULPOpen) diff --git a/DeeployTest/conftest.py b/DeeployTest/conftest.py index c7077067d9..a2be5e0cf3 100644 --- a/DeeployTest/conftest.py +++ b/DeeployTest/conftest.py @@ -66,6 +66,8 @@ def pytest_configure(config: pytest.Config) -> None: config.addinivalue_line("markers", "siracusa_tiled: mark test as a Siracusa platform test (tiled)") config.addinivalue_line("markers", "siracusa_neureka_tiled: mark test as a Siracusa + Neureka platform test (tiled)") + config.addinivalue_line("markers", + "siracusa_redmule_tiled: mark test as a Siracusa + RedMulE platform test (tiled)") config.addinivalue_line("markers", "gap9: mark test as a GAP9 platform test") config.addinivalue_line("markers", "gap9_tiled: mark test as a GAP9 platform test (tiled)") config.addinivalue_line("markers", "kernels: mark test as a kernel test (individual operators)") diff --git a/DeeployTest/testRunner_tiled_siracusa_w_redmule.py b/DeeployTest/testRunner_tiled_siracusa_w_redmule.py new file mode 100644 index 0000000000..9ebd9c6358 --- /dev/null +++ b/DeeployTest/testRunner_tiled_siracusa_w_redmule.py @@ -0,0 +1,49 @@ +# ---------------------------------------------------------------------- +# +# 
File: testRunner_tiled_siracusa_w_redmule.py
+#
+# Last edited: 08.05.2025
+#
+# Copyright (C) 2023, ETH Zurich and University of Bologna.
+#
+# Author: Run Wang, ETH Zurich
+#
+# ----------------------------------------------------------------------
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from testUtils.testRunner import TestRunner, TestRunnerArgumentParser
+
+if __name__ == "__main__":
+
+    parser = TestRunnerArgumentParser(
+        tiling_arguments = True,
+        description = "Deeploy Code Generation Utility for the Siracusa Platform (Tiling & Redmule).")
+
+    parser.add_argument('--cores',
+                        metavar = '',
+                        dest = 'cores',
+                        type = int,
+                        default = 1,
+                        help = 'Set number of cluster cores')
+    args = parser.parse_args()
+
+    testRunner = TestRunner(platform = "Siracusa_w_redmule",
+                            simulator = "gvsoc",
+                            tiling = True,
+                            argument_parser = parser)
+
+    testRunner.cmake_args += f" -D NUM_CORES={args.cores}"
+
+    testRunner.run()
diff --git a/DeeployTest/testUtils/platformMapping.py b/DeeployTest/testUtils/platformMapping.py
index 9d526906f9..32c06c95cd 100644
--- a/DeeployTest/testUtils/platformMapping.py
+++ b/DeeployTest/testUtils/platformMapping.py
@@ -25,13 +25,17 @@
 NeurekaPlatform
 from Deeploy.Targets.PULPOpen.Deployer import PULPDeployer
 from Deeploy.Targets.PULPOpen.Platform import MemoryPULPPlatform, MemoryPULPPlatformWrapper, PULPOptimizer, PULPPlatform
+from Deeploy.Targets.Redmule.Deployer import RedmuleDeployer
+from
Deeploy.Targets.Redmule.Platform import RedmuleOptimizer, RedmulePlatform from Deeploy.Targets.Snitch.Deployer import SnitchDeployer from Deeploy.Targets.Snitch.Platform import SnitchOptimizer, SnitchPlatform from Deeploy.Targets.SoftHier.Deployer import SoftHierDeployer from Deeploy.Targets.SoftHier.Platform import SoftHierOptimizer, SoftHierPlatform _SIGNPROP_PLATFORMS = ["Apollo3", "Apollo4", "QEMU-ARM", "Generic", "MemPool", "SoftHier"] -_NONSIGNPROP_PLATFORMS = ["Siracusa", "Siracusa_w_neureka", "PULPOpen", "Snitch", "Chimera", "GAP9"] +_NONSIGNPROP_PLATFORMS = [ + "Siracusa", "Siracusa_w_neureka", "Siracusa_w_redmule", "PULPOpen", "Snitch", "Chimera", "GAP9" +] _PLATFORMS = _SIGNPROP_PLATFORMS + _NONSIGNPROP_PLATFORMS @@ -67,6 +71,9 @@ def mapPlatform(platformName: str) -> Tuple[DeploymentPlatform, bool]: elif platformName == "Siracusa_w_neureka": Platform = NeurekaPlatform() + elif platformName == "Siracusa_w_redmule": + Platform = RedmulePlatform() + elif platformName == "Snitch": Platform = SnitchPlatform() @@ -84,7 +91,7 @@ def mapPlatform(platformName: str) -> Tuple[DeploymentPlatform, bool]: def setupMemoryPlatform(platform: DeploymentPlatform, memoryHierarchy: MemoryHierarchy, defaultTargetMemoryLevel: MemoryLevel) -> Union[MemoryPlatform, MemoryPlatformWrapper]: - if isinstance(platform, PULPPlatform): + if isinstance(platform, (PULPPlatform, RedmulePlatform)): return MemoryPULPPlatformWrapper(platform, memoryHierarchy, defaultTargetMemoryLevel) elif isinstance(platform, NeurekaPlatform): weightMemoryLevel = memoryHierarchy.memoryLevels["WeightMemory_SRAM"] \ @@ -207,6 +214,24 @@ def mapDeployer(platform: DeploymentPlatform, default_channels_first = default_channels_first, deeployStateDir = deeployStateDir) + elif isinstance(platform, RedmulePlatform): + + if loweringOptimizer is None: + loweringOptimizer = RedmuleOptimizer + + if default_channels_first is None: + default_channels_first = False + + deployer = RedmuleDeployer(graph, + platform, + 
inputTypes, + loweringOptimizer, + scheduler, + name = name, + default_channels_first = default_channels_first, + deeployStateDir = deeployStateDir, + inputOffsets = inputOffsets) + elif isinstance(platform, (GAP9Platform, MemoryGAP9Platform, MemoryGAP9PlatformWrapper)): if loweringOptimizer is None: diff --git a/DeeployTest/test_platforms.py b/DeeployTest/test_platforms.py index 6d9f3cfcd7..4add6e1fc9 100644 --- a/DeeployTest/test_platforms.py +++ b/DeeployTest/test_platforms.py @@ -33,6 +33,9 @@ from test_siracusa_neureka_tiled_config import L3_DOUBLEBUFFER_MODELS as NEUREKA_L3_DOUBLEBUFFER_MODELS from test_siracusa_neureka_tiled_config import L3_DOUBLEBUFFER_MODELS_WMEM as NEUREKA_L3_DOUBLEBUFFER_MODELS_WMEM from test_siracusa_neureka_tiled_config import L3_SINGLEBUFFER_MODELS as NEUREKA_L3_SINGLEBUFFER_MODELS +from test_siracusa_redmule_tiled_config import DEFAULT_CORES as REDMULE_DEFAULT_CORES +from test_siracusa_redmule_tiled_config import L2_DOUBLEBUFFER_KERNELS as REDMULE_L2_DOUBLEBUFFER_KERNELS +from test_siracusa_redmule_tiled_config import L2_SINGLEBUFFER_KERNELS as REDMULE_L2_SINGLEBUFFER_KERNELS from test_siracusa_tiled_config import L2_DOUBLEBUFFER_KERNELS, L2_DOUBLEBUFFER_MODELS, L2_SINGLEBUFFER_KERNELS, \ L2_SINGLEBUFFER_MODELS, L3_DOUBLEBUFFER_MODELS, L3_SINGLEBUFFER_MODELS from test_snitch_config import DEFAULT_NUM_CORES as SNITCH_DEFAULT_NUM_CORES @@ -987,3 +990,61 @@ def test_gap9_tiled_models_l3_doublebuffer(test_params, deeploy_test_dir, toolch double_buffer = True, ) run_and_assert_test(test_name, config, skipgen, skipsim) + + +@pytest.mark.siracusa_redmule_tiled +@pytest.mark.kernels +@pytest.mark.singlebuffer +@pytest.mark.l2 +@pytest.mark.parametrize( + "test_params", + generate_test_params(REDMULE_L2_SINGLEBUFFER_KERNELS, "L2-singlebuffer"), + ids = param_id, +) +def test_siracusa_redmule_tiled_kernels_l2_singlebuffer(test_params, deeploy_test_dir, toolchain, toolchain_dir, + cmake_args, skipgen, skipsim) -> None: + test_name, l1, 
config_name = test_params + config = create_test_config( + test_name = test_name, + platform = "Siracusa_w_redmule", + simulator = "gvsoc", + deeploy_test_dir = deeploy_test_dir, + toolchain = toolchain, + toolchain_dir = toolchain_dir, + cmake_args = cmake_args, + tiling = True, + cores = REDMULE_DEFAULT_CORES, + l1 = l1, + default_mem_level = "L2", + double_buffer = False, + ) + run_and_assert_test(test_name, config, skipgen, skipsim) + + +@pytest.mark.siracusa_redmule_tiled +@pytest.mark.kernels +@pytest.mark.doublebuffer +@pytest.mark.l2 +@pytest.mark.parametrize( + "test_params", + generate_test_params(REDMULE_L2_DOUBLEBUFFER_KERNELS, "L2-doublebuffer"), + ids = param_id, +) +def test_siracusa_redmule_tiled_kernels_l2_doublebuffer(test_params, deeploy_test_dir, toolchain, toolchain_dir, + cmake_args, skipgen, skipsim) -> None: + test_name, l1, config_name = test_params + config = create_test_config( + test_name = test_name, + platform = "Siracusa_w_redmule", + simulator = "gvsoc", + deeploy_test_dir = deeploy_test_dir, + toolchain = toolchain, + toolchain_dir = toolchain_dir, + cmake_args = cmake_args, + tiling = True, + cores = REDMULE_DEFAULT_CORES, + l1 = l1, + default_mem_level = "L2", + double_buffer = True, + ) + run_and_assert_test(test_name, config, skipgen, skipsim) diff --git a/DeeployTest/test_siracusa_redmule_tiled_config.py b/DeeployTest/test_siracusa_redmule_tiled_config.py new file mode 100644 index 0000000000..08883e0cb7 --- /dev/null +++ b/DeeployTest/test_siracusa_redmule_tiled_config.py @@ -0,0 +1,21 @@ +# SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 +"""Test configuration for Siracusa platform with RedMulE accelerator (tiled).""" + +# Siracusa + RedMulE platform with tiling support +# Default configuration: 8 cores, gvsoc simulator + +DEFAULT_CORES = 8 + +# L2 single-buffer kernel tests +# Format: dict of {test_name: [L1_sizes]} +L2_SINGLEBUFFER_KERNELS = { + 
"Kernels/FP32/GEMM/Regular": [8000], + "Kernels/FP32/GEMM/TransB": [8000], +} + +# L2 double-buffer kernel tests +L2_DOUBLEBUFFER_KERNELS = { + "Kernels/FP32/GEMM/Regular": [8000], +} diff --git a/Makefile b/Makefile index f007f105c1..423c3b8d70 100644 --- a/Makefile +++ b/Makefile @@ -49,7 +49,7 @@ PULP_SDK_COMMIT_HASH ?= 7f4f22516157a1b7c55bcbbc72ca81326180b3b4 MEMPOOL_COMMIT_HASH ?= affd45d94e05e375a6966af6a762deeb182a7bd6 SNITCH_COMMIT_HASH ?= e02cc9e3f24b92d4607455d5345caba3eb6273b2 SOFTHIER_COMMIT_HASH ?= 0 # bowwang: to be updated -GVSOC_COMMIT_HASH ?= edfcd8398840ceb1e151711befa06678b05f06a0 +GVSOC_COMMIT_HASH ?= 35d00d15d7249daaac0de61bd8485fba128e5959 MINIMALLOC_COMMMIT_HASH ?= e9eaf54094025e1c246f9ec231b905f8ef42a29d CHIMERA_SDK_COMMIT_HASH ?= b2392f6efcff75c03f4c65eaf3e12104442b22ea XTL_VERSION ?= 0.7.5 @@ -465,7 +465,7 @@ snitch_runtime: ${SNITCH_INSTALL_DIR} ${TOOLCHAIN_DIR}/gvsoc: cd ${TOOLCHAIN_DIR} && \ - git clone https://github.com/gvsoc/gvsoc.git && \ + git clone https://github.com/runwangdl/gvsoc.git && \ cd ${TOOLCHAIN_DIR}/gvsoc && git checkout ${GVSOC_COMMIT_HASH} && \ git submodule update --init --recursive && \ pip install -r core/requirements.txt && pip install -r gapy/requirements.txt diff --git a/TargetLibraries/PULPOpen/CMakeLists.txt b/TargetLibraries/PULPOpen/CMakeLists.txt index 1a510c945b..d136ed3507 100644 --- a/TargetLibraries/PULPOpen/CMakeLists.txt +++ b/TargetLibraries/PULPOpen/CMakeLists.txt @@ -10,7 +10,7 @@ if(NOT DEFINED ENV{PULP_SDK_HOME}) message(FATAL_ERROR "Environment variable PULP_SDK_HOME not set.") endif() -if(platform STREQUAL "Siracusa" OR platform STREQUAL "Siracusa_w_neureka") +if(platform STREQUAL "Siracusa" OR platform STREQUAL "Siracusa_w_neureka" OR platform STREQUAL "Siracusa_w_redmule") include(cmake/pulp-sdk-siracusa.cmake) elseif(platform STREQUAL "PULPOpen") include(cmake/pulp-sdk-pulp-open.cmake) diff --git a/TargetLibraries/PULPOpen/src/Matmul_fp32_Redmule.c 
b/TargetLibraries/PULPOpen/src/Matmul_fp32_Redmule.c new file mode 100644 index 0000000000..ad33b66b43 --- /dev/null +++ b/TargetLibraries/PULPOpen/src/Matmul_fp32_Redmule.c @@ -0,0 +1,160 @@ +// SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna +// +// SPDX-License-Identifier: Apache-2.0 + +#include "DeeployBasicMath.h" + +#define REDMULE_BASE_ADDR 0x10201C00 + +#define REG_MNK_M 0x00 +#define REG_MNK_N 0x04 +#define REG_MNK_K 0x08 +#define REG_X_ADDR 0x0C +#define REG_Y_ADDR 0x10 +#define REG_Z_ADDR 0x14 +#define REG_W_ADDR 0x18 +#define REG_COMPUTE_MODE 0x1C +#define REG_TRIGGER 0x20 +#define REG_WAIT 0x28 + +void MatMul_fp32_fp32_fp32_Redmule(const float32_t *__restrict__ pSrcA, + const float32_t *__restrict__ pSrcB, + float32_t *__restrict__ pDstY, uint32_t M, + uint32_t N, uint32_t O) { + + uint32_t total_elements = M * O; + for (uint32_t i = 0; i < total_elements; i++) { + pDstY[i] = 0.0f; + } + + volatile uint16_t *mnk_m = + (volatile uint16_t *)(REDMULE_BASE_ADDR + REG_MNK_M); + volatile uint16_t *mnk_n = + (volatile uint16_t *)(REDMULE_BASE_ADDR + REG_MNK_N); + volatile uint16_t *mnk_k = + (volatile uint16_t *)(REDMULE_BASE_ADDR + REG_MNK_K); + + *mnk_m = (uint16_t)M; + *mnk_n = (uint16_t)N; + *mnk_k = (uint16_t)O; + + volatile uint32_t *x_addr = + (volatile uint32_t *)(REDMULE_BASE_ADDR + REG_X_ADDR); + volatile uint32_t *y_addr = + (volatile uint32_t *)(REDMULE_BASE_ADDR + REG_Y_ADDR); + volatile uint32_t *z_addr = + (volatile uint32_t *)(REDMULE_BASE_ADDR + REG_Z_ADDR); + volatile uint32_t *w_addr = + (volatile uint32_t *)(REDMULE_BASE_ADDR + REG_W_ADDR); + + *x_addr = (uint32_t)((uintptr_t)pSrcA); + *y_addr = (uint32_t)((uintptr_t)pDstY); + *z_addr = (uint32_t)((uintptr_t)pDstY); + *w_addr = (uint32_t)((uintptr_t)pSrcB); + + volatile uint32_t *compute_mode = + (volatile uint32_t *)(REDMULE_BASE_ADDR + REG_COMPUTE_MODE); + *compute_mode = 4; // FP32 mode + + volatile uint32_t *trigger = + (volatile uint32_t *)(REDMULE_BASE_ADDR + 
REG_TRIGGER); + *trigger; + + volatile uint32_t *wait_reg = + (volatile uint32_t *)(REDMULE_BASE_ADDR + REG_WAIT); + uint32_t result = *wait_reg; +} + +void MatMul_fp32_fp32_fp32_Redmule_Async(const float32_t *__restrict__ pSrcA, + const float32_t *__restrict__ pSrcB, + float32_t *__restrict__ pDstY, + uint32_t M, uint32_t N, uint32_t O) { + + uint32_t total_elements = M * O; + for (uint32_t i = 0; i < total_elements; i++) { + pDstY[i] = 0.0f; + } + + volatile uint16_t *mnk_m = + (volatile uint16_t *)(REDMULE_BASE_ADDR + REG_MNK_M); + volatile uint16_t *mnk_n = + (volatile uint16_t *)(REDMULE_BASE_ADDR + REG_MNK_N); + volatile uint16_t *mnk_k = + (volatile uint16_t *)(REDMULE_BASE_ADDR + REG_MNK_K); + + *mnk_m = (uint16_t)M; + *mnk_n = (uint16_t)N; + *mnk_k = (uint16_t)O; + + volatile uint32_t *x_addr = + (volatile uint32_t *)(REDMULE_BASE_ADDR + REG_X_ADDR); + volatile uint32_t *y_addr = + (volatile uint32_t *)(REDMULE_BASE_ADDR + REG_Y_ADDR); + volatile uint32_t *z_addr = + (volatile uint32_t *)(REDMULE_BASE_ADDR + REG_Z_ADDR); + volatile uint32_t *w_addr = + (volatile uint32_t *)(REDMULE_BASE_ADDR + REG_W_ADDR); + + *x_addr = (uint32_t)((uintptr_t)pSrcA); + *y_addr = (uint32_t)((uintptr_t)pDstY); + *z_addr = (uint32_t)((uintptr_t)pDstY); + *w_addr = (uint32_t)((uintptr_t)pSrcB); + + volatile uint32_t *compute_mode = + (volatile uint32_t *)(REDMULE_BASE_ADDR + REG_COMPUTE_MODE); + *compute_mode = 4; // FP32 mode + + volatile uint32_t *trigger = + (volatile uint32_t *)(REDMULE_BASE_ADDR + REG_TRIGGER); + *trigger; // Trigger without waiting +} + +uint32_t MatMul_fp32_fp32_fp32_Redmule_Wait() { + volatile uint32_t *wait_reg = + (volatile uint32_t *)(REDMULE_BASE_ADDR + REG_WAIT); + return *wait_reg; +} + +void Gemm_fp32_fp32_fp32_fp32_Redmule(const float32_t *__restrict__ pSrcA, + const float32_t *__restrict__ pSrcB, + const float32_t *__restrict__ pBias, + float32_t *__restrict__ pDstY, uint32_t M, + uint32_t N, uint32_t O) { + + volatile uint16_t *mnk_m = + 
(volatile uint16_t *)(REDMULE_BASE_ADDR + REG_MNK_M); + volatile uint16_t *mnk_n = + (volatile uint16_t *)(REDMULE_BASE_ADDR + REG_MNK_N); + volatile uint16_t *mnk_k = + (volatile uint16_t *)(REDMULE_BASE_ADDR + REG_MNK_K); + + *mnk_m = (uint16_t)M; + *mnk_n = (uint16_t)N; + *mnk_k = (uint16_t)O; + + volatile uint32_t *x_addr = + (volatile uint32_t *)(REDMULE_BASE_ADDR + REG_X_ADDR); + volatile uint32_t *y_addr = + (volatile uint32_t *)(REDMULE_BASE_ADDR + REG_Y_ADDR); + volatile uint32_t *z_addr = + (volatile uint32_t *)(REDMULE_BASE_ADDR + REG_Z_ADDR); + volatile uint32_t *w_addr = + (volatile uint32_t *)(REDMULE_BASE_ADDR + REG_W_ADDR); + + *x_addr = (uint32_t)((uintptr_t)pSrcA); + *y_addr = (uint32_t)((uintptr_t)pBias); + *z_addr = (uint32_t)((uintptr_t)pDstY); + *w_addr = (uint32_t)((uintptr_t)pSrcB); + + volatile uint32_t *compute_mode = + (volatile uint32_t *)(REDMULE_BASE_ADDR + REG_COMPUTE_MODE); + *compute_mode = 4; // FP32 mode + + volatile uint32_t *trigger = + (volatile uint32_t *)(REDMULE_BASE_ADDR + REG_TRIGGER); + *trigger; + + volatile uint32_t *wait_reg = + (volatile uint32_t *)(REDMULE_BASE_ADDR + REG_WAIT); + uint32_t result = *wait_reg; +}