diff --git a/.github/workflows/_runner-siracusa-redmule-tiled.yml b/.github/workflows/_runner-siracusa-redmule-tiled.yml new file mode 100644 index 0000000000..73f0c9e0a6 --- /dev/null +++ b/.github/workflows/_runner-siracusa-redmule-tiled.yml @@ -0,0 +1,45 @@ +# SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +--- +name: _runner-siracusa-redmule-tiled-sequential + +"on": + workflow_call: + inputs: + runner: + required: true + type: string + docker-image: + required: true + type: string + pytest-marker: + required: true + type: string + +jobs: + test-runner-siracusa-redmule-tiled: + runs-on: ${{ inputs.runner }} + container: + image: ${{ inputs.docker-image }} + credentials: + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + steps: + - name: Mark workspace as safe + run: git config --global --add safe.directory '*' + - name: Checkout Repo + uses: actions/checkout@v4 + with: + submodules: recursive + - name: Build Deeploy + shell: bash + run: pip install -e . + - name: Run Test + run: | + cd DeeployTest + mkdir -p /app/.ccache + export CCACHE_DIR=/app/.ccache + pytest test_platforms.py -v -n 4 -m "siracusa_redmule_tiled and ${{ inputs.pytest-marker }}" + shell: bash diff --git a/.github/workflows/ci-platform-gap9-tiled.yml b/.github/workflows/ci-platform-gap9-tiled.yml index 61cab4ea70..bd6ea822a9 100644 --- a/.github/workflows/ci-platform-gap9-tiled.yml +++ b/.github/workflows/ci-platform-gap9-tiled.yml @@ -24,12 +24,16 @@ concurrency: cancel-in-progress: true jobs: + # GAP9 CI requires access to the private ghcr.io/pulp-platform/deeploy-gap9 + # image; gate on upstream org so forks skip cleanly. 
select-env: + if: github.repository_owner == 'pulp-platform' uses: ./.github/workflows/_select-env.yml with: docker_image_deeploy: ${{ github.event.inputs.docker_image_deeploy || 'ghcr.io/pulp-platform/deeploy-gap9:devel' }} gap9-kernels-tiled-singlebuffer-L2: + if: github.repository_owner == 'pulp-platform' needs: select-env uses: ./.github/workflows/_runner-gap9-tiled.yml with: @@ -38,6 +42,7 @@ jobs: pytest-markers: "gap9_tiled and kernels and singlebuffer and l2" gap9-kernels-tiled-doublebuffer-L2: + if: github.repository_owner == 'pulp-platform' needs: select-env uses: ./.github/workflows/_runner-gap9-tiled.yml with: @@ -46,6 +51,7 @@ jobs: pytest-markers: "gap9_tiled and kernels and doublebuffer and l2" gap9-models-tiled-singlebuffer-L2: + if: github.repository_owner == 'pulp-platform' needs: select-env uses: ./.github/workflows/_runner-gap9-tiled.yml with: @@ -54,6 +60,7 @@ jobs: pytest-markers: "gap9_tiled and models and singlebuffer and l2" gap9-models-tiled-doublebuffer-L2: + if: github.repository_owner == 'pulp-platform' needs: select-env uses: ./.github/workflows/_runner-gap9-tiled.yml with: diff --git a/.github/workflows/ci-platform-gap9.yml b/.github/workflows/ci-platform-gap9.yml index 014828d6ce..fbf5398485 100644 --- a/.github/workflows/ci-platform-gap9.yml +++ b/.github/workflows/ci-platform-gap9.yml @@ -25,12 +25,16 @@ concurrency: cancel-in-progress: true jobs: + # GAP9 CI requires access to the private ghcr.io/pulp-platform/deeploy-gap9 + # image; gate on upstream org so forks skip cleanly. 
select-env: + if: github.repository_owner == 'pulp-platform' uses: ./.github/workflows/_select-env.yml with: docker_image_deeploy: ${{ github.event.inputs.docker_image_deeploy || 'ghcr.io/pulp-platform/deeploy-gap9:devel' }} gap9-kernels: + if: github.repository_owner == 'pulp-platform' needs: select-env uses: ./.github/workflows/_runner-gap9.yml with: @@ -39,6 +43,7 @@ jobs: pytest-marker: "kernels" gap9-models: + if: github.repository_owner == 'pulp-platform' needs: select-env uses: ./.github/workflows/_runner-gap9.yml with: diff --git a/.github/workflows/ci-platform-siracusa-redmule-tiled.yml b/.github/workflows/ci-platform-siracusa-redmule-tiled.yml new file mode 100644 index 0000000000..3fe8b8cb4f --- /dev/null +++ b/.github/workflows/ci-platform-siracusa-redmule-tiled.yml @@ -0,0 +1,50 @@ +# SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 + +--- +name: CI • Siracusa + RedMulE (Tiled) + +"on": + push: + branches: + - "**" + tags: + - "v*.*.*" + pull_request: + workflow_dispatch: + inputs: + docker_image_deeploy: + description: "Deeploy Image to use" + required: false + default: "ghcr.io/runwangdl/deeploy:redmule" + +concurrency: + group: ${{ github.workflow }}-${{ github.ref }} + cancel-in-progress: true + +jobs: + select-env: + uses: ./.github/workflows/_select-env.yml + with: + # RedMulE CI needs the fork's custom Docker image that bundles a + # GVSoC build with the light_redmule model. Fall back to + # runwangdl/deeploy:redmule on push/PR events (when no input is + # provided) rather than the upstream devel image. 
+ docker_image_deeploy: ${{ inputs.docker_image_deeploy || 'ghcr.io/runwangdl/deeploy:redmule' }} + + siracusa-redmule-kernels-tiled-singlebuffer-L2: + needs: select-env + uses: ./.github/workflows/_runner-siracusa-redmule-tiled.yml + with: + runner: ${{ needs.select-env.outputs.runner }} + docker-image: ${{ needs.select-env.outputs.image }} + pytest-marker: "kernels and singlebuffer and l2" + + siracusa-redmule-kernels-tiled-doublebuffer-L2: + needs: select-env + uses: ./.github/workflows/_runner-siracusa-redmule-tiled.yml + with: + runner: ${{ needs.select-env.outputs.runner }} + docker-image: ${{ needs.select-env.outputs.image }} + pytest-marker: "kernels and doublebuffer and l2" diff --git a/.github/workflows/infra-generate-documentation.yml b/.github/workflows/infra-generate-documentation.yml index de42648f63..1394805c70 100644 --- a/.github/workflows/infra-generate-documentation.yml +++ b/.github/workflows/infra-generate-documentation.yml @@ -32,12 +32,12 @@ jobs: sphinx-build docs _build - name: Prepare Multipages uses: xeratec/gh-pages-multibranch@pr/support_tags - if: ${{ github.event_name == 'push' || github.event_name == 'workflow_dispatch' }} + if: ${{ (github.event_name == 'push' || github.event_name == 'workflow_dispatch') && github.repository_owner == 'pulp-platform' }} with: directory: _build - name: Deploy to GitHub Pages uses: peaceiris/actions-gh-pages@v3 - if: ${{ github.event_name == 'push' || github.event_name == 'workflow_dispatch'}} + if: ${{ (github.event_name == 'push' || github.event_name == 'workflow_dispatch') && github.repository_owner == 'pulp-platform' }} with: publish_branch: gh-pages github_token: ${{ secrets.GITHUB_TOKEN }} diff --git a/CMakeLists.txt b/CMakeLists.txt index 0e07d64a9e..3bafd2257c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -20,8 +20,8 @@ if(TOOLCHAIN STREQUAL GCC) set(CMAKE_INTERPROCEDURAL_OPTIMIZATION TRUE) endif() -set(platform MemPool CACHE STRING "Platform (MemPool, SoftHier, QEMU, Siracusa, 
Siracusa_w_neureka, PULP-Open, GAP9, Generic, Snitch)") -set_property(CACHE platform PROPERTY STRINGS MemPool SoftHier QEMU Siracusa Siracusa_w_neureka PULP-Open GAP9 Generic Snitch) +set(platform MemPool CACHE STRING "Platform (MemPool, SoftHier, QEMU, Siracusa, Siracusa_w_neureka, Siracusa_w_redmule, PULP-Open, GAP9, Generic, Snitch)") +set_property(CACHE platform PROPERTY STRINGS MemPool SoftHier QEMU Siracusa Siracusa_w_neureka Siracusa_w_redmule PULP-Open GAP9 Generic Snitch) if(platform STREQUAL MemPool) message(STATUS "Building for platform 'MemPool'") @@ -31,6 +31,8 @@ elseif(platform STREQUAL Siracusa) message(STATUS "Building for platform 'Siracusa'") elseif(platform STREQUAL Siracusa_w_neureka) message(STATUS "Building for platform 'Siracusa_w_neureka'") +elseif(platform STREQUAL Siracusa_w_redmule) + message(STATUS "Building for platform 'Siracusa_w_redmule'") elseif(platform STREQUAL PULPOpen) message(STATUS "Building for platform 'PULP-Open'") elseif(platform STREQUAL GAP9) @@ -196,7 +198,7 @@ if(platform STREQUAL QEMU-ARM) endif() -if(platform STREQUAL Siracusa OR platform STREQUAL Siracusa_w_neureka OR platform STREQUAL PULPOpen) +if(platform STREQUAL Siracusa OR platform STREQUAL Siracusa_w_neureka OR platform STREQUAL Siracusa_w_redmule OR platform STREQUAL PULPOpen) if(TOOLCHAIN STREQUAL LLVM) set(CMAKE_TOOLCHAIN_FILE ${CMAKE_CURRENT_LIST_DIR}/cmake/pulp/toolchain_llvm.cmake) @@ -206,7 +208,7 @@ if(platform STREQUAL Siracusa OR platform STREQUAL Siracusa_w_neureka OR platfor include(${CMAKE_CURRENT_LIST_DIR}/cmake/pulp/pulp.cmake) - if(platform STREQUAL Siracusa OR platform STREQUAL Siracusa_w_neureka) + if(platform STREQUAL Siracusa OR platform STREQUAL Siracusa_w_neureka OR platform STREQUAL Siracusa_w_redmule) include(${CMAKE_CURRENT_LIST_DIR}/cmake/pulp/siracusa/siracusa.cmake) elseif(platform STREQUAL PULPOpen) include(${CMAKE_CURRENT_LIST_DIR}/cmake/pulp/pulp-open/pulp-open.cmake) diff --git a/Deeploy/Targets/Chimera/__init__.py 
b/Deeploy/Targets/Chimera/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/Deeploy/Targets/PULPOpen/Templates/FloatGemmTemplate.py b/Deeploy/Targets/PULPOpen/Templates/FloatGemmTemplate.py index 59499706e5..ec7df7d326 100644 --- a/Deeploy/Targets/PULPOpen/Templates/FloatGemmTemplate.py +++ b/Deeploy/Targets/PULPOpen/Templates/FloatGemmTemplate.py @@ -4,9 +4,12 @@ from typing import Dict, List, Tuple -from Deeploy.AbstractDataTypes import float32_tPtr +from Deeploy.AbstractDataTypes import PointerClass +from Deeploy.CommonExtensions.DataTypes import float32_t from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation +float32_tPtr = PointerClass(float32_t) + class PULPFloatGEMMTemplate(NodeTemplate): diff --git a/Deeploy/Targets/Redmule/Bindings.py b/Deeploy/Targets/Redmule/Bindings.py new file mode 100644 index 0000000000..fa06d789d7 --- /dev/null +++ b/Deeploy/Targets/Redmule/Bindings.py @@ -0,0 +1,52 @@ +# ---------------------------------------------------------------------- +# +# File: Bindings.py +# +# Last edited: 10.07.2024 +# +# Copyright (C) 2024, ETH Zurich and University of Bologna. +# +# Author: +# Luka Macan, University of Bologna +# Moritz Scherer, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from Deeploy.AbstractDataTypes import PointerClass +from Deeploy.CommonExtensions.DataTypes import float32_t +from Deeploy.DeeployTypes import NodeBinding +from Deeploy.Targets.Generic.TypeCheckers import ConvChecker, GEMMChecker, MatMulChecker +from Deeploy.Targets.PULPOpen.Bindings import ForkTransformer +from Deeploy.Targets.Redmule.Templates import ConvTemplate, GEMMTemplate, MatmulTemplate + +RedmuleMatmulBindings = [ + NodeBinding(MatMulChecker([PointerClass(float32_t), PointerClass(float32_t)], [PointerClass(float32_t)]), + MatmulTemplate.referenceTemplate, ForkTransformer) +] + +RedmuleConv2DBindings = [ + NodeBinding( + ConvChecker([PointerClass(float32_t), PointerClass(float32_t), + PointerClass(float32_t)], [PointerClass(float32_t)]), ConvTemplate.reference2DIm2ColTemplate, + ForkTransformer) +] + +RedmuleGEMMBindings = [ + NodeBinding( + GEMMChecker([PointerClass(float32_t), PointerClass(float32_t), + PointerClass(float32_t)], [PointerClass(float32_t)]), GEMMTemplate.referenceTemplate, + ForkTransformer) +] diff --git a/Deeploy/Targets/Redmule/Deployer.py b/Deeploy/Targets/Redmule/Deployer.py new file mode 100644 index 0000000000..7344f63a89 --- /dev/null +++ b/Deeploy/Targets/Redmule/Deployer.py @@ -0,0 +1,55 @@ +# ---------------------------------------------------------------------- +# +# File: Deployer.py +# +# Last edited: 08.05.2025 +# +# Copyright (C) 2024, ETH Zurich and University of Bologna. +# +# Author: Run Wang, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Callable, Dict, Type + +import onnx_graphsurgeon as gs + +from Deeploy.AbstractDataTypes import Pointer +from Deeploy.DeeployTypes import DeploymentPlatform, TopologyOptimizer +from Deeploy.Targets.PULPOpen.Deployer import PULPDeployer +from Deeploy.Targets.Redmule.TopologyOptimizationPasses.Passes import RedMuleAdjustWeightMemoryLayoutPass, \ + RedMuleGEMMTransposePass + + +class RedmuleDeployer(PULPDeployer): + + def __init__(self, + graph: gs.Graph, + deploymentPlatform: DeploymentPlatform, + inputTypes: Dict[str, Type[Pointer]], + loweringOptimizer: TopologyOptimizer, + scheduler: Callable = lambda graph: list(graph.nodes), + name: str = 'DeeployNetwork', + default_channels_first = False, + deeployStateDir: str = "DeeployStateDir", + inputOffsets = {}): + super().__init__(graph, deploymentPlatform, inputTypes, loweringOptimizer, scheduler, name, + default_channels_first, deeployStateDir, inputOffsets) + + self.loweringOptimizer.passes += [ + RedMuleAdjustWeightMemoryLayoutPass("Redmule"), + RedMuleGEMMTransposePass("Redmule") + ] diff --git a/Deeploy/Targets/Redmule/Engine.py b/Deeploy/Targets/Redmule/Engine.py new file mode 100644 index 0000000000..bff02fc3d1 --- /dev/null +++ b/Deeploy/Targets/Redmule/Engine.py @@ -0,0 +1,60 @@ +# ---------------------------------------------------------------------- +# +# File: Engine.py +# +# Last edited: 26.07.2024 +# +# Copyright (C) 2024, ETH Zurich and University of Bologna. 
+# +# Author: Moritz Scherer, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import List + +from Deeploy.DeeployTypes import DeploymentEngine, NodeMapper +from Deeploy.Targets.Generic.Layers import ConvLayer, GEMMLayer, MatMulLayer +from Deeploy.Targets.Generic.Parsers import MatMulParser +from Deeploy.Targets.PULPOpen.Parsers import PULPFPConv2DParser +from Deeploy.Targets.Redmule.Parsers import GEMMRedmuleParser +from Deeploy.Targets.Redmule.Tiler import RedmuleConvTilingReadyBindings, RedmuleGEMMTilingReadyBindings, \ + RedmuleMatMulTilingReadyBindings + +MatMulRedmuleMapper = NodeMapper(MatMulParser(), RedmuleMatMulTilingReadyBindings) +Conv2DRedmuleMapper = NodeMapper(PULPFPConv2DParser(), RedmuleConvTilingReadyBindings) +GEMMMRedmuleMapper = NodeMapper(GEMMRedmuleParser(), RedmuleGEMMTilingReadyBindings) + +RedmuleMapping = { + 'MatMul': MatMulLayer([MatMulRedmuleMapper]), + 'Conv': ConvLayer([Conv2DRedmuleMapper]), + 'Gemm': GEMMLayer([GEMMMRedmuleMapper]), +} + +_includeList = [] + +_redmuleInitCode = r""" +// Redmule engine initialization +""" + + +class RedmuleEngine(DeploymentEngine): + + def __init__(self, + name: str, + Mapping = RedmuleMapping, + initCode: str = _redmuleInitCode, + includeList: List[str] = _includeList) -> None: + super().__init__(name, Mapping, initCode, includeList) diff --git 
a/Deeploy/Targets/Redmule/Parsers.py b/Deeploy/Targets/Redmule/Parsers.py new file mode 100644 index 0000000000..ca3bc897cf --- /dev/null +++ b/Deeploy/Targets/Redmule/Parsers.py @@ -0,0 +1,95 @@ +# ---------------------------------------------------------------------- +# +# File: Parsers.py +# +# Last edited: 15.12.2021 +# +# Copyright (C) 2021, ETH Zurich and University of Bologna. +# +# Authors: +# - Moritz Scherer, ETH Zurich +# - Victor Jung, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from typing import Tuple + +import numpy as np +import onnx_graphsurgeon as gs + +from Deeploy.DeeployTypes import NetworkContext +from Deeploy.Targets.Generic.Parsers import MatMulParser + + +class GEMMRedmuleParser(MatMulParser): + + def __init__(self, noBiasHoisting = True): + self.noBiasHoisting = noBiasHoisting + super().__init__() + + def parseNode(self, node: gs.Node) -> (bool): + + ret = all([len(node.inputs) >= 2, len(node.outputs) == 1, node.attrs['alpha'] == 1]) + + if ret: + if 'transA' in node.attrs: + self.operatorRepresentation['transA'] = node.attrs['transA'] + else: + self.operatorRepresentation['transA'] = 0 + + if 'transB' in node.attrs: + self.operatorRepresentation['transB'] = node.attrs['transB'] + else: + self.operatorRepresentation['transB'] = 0 + if 'alpha' in node.attrs: + self.operatorRepresentation['alpha'] = node.attrs['alpha'] + else: + self.operatorRepresentation['alpha'] = 1 + if 'beta' in node.attrs: + self.operatorRepresentation['beta'] = node.attrs['beta'] + else: + self.operatorRepresentation['beta'] = 1 + + return ret + + def parseNodeCtxt(self, + ctxt: NetworkContext, + node: gs.Node, + channels_first: bool = True) -> Tuple[NetworkContext, bool]: + + newCtxt, ret = super().parseNodeCtxt(ctxt, node, channels_first) + + if ret: + inputs = ['A', 'B'] + outputs = ['data_out'] + + for idx, inputNode in enumerate(node.inputs): + if idx < len(inputs): + self.operatorRepresentation[inputs[idx]] = newCtxt.lookup(inputNode.name).name + for idx, outputNode in enumerate(node.outputs): + self.operatorRepresentation[outputs[idx]] = newCtxt.lookup(outputNode.name).name + + if len(node.inputs) == 3: + self.operatorRepresentation['C'] = newCtxt.lookup(node.inputs[2].name).name + elif not self.noBiasHoisting: + values = np.zeros((1)) + zeroTensor = gs.Constant(f'{node.name}_C_Tensor', values = values) + newCtxt.hoistConstant(zeroTensor) + self.operatorRepresentation['C'] = f'{node.name}_C_Tensor' + + self.operatorRepresentation['size'] = 
np.prod(newCtxt.lookup(node.inputs[0].name).shape) + + return newCtxt, ret diff --git a/Deeploy/Targets/Redmule/Platform.py b/Deeploy/Targets/Redmule/Platform.py new file mode 100644 index 0000000000..54dfa50ba0 --- /dev/null +++ b/Deeploy/Targets/Redmule/Platform.py @@ -0,0 +1,42 @@ +# ---------------------------------------------------------------------- +# +# File: Platform.py +# +# Last edited: 08.05.2025 +# +# Copyright (C) 2024, ETH Zurich and University of Bologna. +# +# Author: Run Wang, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from Deeploy.DeeployTypes import TopologyOptimizer +from Deeploy.Targets.PULPOpen.Platform import PULPClusterEngine, PULPConstantBuffer, PULPOptimizer, PULPPlatform, \ + PULPStructBuffer, PULPTransientBuffer, PULPVariableBuffer +from Deeploy.Targets.Redmule.Engine import RedmuleEngine + +RedmuleOptimizer = TopologyOptimizer([*PULPOptimizer.passes]) + + +class RedmulePlatform(PULPPlatform): + + def __init__(self, + engines = [RedmuleEngine("Redmule"), PULPClusterEngine("PULPCluster")], + variableBuffer = PULPVariableBuffer, + constantBuffer = PULPConstantBuffer, + structBuffer = PULPStructBuffer, + transientBuffer = PULPTransientBuffer) -> None: + super().__init__(engines, variableBuffer, constantBuffer, structBuffer, transientBuffer) diff --git a/Deeploy/Targets/Redmule/Templates/ConvTemplate.py b/Deeploy/Targets/Redmule/Templates/ConvTemplate.py new file mode 100644 index 0000000000..ba344655b6 --- /dev/null +++ b/Deeploy/Targets/Redmule/Templates/ConvTemplate.py @@ -0,0 +1,87 @@ +# ---------------------------------------------------------------------- +# +# File: ConvTemplate.py +# +# Last edited: 09.05.2025 +# +# Copyright (C) 2023, ETH Zurich and University of Bologna. +# +# Author: Run Wang, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from typing import Dict, List, Tuple, Union + +from ortools.constraint_solver.pywrapcp import IntVar + +from Deeploy.DeeployTypes import NetworkContext, NodeTemplate, OperatorRepresentation + + +class RedmuleFloatConvIm2ColTemplate(NodeTemplate): + + def __init__(self, templateStr): + super().__init__(templateStr) + + @staticmethod + def computeTransientBuffersSize( + ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> List[Tuple[str, Union[int, IntVar]]]: + im2col_dim = 4 * 8 * (operatorRepresentation['ch_im_in'] * operatorRepresentation['dim_kernel_x'] * + operatorRepresentation['dim_kernel_y']) + im2col_name = operatorRepresentation['nodeName'] + "_buffer" + return [(im2col_name, im2col_dim)] + + def hoistTransientBuffers(self, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[NetworkContext, Dict, List[str]]: + im2col_name, im2col_dim = RedmuleFloatConvIm2ColTemplate.computeTransientBuffersSize( + ctxt, operatorRepresentation)[0] + ctxt.hoistTransientBuffer(im2col_name, im2col_dim) + + operatorRepresentation['ctxtBuffer'] = im2col_name + operatorRepresentation['ctxtBufferSize'] = im2col_dim + return ctxt, operatorRepresentation, [im2col_name] + + +reference2DIm2ColTemplate = RedmuleFloatConvIm2ColTemplate(""" +// 2D FP Conv HWC Parallel with Im2Col (Name: ${nodeName}, Op: ${nodeOp}) +${data_in_type.typeName} ref_${data_out}_${data_in} = ${data_in}; +${data_out_type.typeName} ref_${data_out}_${data_out} = ${data_out}; + +for (uint32_t n=0; n<${batch}; ++n) { + + Conv2d_Im2Col_fp${data_in_type.referencedType.typeWidth}_fp${weight_type.referencedType.typeWidth}_fp${data_out_type.referencedType.typeWidth}_HWC_8_Redmule( + ref_${data_out}_${data_in}, + ${dim_im_in_y}, + ${dim_im_in_x}, + ${ch_im_in}, + ${weight}, + ${dim_kernel_y}, + ${dim_kernel_x}, + ${stride_y}, + ${stride_x}, + ref_${data_out}_${data_out}, + ${ch_im_out}, + ${padding_y_top}, + ${padding_y_bottom}, + ${padding_x_left}, + ${padding_x_right}, 
+ ${ctxtBuffer} + ); + + ref_${data_out}_${data_in} += ${ch_im_in} * ${dim_im_in_x} * ${dim_im_in_y}; + ref_${data_out}_${data_out} += ${ch_im_out} * ${dim_im_out_x} * ${dim_im_out_y}; +} +""") diff --git a/Deeploy/Targets/Redmule/Templates/GEMMTemplate.py b/Deeploy/Targets/Redmule/Templates/GEMMTemplate.py new file mode 100644 index 0000000000..ba41ab7693 --- /dev/null +++ b/Deeploy/Targets/Redmule/Templates/GEMMTemplate.py @@ -0,0 +1,61 @@ +# ---------------------------------------------------------------------- +# +# File: GEMMTemplate.py +# +# Last edited: 27.01.2025 +# +# Copyright (C) 2023, ETH Zurich and University of Bologna. +# +# Author: Run Wang, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +from Deeploy.DeeployTypes import NodeTemplate + +referenceTemplate = NodeTemplate(""" +// GEMM using RedMule hardware accelerator (Name: ${nodeName}, Op: ${nodeOp}) + +int8_t ${nodeName}_core_id = pi_core_id(); + +if (${nodeName}_core_id == 0) { + for(uint32_t b=0; b<${batch}; b++) { + ${A_type.typeName} batch_A = ${A} + b * ${M} * ${N}; + ${B_type.typeName} batch_B = ${B} + b * ${N} * ${O}; + ${C_type.typeName} batch_C = ${C} + b * ${M} * ${O}; + ${data_out_type.typeName} batch_out = ${data_out} + b * ${M} * ${O}; + + % if beta == 0: + MatMul_fp${A_type.referencedType.typeWidth}_fp${B_type.referencedType.typeWidth}_fp${B_type.referencedType.typeWidth}_Redmule( + (const float32_t *) batch_A, + (const float32_t *) batch_B, + (float32_t *) batch_out, + ${M}, + ${N}, + ${O} + ); + % else: + Gemm_fp${A_type.referencedType.typeWidth}_fp${B_type.referencedType.typeWidth}_fp${B_type.referencedType.typeWidth}_fp${B_type.referencedType.typeWidth}_Redmule( + (const float32_t *) batch_A, + (const float32_t *) batch_B, + (const float32_t *) batch_C, + (float32_t *) batch_out, + ${M}, + ${N}, + ${O} + ); + % endif + } +} +""") diff --git a/Deeploy/Targets/Redmule/Templates/MatmulTemplate.py b/Deeploy/Targets/Redmule/Templates/MatmulTemplate.py new file mode 100644 index 0000000000..cb077ca897 --- /dev/null +++ b/Deeploy/Targets/Redmule/Templates/MatmulTemplate.py @@ -0,0 +1,49 @@ +# ---------------------------------------------------------------------- +# +# File: MatmulTemplate.py +# +# Last edited: 27.01.2025 +# +# Copyright (C) 2023, ETH Zurich and University of Bologna. +# +# Author: Run Wang, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from Deeploy.DeeployTypes import NodeTemplate + +referenceTemplate = NodeTemplate(""" +// Matmul using RedMule hardware accelerator (Name: ${nodeName}, Op: ${nodeOp}) + +int8_t ${nodeName}_core_id = pi_core_id(); +int8_t ${nodeName}_num_cores = NUM_CORES; + +if (${nodeName}_core_id == 0) { + for(uint32_t b=0; b<${batch}; b++) { + ${A_type.typeName} batch_A = ${A} + b * ${M} * ${N}; + ${B_type.typeName} batch_B = ${B} + b * ${N} * ${O}; + ${data_out_type.typeName} batch_out = ${data_out} + b * ${M} * ${O}; + + MatMul_fp32_fp32_fp32_Redmule( + (const float32_t *) batch_A, + (const float32_t *) batch_B, + (float32_t *) batch_out, + ${M}, + ${N}, + ${O} + ); + } +} +""") \ No newline at end of file diff --git a/Deeploy/Targets/Redmule/Templates/__init__.py b/Deeploy/Targets/Redmule/Templates/__init__.py new file mode 100644 index 0000000000..a73187ca8f --- /dev/null +++ b/Deeploy/Targets/Redmule/Templates/__init__.py @@ -0,0 +1,26 @@ +# ---------------------------------------------------------------------- +# +# File: __init__.py +# +# Last edited: 08.05.2025 +# +# Copyright (C) 2024, ETH Zurich and University of Bologna. +# +# Author: Run Wang, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . import * diff --git a/Deeploy/Targets/Redmule/TileConstraints/ConvTileConstraint.py b/Deeploy/Targets/Redmule/TileConstraints/ConvTileConstraint.py new file mode 100644 index 0000000000..1b3a93f600 --- /dev/null +++ b/Deeploy/Targets/Redmule/TileConstraints/ConvTileConstraint.py @@ -0,0 +1,279 @@ +# ---------------------------------------------------------------------- +# +# File: ConvTileConstraint.py +# +# Last edited: 09.05.2025 +# +# Copyright (C) 2023, ETH Zurich and University of Bologna. +# +# Author: +# - Run Wang, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from typing import Dict, List, Tuple, Union + +from ortools.constraint_solver.pywrapcp import IntVar + +from Deeploy.AbstractDataTypes import PointerClass +from Deeploy.CommonExtensions.DataTypes import uint8_t, uint16_t +from Deeploy.DeeployTypes import NetworkContext, OperatorRepresentation +from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint +from Deeploy.TilingExtension.TileConstraint import TileConstraint +from Deeploy.TilingExtension.TilerModel import PerformanceHint, TilerModel +from Deeploy.TilingExtension.TilingCodegen import AbsoluteHyperRectangle, HyperRectangle, TilingSchedule, \ + VariableReplacementScheme + + +class RedmuleConv2DTileConstraint(TileConstraint): + + @staticmethod + def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: + + # Get to-be-tiled tensor's buffers + inputBufferName = parseDict['data_in'] + weightBufferName = parseDict['weight'] + outputBufferName = parseDict['data_out'] + + strides = parseDict["strides"] + padding = parseDict["pads"] + dilation = parseDict["dilations"] + + # Add I/O dimensions to the model as variables + for bufferName in [inputBufferName, weightBufferName, outputBufferName]: + tilerModel.addTensorDimToModel(ctxt, bufferName) + + inputBatchVar = tilerModel.getTensorDimVar(tensorName = inputBufferName, dimIdx = 0) + inputHeightVar = tilerModel.getTensorDimVar(tensorName = inputBufferName, dimIdx = 1) + inputWidthVar = tilerModel.getTensorDimVar(tensorName = inputBufferName, dimIdx = 2) + inputChannelVar = tilerModel.getTensorDimVar(tensorName = inputBufferName, dimIdx = 3) + + # Updated dimension indexes for (H, W, Cin, Cout) format + weightHeightVar = tilerModel.getTensorDimVar(tensorName = weightBufferName, dimIdx = 0) + weightWidthVar = tilerModel.getTensorDimVar(tensorName = weightBufferName, dimIdx = 1) + weightInChannelVar = tilerModel.getTensorDimVar(tensorName = weightBufferName, dimIdx = 2) + weightOutChannelVar = 
tilerModel.getTensorDimVar(tensorName = weightBufferName, dimIdx = 3) + + outputBatchVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 0) + outputHeightVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 1) + outputWidthVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 2) + outputChannelVar = tilerModel.getTensorDimVar(tensorName = outputBufferName, dimIdx = 3) + + # Map output dims to inputs dims + tilerModel.addConstraint(outputBatchVar == inputBatchVar) # Batch + tilerModel.addConstraint(outputChannelVar == weightOutChannelVar) # Output Channel (now at index 3) + + inputBuffer = ctxt.lookup(inputBufferName) + + effectiveHeight = inputHeightVar + ((padding[0] + padding[2]) * (inputHeightVar == inputBuffer.shape[1])) + effectiveWidth = inputWidthVar + ((padding[1] + padding[3]) * (inputWidthVar == inputBuffer.shape[2])) + + tilerModel.addConstraint((outputHeightVar == (effectiveHeight - (weightHeightVar - 1) - 1) // strides[0] + 1)) + tilerModel.addConstraint((outputWidthVar == (effectiveWidth - (weightWidthVar - 1) - 1) // strides[1] + 1)) + + return tilerModel + + @staticmethod + def addPolicyConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: + + # Get to-be-tiled tensor's buffers + inputBuffer = ctxt.lookup(name = parseDict['data_in']) + weightBuffer = ctxt.lookup(name = parseDict['weight']) + + inputHeightVar = tilerModel.getTensorDimVar(tensorName = inputBuffer.name, dimIdx = 1) + inputWidthVar = tilerModel.getTensorDimVar(tensorName = inputBuffer.name, dimIdx = 2) + inputChannelVar = tilerModel.getTensorDimVar(tensorName = inputBuffer.name, dimIdx = 3) + + # Updated dimension indexes for (H, W, Cin, Cout) format + weightHeightVar = tilerModel.getTensorDimVar(tensorName = weightBuffer.name, dimIdx = 0) + weightWidthVar = tilerModel.getTensorDimVar(tensorName = weightBuffer.name, dimIdx = 1) + weightInChannelVar = tilerModel.getTensorDimVar(tensorName = 
weightBuffer.name, dimIdx = 2) + weightOutChannelVar = tilerModel.getTensorDimVar(tensorName = weightBuffer.name, dimIdx = 3) + + strides = parseDict["strides"] + padding = parseDict["pads"] + + tilerModel.addConstraint(inputChannelVar == parseDict['ch_im_in']) + # RW: Conv only tiled on outchannel + tilerModel.addConstraint(inputHeightVar == parseDict['dim_im_in_x']) + tilerModel.addConstraint(inputWidthVar == parseDict['dim_im_in_y']) + tilerModel.addConstraint(inputChannelVar == parseDict['ch_im_in']) + + tilerModel.addConstraint(weightHeightVar == parseDict['dim_kernel_x']) + tilerModel.addConstraint(weightWidthVar == parseDict['dim_kernel_y']) + tilerModel.addConstraint(weightInChannelVar == parseDict['ch_im_in']) + + outChannel = parseDict["ch_im_out"] + if outChannel >= 12: + tilerModel.addTileSizeDivisibleConstraint(parseDict, + "ch_im_out", + weightOutChannelVar, + 12, + strategy = PerformanceHint(priority = 1)) + else: + tilerModel.addConstraint(weightOutChannelVar == weightOutChannelVar.Max(), strategy = PerformanceHint(1)) + + return tilerModel + + @staticmethod + def constructSymbolicNodeRep(tilerModel: TilerModel, parseDict: Dict, + ctxt: NetworkContext) -> Dict[str, Union[int, IntVar]]: + + inputBuffer = ctxt.lookup(name = parseDict['data_in']) + weightBuffer = ctxt.lookup(name = parseDict['weight']) + + symbolicParseDict = parseDict.copy() + symbolicParseDict['dim_im_in_x'] = tilerModel.getTensorDimVar(inputBuffer.name, 1) + # Using updated dimension indexes for kernel dimensions + symbolicParseDict['dim_kernel_x'] = tilerModel.getTensorDimVar(weightBuffer.name, 0) + symbolicParseDict['dim_kernel_y'] = tilerModel.getTensorDimVar(weightBuffer.name, 1) + + return symbolicParseDict + + @staticmethod + def computeMargins(kernelShape: Tuple[int, ...]) -> Tuple[int, ...]: + if kernelShape[1] % 2 == 0: + leftMargin = 0 + rightMargin = 0 + else: + leftMargin = ((kernelShape[1]) // 2) + rightMargin = ((kernelShape[1]) // 2) + + if kernelShape[0] % 2 == 0: + 
topMargin = 0 + bottomMargin = 0 + else: + topMargin = ((kernelShape[0]) // 2) + bottomMargin = ((kernelShape[0]) // 2) + + return leftMargin, rightMargin, topMargin, bottomMargin + + @staticmethod + def computeInputCube(kernelShape: Tuple[int, ...], pads: Tuple[int, ...], strides: Tuple[int, ...], + weightChannels: int, outputCube: HyperRectangle, + outputDims: Tuple[int, ...]) -> Tuple[HyperRectangle, Tuple[int, ...]]: + + (BatchOffset, HOffset, WOffset, COffset) = outputCube.offset + (BatchSize, HSize, WSize, CSize) = outputCube.dims + + leftMargin, rightMargin, topMargin, bottomMargin = RedmuleConv2DTileConstraint.computeMargins(kernelShape) + + padding_top = (HOffset == 0) * pads[0] + padding_bottom = (HOffset + HSize == outputDims[1]) * pads[2] + + padding_left = (WOffset == 0) * pads[1] + padding_right = (WOffset + WSize == outputDims[2]) * pads[3] + + inputHOffset = HOffset * strides[0] - topMargin * (HOffset != 0) + inputWOffset = WOffset * strides[1] - leftMargin * (WOffset != 0) + + inputHSize = HSize * strides[0] + (topMargin + bottomMargin) - (padding_top + padding_bottom) + inputWSize = WSize * strides[1] + (leftMargin + rightMargin) - (padding_left + padding_right) + + InCube = HyperRectangle((BatchOffset, inputHOffset, inputWOffset, 0), + (BatchSize, inputHSize, inputWSize, weightChannels)) + + return InCube, (padding_left, padding_right, padding_top, padding_bottom) + + @classmethod + def serializeTilingSolution( + cls, tilingSolution: NodeMemoryConstraint, absoluteOutputCubes: List[AbsoluteHyperRectangle], + targetMemLevel: str, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, TilingSchedule]: + outputCubes = [cube.rectangle for cube in absoluteOutputCubes] + + addrNames = ['data_in', 'weight', 'data_out'] + inputBaseOffsets, outputBaseOffsets = cls.extractBaseAddr(tilingSolution, targetMemLevel, + operatorRepresentation, addrNames) + + varWeight = operatorRepresentation['weight'] + varOut 
= operatorRepresentation['data_out'] + + inputInCubes = [] + inputWeightCubes = [] + replacements: Dict[str, List[int]] = { + "dim_im_in_x": [], + "dim_im_in_y": [], + "dim_im_out_x": [], + "dim_im_out_y": [], + "ch_im_out": [], + "padding_y_top": [], + "padding_y_bottom": [], + "padding_x_left": [], + "padding_x_right": [] + } + + replacementTypes = { + "dim_im_in_x": PointerClass(uint16_t), + "dim_im_in_y": PointerClass(uint16_t), + "dim_im_out_x": PointerClass(uint16_t), + "dim_im_out_y": PointerClass(uint16_t), + "ch_im_out": PointerClass(uint16_t), + "padding_y_top": PointerClass(uint8_t), + "padding_y_bottom": PointerClass(uint8_t), + "padding_x_left": PointerClass(uint8_t), + "padding_x_right": PointerClass(uint8_t) + } + + # Updated dimension indexes for (H, W, Cin, Cout) format + weightH = ctxt.lookup(varWeight).shape[0] # Now index 0 + weightW = ctxt.lookup(varWeight).shape[1] # Now index 1 + weightC = ctxt.lookup(varWeight).shape[2] # Now index 2 (Cin) + + pads = operatorRepresentation['pads'] + strides = operatorRepresentation['strides'] + + for cube in outputCubes: + (BatchOffset, HOffset, WOffset, COffset) = cube.offset + (BatchSize, HSize, WSize, CSize) = cube.dims + + InCube, padding_tuple = RedmuleConv2DTileConstraint.computeInputCube((weightH, weightW), pads, strides, + weightC, cube, + ctxt.lookup(varOut).shape) + + padding_left, padding_right, padding_top, padding_bottom = padding_tuple + + replacements['dim_im_in_x'].append(InCube.dims[1]) + replacements['dim_im_in_y'].append(InCube.dims[2]) + replacements['dim_im_out_x'].append(HSize) + replacements['dim_im_out_y'].append(WSize) + replacements['ch_im_out'].append(CSize) + + replacements['padding_y_top'].append(padding_top) + replacements['padding_y_bottom'].append(padding_bottom) + replacements['padding_x_left'].append(padding_left) + replacements['padding_x_right'].append(padding_right) + + inputInCubes.append(InCube) + + # Updated WeightCube for (H, W, Cin, Cout) format + # COffset is now 
applied to dimension 3 (Cout)
+            WeightCube = HyperRectangle((0, 0, 0, COffset), (weightH, weightW, weightC, CSize))
+
+            inputWeightCubes.append(WeightCube)
+
+        inputLoadSchedule = []
+        outputLoadSchedule = []
+
+        for a, b in zip(inputInCubes, inputWeightCubes):
+            inputLoadSchedule.append({"data_in": a, "weight": b})
+
+        for out in outputCubes:
+            outputLoadSchedule.append({"data_out": out})
+
+        tilingSchedule = TilingSchedule(inputBaseOffsets, outputBaseOffsets, inputLoadSchedule, outputLoadSchedule)
+        variableReplacementSchedule = VariableReplacementScheme(replacements, replacementTypes)
+
+        return variableReplacementSchedule, tilingSchedule
diff --git a/Deeploy/Targets/Redmule/TileConstraints/GEMMTileConstraint.py b/Deeploy/Targets/Redmule/TileConstraints/GEMMTileConstraint.py
new file mode 100644
index 0000000000..fbae4824f8
--- /dev/null
+++ b/Deeploy/Targets/Redmule/TileConstraints/GEMMTileConstraint.py
@@ -0,0 +1,198 @@
+# ----------------------------------------------------------------------
+#
+# File: GEMMTileConstraint.py
+#
+# Last edited: 02.06.2023
+#
+# Copyright (C) 2023, ETH Zurich and University of Bologna.
+#
+# Author:
+# - Victor Jung, jungvi@iis.ee.ethz.ch, ETH Zurich
+# - Moritz Scherer, scheremo@iis.ee.ethz.ch, ETH Zurich
+#
+# ----------------------------------------------------------------------
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+ +from typing import Dict, List, Tuple + +from Deeploy.AbstractDataTypes import PointerClass +from Deeploy.CommonExtensions.DataTypes import uint8_t, uint16_t +from Deeploy.DeeployTypes import NetworkContext, OperatorRepresentation +from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint +from Deeploy.TilingExtension.TileConstraint import TileConstraint +from Deeploy.TilingExtension.TilerModel import TilerModel +from Deeploy.TilingExtension.TilingCodegen import AbsoluteHyperRectangle, HyperRectangle, TilingSchedule, \ + VariableReplacementScheme + + +class RedmuleGEMMTileConstraint(TileConstraint): + + @staticmethod + def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: + + # Get to-be-tiled tensor's buffers + bufferA = ctxt.lookup(name = parseDict['A']) + bufferB = ctxt.lookup(name = parseDict['B']) + bufferC = ctxt.lookup(name = parseDict['C']) + outputBuffer = ctxt.lookup(name = parseDict['data_out']) + + # Add I/O dimensions to the model as variables + for bufferName in [bufferA.name, bufferB.name, bufferC.name, outputBuffer.name]: + tilerModel.addTensorDimToModel(ctxt, bufferName) + + dimOffsetA = len(bufferA.shape) - 2 + dimOffsetB = len(bufferB.shape) - 2 + dimOffsetC = len(bufferC.shape) - 2 + dimOffsetOut = len(outputBuffer.shape) - 2 + + AFirstDimVar = tilerModel.getTensorDimVar(tensorName = bufferA.name, dimIdx = dimOffsetA + parseDict['transA']) + ASecondDimVar = tilerModel.getTensorDimVar(tensorName = bufferA.name, + dimIdx = dimOffsetA + 1 - parseDict['transA']) + BFirstDimVar = tilerModel.getTensorDimVar(tensorName = bufferB.name, dimIdx = dimOffsetB + parseDict['transB']) + BSecondDimVar = tilerModel.getTensorDimVar(tensorName = bufferB.name, + dimIdx = dimOffsetB + 1 - parseDict['transB']) + outputFirstDimVar = tilerModel.getTensorDimVar(tensorName = outputBuffer.name, dimIdx = dimOffsetOut) + outputSecondDimVar = tilerModel.getTensorDimVar(tensorName = outputBuffer.name, 
dimIdx = dimOffsetOut + 1) + + # Map output dims to inputs dims + tilerModel.addConstraint(outputFirstDimVar == AFirstDimVar) + tilerModel.addConstraint(outputSecondDimVar == BSecondDimVar) + + # Add GEMM Geometrical constraints + tilerModel.addConstraint(ASecondDimVar == BFirstDimVar) + + addDimVar_1 = tilerModel.getTensorDimVar(tensorName = bufferC.name, dimIdx = dimOffsetC) + addDimVar_2 = tilerModel.getTensorDimVar(tensorName = bufferC.name, dimIdx = dimOffsetC + 1) + tilerModel.addConstraint(outputFirstDimVar == addDimVar_1) + tilerModel.addConstraint(outputSecondDimVar == addDimVar_2) + + return tilerModel + + @staticmethod + def addPolicyConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: + from Deeploy.TilingExtension.TilerModel import PerformanceHint + + bufferA = ctxt.lookup(name = parseDict['A']) + bufferB = ctxt.lookup(name = parseDict['B']) + + tensorsShapeLen = min(len(bufferA.shape), len(bufferB.shape)) + + dimOffsetA = len(bufferA.shape) - 2 + dimOffsetB = len(bufferB.shape) - 2 + + AFirstDimVar = tilerModel.getTensorDimVar(tensorName = bufferA.name, dimIdx = dimOffsetA + parseDict['transA']) + ASecondDimVar = tilerModel.getTensorDimVar(tensorName = bufferA.name, + dimIdx = dimOffsetA + 1 - parseDict['transA']) + BFirstDimVar = tilerModel.getTensorDimVar(tensorName = bufferB.name, dimIdx = dimOffsetB + parseDict['transB']) + BSecondDimVar = tilerModel.getTensorDimVar(tensorName = bufferB.name, + dimIdx = dimOffsetB + 1 - parseDict['transB']) + + # VIC: We don't want to deal with intermediate results between kernel calls + tilerModel.addConstraint(ASecondDimVar == parseDict['N']) + tilerModel.addConstraint(BFirstDimVar == parseDict['N']) + + tilerModel.addConstraint(AFirstDimVar == AFirstDimVar.Max(), strategy = PerformanceHint(1)) + tilerModel.addConstraint(BSecondDimVar == BSecondDimVar.Max(), strategy = PerformanceHint(1)) + + return tilerModel + + @classmethod + def serializeTilingSolution( + cls, 
tilingSolution: NodeMemoryConstraint, absoluteOutputCubes: List[AbsoluteHyperRectangle], + targetMemLevel: str, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, TilingSchedule]: + outputCubes = [cube.rectangle for cube in absoluteOutputCubes] + + addrNames = ['A', 'B', 'C', 'data_out'] + inputBaseOffsets, outputBaseOffsets = cls.extractBaseAddr(tilingSolution, targetMemLevel, + operatorRepresentation, addrNames) + + transA = operatorRepresentation['transA'] + transB = operatorRepresentation['transB'] + + varA = operatorRepresentation['A'] + varB = operatorRepresentation['B'] + + if transA == 0: + NSize = ctxt.lookup(varA).shape[-1] + else: + NSize = ctxt.lookup(varA).shape[-2] + + NOffset = 0 + + inputACubes = [] + inputBCubes = [] + inputAddCubes = [] + + replacements = {"M": [], "O": [], "batch": []} + + # Every output is constructed by a pair of inputs. Reconstruct this pair. + for cube in outputCubes: + + BSize = 1 + BOffset = 0 + BatchSize = 1 + BatchOffset = 0 + + if len(cube.offset) == 2: + (MOffset, OOffset) = cube.offset + (MSize, OSize) = cube.dims + elif len(cube.offset) == 3: + (BatchOffset, MOffset, OOffset) = cube.offset + (BatchSize, MSize, OSize) = cube.dims + else: + (BatchOffset, BOffset, MOffset, OOffset) = cube.offset + (BatchSize, BSize, MSize, OSize) = cube.dims + + replacements["M"].append(MSize) + replacements["O"].append(OSize) + replacements["batch"].append(BSize) + + if transA == 0: + ACube = HyperRectangle((BatchOffset, BOffset, MOffset, NOffset), (BatchSize, BSize, MSize, NSize)) + else: + ACube = HyperRectangle((BatchOffset, BOffset, NOffset, MOffset), (BatchSize, BSize, NSize, MSize)) + + if transB == 0: + BCube = HyperRectangle((BatchOffset, BOffset, NOffset, OOffset), (BatchSize, BSize, NSize, OSize)) + else: + BCube = HyperRectangle((BatchOffset, BOffset, OOffset, NOffset), (BatchSize, BSize, OSize, NSize)) + + CCube = HyperRectangle(cube.offset, cube.dims) + + 
inputACubes.append(ACube)
+            inputBCubes.append(BCube)
+            inputAddCubes.append(CCube)
+
+        inputLoadSchedule = []
+        outputLoadSchedule = []
+
+        replacements["N"] = [NSize] * len(outputCubes)
+
+        replacementTypes = {
+            "M": PointerClass(uint16_t),
+            "N": PointerClass(uint16_t),
+            "O": PointerClass(uint16_t),
+            "batch": PointerClass(uint8_t)
+        }
+
+        for a, b, c in zip(inputACubes, inputBCubes, inputAddCubes):
+            inputLoadSchedule.append({"A": a, "B": b, "C": c})
+
+        for out in outputCubes:
+            outputLoadSchedule.append({"data_out": out})
+
+        schedule = TilingSchedule(inputBaseOffsets, outputBaseOffsets, inputLoadSchedule, outputLoadSchedule)
+
+        return VariableReplacementScheme(replacements, replacementTypes), schedule
diff --git a/Deeploy/Targets/Redmule/TileConstraints/MatmulTileConstraint.py b/Deeploy/Targets/Redmule/TileConstraints/MatmulTileConstraint.py
new file mode 100644
index 0000000000..1b14ccc464
--- /dev/null
+++ b/Deeploy/Targets/Redmule/TileConstraints/MatmulTileConstraint.py
@@ -0,0 +1,197 @@
+# ----------------------------------------------------------------------
+#
+# File: MatmulTileConstraint.py
+#
+# Last edited: 28.04.2025
+#
+# Copyright (C) 2025, ETH Zurich and University of Bologna.
+#
+# Author: Run Wang, ETH Zurich
+#
+# ----------------------------------------------------------------------
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+ +from typing import Dict, List, Tuple + +from Deeploy.AbstractDataTypes import PointerClass +from Deeploy.CommonExtensions.DataTypes import int8_t +from Deeploy.DeeployTypes import NetworkContext, OperatorRepresentation +from Deeploy.TilingExtension.MemoryConstraints import NodeMemoryConstraint +from Deeploy.TilingExtension.TileConstraint import TileConstraint +from Deeploy.TilingExtension.TilerModel import PerformanceHint, TilerModel +from Deeploy.TilingExtension.TilingCodegen import AbsoluteHyperRectangle, HyperRectangle, TilingSchedule, \ + VariableReplacementScheme + + +class RedmuleMatmulTileConstraint(TileConstraint): + + @staticmethod + def addGeometricalConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: + + # Get to-be-tiled tensor's buffers + bufferA = ctxt.lookup(name = parseDict['A']) + bufferB = ctxt.lookup(name = parseDict['B']) + outputBuffer = ctxt.lookup(name = parseDict['data_out']) + + # Add I/O dimensions to the model as variables + for _buffer in [bufferA, bufferB, outputBuffer]: + tilerModel.addTensorDimToModel(ctxt, _buffer.name) + + tensorsShapeLen = len(bufferA.shape) + + AFirstDimVar = tilerModel.getTensorDimVar(tensorName = bufferA.name, + dimIdx = (tensorsShapeLen - 2) + parseDict['transA']) + ASecondDimVar = tilerModel.getTensorDimVar(tensorName = bufferA.name, + dimIdx = (tensorsShapeLen - 1) - parseDict['transA']) + BFirstDimVar = tilerModel.getTensorDimVar(tensorName = bufferB.name, + dimIdx = (tensorsShapeLen - 2) + parseDict['transB']) + BSecondDimVar = tilerModel.getTensorDimVar(tensorName = bufferB.name, + dimIdx = (tensorsShapeLen - 1) - parseDict['transB']) + outputFirstDimVar = tilerModel.getTensorDimVar(tensorName = outputBuffer.name, dimIdx = (tensorsShapeLen - 2)) + outputSecondDimVar = tilerModel.getTensorDimVar(tensorName = outputBuffer.name, dimIdx = (tensorsShapeLen - 1)) + + # Map output dims to inputs dims + for idx in range(tensorsShapeLen - 2): + tilerModel.addConstraint( + 
tilerModel.getTensorDimVar(tensorName = outputBuffer.name, dimIdx = idx) == tilerModel.getTensorDimVar( + tensorName = bufferA.name, dimIdx = idx)) + tilerModel.addConstraint( + tilerModel.getTensorDimVar(tensorName = outputBuffer.name, dimIdx = idx) == tilerModel.getTensorDimVar( + tensorName = bufferB.name, dimIdx = idx)) + + tilerModel.addConstraint(outputFirstDimVar == AFirstDimVar) + tilerModel.addConstraint(outputSecondDimVar == BSecondDimVar) + + # Add GEMM Geometrical constraints + tilerModel.addConstraint(ASecondDimVar == BFirstDimVar) + + return tilerModel + + @staticmethod + def addPolicyConstraint(tilerModel: TilerModel, parseDict: Dict, ctxt: NetworkContext) -> TilerModel: + + bufferA = ctxt.lookup(name = parseDict['A']) + bufferB = ctxt.lookup(name = parseDict['B']) + + tensorsShapeLen = len(bufferA.shape) + + AFirstDimVar = tilerModel.getTensorDimVar(tensorName = bufferA.name, + dimIdx = (tensorsShapeLen - 2) + parseDict['transA']) + ASecondDimVar = tilerModel.getTensorDimVar(tensorName = bufferA.name, + dimIdx = (tensorsShapeLen - 1) - parseDict['transA']) + BFirstDimVar = tilerModel.getTensorDimVar(tensorName = bufferB.name, + dimIdx = (tensorsShapeLen - 2) + parseDict['transB']) + BSecondDimVar = tilerModel.getTensorDimVar(tensorName = bufferB.name, + dimIdx = (tensorsShapeLen - 1) - parseDict['transB']) + + # VIC: We don't want to deal with intermediate results between kernel calls + tilerModel.addConstraint(ASecondDimVar == parseDict['N']) + tilerModel.addConstraint(BFirstDimVar == parseDict['N']) + + # Hardware-specific constraints for 4x12 accelerator + tilerModel.addConstraint(BSecondDimVar == BSecondDimVar.Max(), strategy = PerformanceHint(1)) + + M_full_size = ctxt.lookup(bufferA.name).shape[(tensorsShapeLen - 2) + parseDict['transA']] + if M_full_size >= 16: + tilerModel.addTileSizeDivisibleConstraint(parseDict, + "M", + AFirstDimVar, + 16, + strategy = PerformanceHint(priority = 1)) + else: + tilerModel.addConstraint(AFirstDimVar == 
AFirstDimVar.Max(), strategy = PerformanceHint(1)) + + N_full_size = ctxt.lookup(bufferB.name).shape[(tensorsShapeLen - 2) + parseDict['transB']] + if N_full_size >= 12: + tilerModel.addTileSizeDivisibleConstraint(parseDict, + "O", + BSecondDimVar, + 12, + strategy = PerformanceHint(priority = 1)) + else: + tilerModel.addConstraint(BSecondDimVar == BSecondDimVar.Max(), strategy = PerformanceHint(1)) + + return tilerModel + + @classmethod + def serializeTilingSolution( + cls, tilingSolution: NodeMemoryConstraint, absoluteOutputCubes: List[AbsoluteHyperRectangle], + targetMemLevel: str, ctxt: NetworkContext, + operatorRepresentation: OperatorRepresentation) -> Tuple[VariableReplacementScheme, TilingSchedule]: + outputCubes = [cube.rectangle for cube in absoluteOutputCubes] + + addrNames = ['A', 'B', 'data_out'] + inputBaseOffsets, outputBaseOffsets = cls.extractBaseAddr(tilingSolution, targetMemLevel, + operatorRepresentation, addrNames) + + varA = operatorRepresentation['A'] + + NSize = ctxt.lookup(varA).shape[-1] + NOffset = 0 + + inputACubes = [] + inputBCubes = [] + + replacements = {"M": [], "O": [], "batch": []} + + # Every output is constructed by a pair of inputs. Reconstruct this pair. 
+ for cube in outputCubes: + + BSize = 1 + BOffset = 0 + BatchSize = 1 + BatchOffset = 0 + + if len(cube.offset) == 2: + (MOffset, OOffset) = cube.offset + (MSize, OSize) = cube.dims + elif len(cube.offset) == 3: + (BatchOffset, MOffset, OOffset) = cube.offset + (BatchSize, MSize, OSize) = cube.dims + else: + (BatchOffset, BOffset, MOffset, OOffset) = cube.offset + (BatchSize, BSize, MSize, OSize) = cube.dims + + replacements["M"].append(MSize) + replacements["O"].append(OSize) + replacements["batch"].append(BSize) + + ACube = HyperRectangle((BatchOffset, BOffset, MOffset, NOffset), (BatchSize, BSize, MSize, NSize)) + BCube = HyperRectangle((BatchOffset, BOffset, NOffset, OOffset), (BatchSize, BSize, NSize, OSize)) + + inputACubes.append(ACube) + inputBCubes.append(BCube) + + inputLoadSchedule = [] + outputLoadSchedule = [] + + replacements["N"] = [NSize] * len(outputCubes) + + replacementTypes = { + "M": PointerClass(int8_t), + "N": PointerClass(int8_t), + "O": PointerClass(int8_t), + "batch": PointerClass(int8_t) + } + + for a, b in zip(inputACubes, inputBCubes): + inputLoadSchedule.append({"A": a, "B": b}) + + for out in outputCubes: + outputLoadSchedule.append({"data_out": out}) + + schedule = TilingSchedule(inputBaseOffsets, outputBaseOffsets, inputLoadSchedule, outputLoadSchedule) + + return VariableReplacementScheme(replacements, replacementTypes), schedule diff --git a/Deeploy/Targets/Redmule/TileConstraints/__init__.py b/Deeploy/Targets/Redmule/TileConstraints/__init__.py new file mode 100644 index 0000000000..a73187ca8f --- /dev/null +++ b/Deeploy/Targets/Redmule/TileConstraints/__init__.py @@ -0,0 +1,26 @@ +# ---------------------------------------------------------------------- +# +# File: __init__.py +# +# Last edited: 08.05.2025 +# +# Copyright (C) 2024, ETH Zurich and University of Bologna. 
+# +# Author: Run Wang, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . import * diff --git a/Deeploy/Targets/Redmule/Tiler.py b/Deeploy/Targets/Redmule/Tiler.py new file mode 100644 index 0000000000..1db49cb4a3 --- /dev/null +++ b/Deeploy/Targets/Redmule/Tiler.py @@ -0,0 +1,37 @@ +# ---------------------------------------------------------------------- +# +# File: Tiler.py +# +# Last edited: 26.07.2024 +# +# Copyright (C) 2024, ETH Zurich and University of Bologna. +# +# Author: Moritz Scherer, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+from Deeploy.Targets.Redmule.Bindings import RedmuleConv2DBindings, RedmuleGEMMBindings, RedmuleMatmulBindings
+from Deeploy.Targets.Redmule.TileConstraints.ConvTileConstraint import RedmuleConv2DTileConstraint
+from Deeploy.Targets.Redmule.TileConstraints.GEMMTileConstraint import RedmuleGEMMTileConstraint
+from Deeploy.Targets.Redmule.TileConstraints.MatmulTileConstraint import RedmuleMatmulTileConstraint
+from Deeploy.TilingExtension.TilerExtension import TilingReadyNodeBindings
+
+RedmuleMatMulTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = RedmuleMatmulBindings,
+                                                           tileConstraint = RedmuleMatmulTileConstraint())
+RedmuleConvTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = RedmuleConv2DBindings,
+                                                         tileConstraint = RedmuleConv2DTileConstraint())
+RedmuleGEMMTilingReadyBindings = TilingReadyNodeBindings(nodeBindings = RedmuleGEMMBindings,
+                                                         tileConstraint = RedmuleGEMMTileConstraint())
diff --git a/Deeploy/Targets/Redmule/TopologyOptimizationPasses/Passes.py b/Deeploy/Targets/Redmule/TopologyOptimizationPasses/Passes.py
new file mode 100644
index 0000000000..f9d3d95b57
--- /dev/null
+++ b/Deeploy/Targets/Redmule/TopologyOptimizationPasses/Passes.py
@@ -0,0 +1,150 @@
+# ----------------------------------------------------------------------
+#
+# File: Passes.py
+#
+# Last edited: 09.05.2025
+#
+# Copyright (C) 2025, ETH Zurich and University of Bologna.
+#
+# Author: Run Wang, ETH Zurich
+#
+# ----------------------------------------------------------------------
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import onnx_graphsurgeon as gs + +from Deeploy.CommonExtensions.OptimizationPasses.Matchers import Match +from Deeploy.CommonExtensions.OptimizationPasses.PassClasses import ReplaceSequentialPatternPass, contextagnostic +from Deeploy.CommonExtensions.OptimizationPasses.TopologyOptimizationPasses.LoweringOptimizationPasses import \ + _appendTranspose + + +def _redmule_weight_layout_fun(graph: gs.Graph, match: Match, name: str): + """Convert Conv weights from [cout, h, w, cin] to [h,w,cin, cout] for RedMule accelerator""" + node = list(match.nodes_map.values())[0] + + weightTensor = node.inputs[1] + if isinstance(weightTensor, gs.Constant): + weightTensor.values = np.transpose(weightTensor.values, (1, 2, 3, 0)) + + return graph + + +@contextagnostic +class RedMuleAdjustWeightMemoryLayoutPass(ReplaceSequentialPatternPass): + """Pass to convert Conv weights from [cout, h, w, cin] to [hwcin, cout] for RedMule accelerator""" + + def __init__(self, redmuleEngineName: str): + graph = gs.Graph() + _input = gs.Variable(name = 'input_1') + output = graph.layer(inputs = [_input], outputs = ['convOut'], op = 'Conv', name = 'conv') + graph.outputs.append(output) + graph.inputs.append(_input) + + super().__init__(graph, _redmule_weight_layout_fun, "_REDMULE_ADJUST_WEIGHT_MEMORY_LAYOUT_PASS") + + +def _redmule_gemm_transpose_fun(graph: gs.Graph, match: Match, name: str): + """ + Handle GEMM transA and transB attributes for RedMule accelerator + + Properly handles tensors of any dimensionality, ensuring only the last two + dimensions are transposed when needed. 
+ """ + matched_nodes = [m for k, m in match.nodes_map.items()] + gemm_node = matched_nodes[0] + + if 'transA' not in gemm_node.attrs: + gemm_node.attrs['transA'] = 0 + if 'transB' not in gemm_node.attrs: + gemm_node.attrs['transB'] = 0 + if 'alpha' not in gemm_node.attrs: + gemm_node.attrs['alpha'] = 1.0 + if 'beta' not in gemm_node.attrs: + gemm_node.attrs['beta'] = 1.0 + + inputA = gemm_node.inputs[0] + inputB = gemm_node.inputs[1] + + if gemm_node.attrs['transA'] != 0: + if isinstance(inputA, gs.Constant): + print(f"Physical transpose for constant A: {inputA.name}") + + if len(inputA.values.shape) > 2: + perm = list(range(len(inputA.values.shape))) + perm[-1], perm[-2] = perm[-2], perm[-1] + inputA.values = np.transpose(inputA.values, perm) + else: + inputA.values = np.transpose(inputA.values) + + gemm_node.attrs['transA'] = 0 + else: + + perm = list(range(len(inputA.shape))) + perm[-1], perm[-2] = perm[-2], perm[-1] + + anchorTransposeNode = _appendTranspose(inputA, gemm_node, perm) + gemm_node.attrs['transA'] = 0 + graph.nodes.append(anchorTransposeNode) + + if gemm_node.attrs['transB'] != 0: + if isinstance(inputB, gs.Constant): + + if len(inputB.values.shape) > 2: + + perm = list(range(len(inputB.values.shape))) + perm[-1], perm[-2] = perm[-2], perm[-1] + + inputB.values = np.transpose(inputB.values, perm) + else: + inputB.values = np.transpose(inputB.values) + + gemm_node.attrs['transB'] = 0 + else: + print(f"Adding transpose node for variable B: {inputB.name}") + + perm = list(range(len(inputB.shape))) + perm[-1], perm[-2] = perm[-2], perm[-1] + + anchorTransposeNode = _appendTranspose(inputB, gemm_node, perm) + gemm_node.attrs['transB'] = 0 + graph.nodes.append(anchorTransposeNode) + + return graph + + +@contextagnostic +class RedMuleGEMMTransposePass(ReplaceSequentialPatternPass): + """Pass to handle GEMM transA and transB attributes for RedMule accelerator""" + + def __init__(self, redmuleEngineName: str): + + pattern = gs.Graph() + + input_a = 
gs.Variable(name = "input_a") + input_b = gs.Variable(name = "input_b") + + gemm_output = pattern.layer(op = "Gemm", + name = "gemm_node", + inputs = [input_a, input_b], + outputs = ["gemm_output"]) + + pattern.inputs = [input_a, input_b] + pattern.outputs = [gemm_output] + + super().__init__(pattern = pattern, + replacement_fn = _redmule_gemm_transpose_fun, + name = "_REDMULE_GEMM_TRANSPOSE_PASS") diff --git a/Deeploy/Targets/Redmule/TopologyOptimizationPasses/__init__.py b/Deeploy/Targets/Redmule/TopologyOptimizationPasses/__init__.py new file mode 100644 index 0000000000..63063b6066 --- /dev/null +++ b/Deeploy/Targets/Redmule/TopologyOptimizationPasses/__init__.py @@ -0,0 +1,26 @@ +# ---------------------------------------------------------------------- +# +# File: __init__.py +# +# Last edited: 09.05.2025 +# +# Copyright (C) 2024, ETH Zurich and University of Bologna. +# +# Author: Run Wang, ETH Zurich +# +# ---------------------------------------------------------------------- +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the License); you may +# not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an AS IS BASIS, WITHOUT +# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . 
import * diff --git a/Deeploy/Targets/Redmule/__init__.py b/Deeploy/Targets/Redmule/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/Deeploy/Targets/SoftHier/__init__.py b/Deeploy/Targets/SoftHier/__init__.py new file mode 100644 index 0000000000..e69de29bb2 diff --git a/DeeployTest/CMakeLists.txt b/DeeployTest/CMakeLists.txt index b7f3535790..d94bf1af07 100644 --- a/DeeployTest/CMakeLists.txt +++ b/DeeployTest/CMakeLists.txt @@ -41,7 +41,7 @@ elseif(DEEPLOY_ARCH STREQUAL PULP) -Wno-pointer-sign ) - if(platform STREQUAL Siracusa OR platform STREQUAL Siracusa_w_neureka) + if(platform STREQUAL Siracusa OR platform STREQUAL Siracusa_w_neureka OR platform STREQUAL Siracusa_w_redmule) add_subdirectory(Platforms/Siracusa) elseif(platform STREQUAL PULPOpen) add_subdirectory(Platforms/PULPOpen) diff --git a/DeeployTest/conftest.py b/DeeployTest/conftest.py index c7077067d9..a2be5e0cf3 100644 --- a/DeeployTest/conftest.py +++ b/DeeployTest/conftest.py @@ -66,6 +66,8 @@ def pytest_configure(config: pytest.Config) -> None: config.addinivalue_line("markers", "siracusa_tiled: mark test as a Siracusa platform test (tiled)") config.addinivalue_line("markers", "siracusa_neureka_tiled: mark test as a Siracusa + Neureka platform test (tiled)") + config.addinivalue_line("markers", + "siracusa_redmule_tiled: mark test as a Siracusa + RedMulE platform test (tiled)") config.addinivalue_line("markers", "gap9: mark test as a GAP9 platform test") config.addinivalue_line("markers", "gap9_tiled: mark test as a GAP9 platform test (tiled)") config.addinivalue_line("markers", "kernels: mark test as a kernel test (individual operators)") diff --git a/DeeployTest/testRunner_tiled_siracusa_w_redmule.py b/DeeployTest/testRunner_tiled_siracusa_w_redmule.py new file mode 100644 index 0000000000..9ebd9c6358 --- /dev/null +++ b/DeeployTest/testRunner_tiled_siracusa_w_redmule.py @@ -0,0 +1,49 @@ +# ---------------------------------------------------------------------- +# +# 
File: testRunner_tiled_siracusa_w_redmule.py
+#
+# Last edited: 08.05.2025
+#
+# Copyright (C) 2023, ETH Zurich and University of Bologna.
+#
+# Author: Run Wang, ETH Zurich
+#
+# ----------------------------------------------------------------------
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from testUtils.testRunner import TestRunner, TestRunnerArgumentParser
+
+if __name__ == "__main__":
+
+    parser = TestRunnerArgumentParser(
+        tiling_arguments = True,
+        description = "Deeploy Code Generation Utility for the Siracusa Platform (Tiling & Redmule).")
+
+    parser.add_argument('--cores',
+                        metavar = '',
+                        dest = 'cores',
+                        type = int,
+                        default = 1,
+                        help = 'Set number of cluster cores')
+    args = parser.parse_args()
+
+    testRunner = TestRunner(platform = "Siracusa_w_redmule",
+                            simulator = "gvsoc",
+                            tiling = True,
+                            argument_parser = parser)
+
+    testRunner.cmake_args += f" -D NUM_CORES={args.cores}"
+
+    testRunner.run()
diff --git a/DeeployTest/testUtils/platformMapping.py b/DeeployTest/testUtils/platformMapping.py
index 9d526906f9..32c06c95cd 100644
--- a/DeeployTest/testUtils/platformMapping.py
+++ b/DeeployTest/testUtils/platformMapping.py
@@ -25,13 +25,17 @@
 NeurekaPlatform
 from Deeploy.Targets.PULPOpen.Deployer import PULPDeployer
 from Deeploy.Targets.PULPOpen.Platform import MemoryPULPPlatform, MemoryPULPPlatformWrapper, PULPOptimizer, PULPPlatform
+from Deeploy.Targets.Redmule.Deployer import RedmuleDeployer
+from
Deeploy.Targets.Redmule.Platform import RedmuleOptimizer, RedmulePlatform from Deeploy.Targets.Snitch.Deployer import SnitchDeployer from Deeploy.Targets.Snitch.Platform import SnitchOptimizer, SnitchPlatform from Deeploy.Targets.SoftHier.Deployer import SoftHierDeployer from Deeploy.Targets.SoftHier.Platform import SoftHierOptimizer, SoftHierPlatform _SIGNPROP_PLATFORMS = ["Apollo3", "Apollo4", "QEMU-ARM", "Generic", "MemPool", "SoftHier"] -_NONSIGNPROP_PLATFORMS = ["Siracusa", "Siracusa_w_neureka", "PULPOpen", "Snitch", "Chimera", "GAP9"] +_NONSIGNPROP_PLATFORMS = [ + "Siracusa", "Siracusa_w_neureka", "Siracusa_w_redmule", "PULPOpen", "Snitch", "Chimera", "GAP9" +] _PLATFORMS = _SIGNPROP_PLATFORMS + _NONSIGNPROP_PLATFORMS @@ -67,6 +71,9 @@ def mapPlatform(platformName: str) -> Tuple[DeploymentPlatform, bool]: elif platformName == "Siracusa_w_neureka": Platform = NeurekaPlatform() + elif platformName == "Siracusa_w_redmule": + Platform = RedmulePlatform() + elif platformName == "Snitch": Platform = SnitchPlatform() @@ -84,7 +91,7 @@ def mapPlatform(platformName: str) -> Tuple[DeploymentPlatform, bool]: def setupMemoryPlatform(platform: DeploymentPlatform, memoryHierarchy: MemoryHierarchy, defaultTargetMemoryLevel: MemoryLevel) -> Union[MemoryPlatform, MemoryPlatformWrapper]: - if isinstance(platform, PULPPlatform): + if isinstance(platform, (PULPPlatform, RedmulePlatform)): return MemoryPULPPlatformWrapper(platform, memoryHierarchy, defaultTargetMemoryLevel) elif isinstance(platform, NeurekaPlatform): weightMemoryLevel = memoryHierarchy.memoryLevels["WeightMemory_SRAM"] \ @@ -207,6 +214,24 @@ def mapDeployer(platform: DeploymentPlatform, default_channels_first = default_channels_first, deeployStateDir = deeployStateDir) + elif isinstance(platform, RedmulePlatform): + + if loweringOptimizer is None: + loweringOptimizer = RedmuleOptimizer + + if default_channels_first is None: + default_channels_first = False + + deployer = RedmuleDeployer(graph, + platform, + 
inputTypes, + loweringOptimizer, + scheduler, + name = name, + default_channels_first = default_channels_first, + deeployStateDir = deeployStateDir, + inputOffsets = inputOffsets) + elif isinstance(platform, (GAP9Platform, MemoryGAP9Platform, MemoryGAP9PlatformWrapper)): if loweringOptimizer is None: diff --git a/DeeployTest/test_platforms.py b/DeeployTest/test_platforms.py index 6d9f3cfcd7..4add6e1fc9 100644 --- a/DeeployTest/test_platforms.py +++ b/DeeployTest/test_platforms.py @@ -33,6 +33,9 @@ from test_siracusa_neureka_tiled_config import L3_DOUBLEBUFFER_MODELS as NEUREKA_L3_DOUBLEBUFFER_MODELS from test_siracusa_neureka_tiled_config import L3_DOUBLEBUFFER_MODELS_WMEM as NEUREKA_L3_DOUBLEBUFFER_MODELS_WMEM from test_siracusa_neureka_tiled_config import L3_SINGLEBUFFER_MODELS as NEUREKA_L3_SINGLEBUFFER_MODELS +from test_siracusa_redmule_tiled_config import DEFAULT_CORES as REDMULE_DEFAULT_CORES +from test_siracusa_redmule_tiled_config import L2_DOUBLEBUFFER_KERNELS as REDMULE_L2_DOUBLEBUFFER_KERNELS +from test_siracusa_redmule_tiled_config import L2_SINGLEBUFFER_KERNELS as REDMULE_L2_SINGLEBUFFER_KERNELS from test_siracusa_tiled_config import L2_DOUBLEBUFFER_KERNELS, L2_DOUBLEBUFFER_MODELS, L2_SINGLEBUFFER_KERNELS, \ L2_SINGLEBUFFER_MODELS, L3_DOUBLEBUFFER_MODELS, L3_SINGLEBUFFER_MODELS from test_snitch_config import DEFAULT_NUM_CORES as SNITCH_DEFAULT_NUM_CORES @@ -987,3 +990,61 @@ def test_gap9_tiled_models_l3_doublebuffer(test_params, deeploy_test_dir, toolch double_buffer = True, ) run_and_assert_test(test_name, config, skipgen, skipsim) + + +@pytest.mark.siracusa_redmule_tiled +@pytest.mark.kernels +@pytest.mark.singlebuffer +@pytest.mark.l2 +@pytest.mark.parametrize( + "test_params", + generate_test_params(REDMULE_L2_SINGLEBUFFER_KERNELS, "L2-singlebuffer"), + ids = param_id, +) +def test_siracusa_redmule_tiled_kernels_l2_singlebuffer(test_params, deeploy_test_dir, toolchain, toolchain_dir, + cmake_args, skipgen, skipsim) -> None: + test_name, l1, 
config_name = test_params + config = create_test_config( + test_name = test_name, + platform = "Siracusa_w_redmule", + simulator = "gvsoc", + deeploy_test_dir = deeploy_test_dir, + toolchain = toolchain, + toolchain_dir = toolchain_dir, + cmake_args = cmake_args, + tiling = True, + cores = REDMULE_DEFAULT_CORES, + l1 = l1, + default_mem_level = "L2", + double_buffer = False, + ) + run_and_assert_test(test_name, config, skipgen, skipsim) + + +@pytest.mark.siracusa_redmule_tiled +@pytest.mark.kernels +@pytest.mark.doublebuffer +@pytest.mark.l2 +@pytest.mark.parametrize( + "test_params", + generate_test_params(REDMULE_L2_DOUBLEBUFFER_KERNELS, "L2-doublebuffer"), + ids = param_id, +) +def test_siracusa_redmule_tiled_kernels_l2_doublebuffer(test_params, deeploy_test_dir, toolchain, toolchain_dir, + cmake_args, skipgen, skipsim) -> None: + test_name, l1, config_name = test_params + config = create_test_config( + test_name = test_name, + platform = "Siracusa_w_redmule", + simulator = "gvsoc", + deeploy_test_dir = deeploy_test_dir, + toolchain = toolchain, + toolchain_dir = toolchain_dir, + cmake_args = cmake_args, + tiling = True, + cores = REDMULE_DEFAULT_CORES, + l1 = l1, + default_mem_level = "L2", + double_buffer = True, + ) + run_and_assert_test(test_name, config, skipgen, skipsim) diff --git a/DeeployTest/test_siracusa_redmule_tiled_config.py b/DeeployTest/test_siracusa_redmule_tiled_config.py new file mode 100644 index 0000000000..08883e0cb7 --- /dev/null +++ b/DeeployTest/test_siracusa_redmule_tiled_config.py @@ -0,0 +1,21 @@ +# SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna +# +# SPDX-License-Identifier: Apache-2.0 +"""Test configuration for Siracusa platform with RedMulE accelerator (tiled).""" + +# Siracusa + RedMulE platform with tiling support +# Default configuration: 8 cores, gvsoc simulator + +DEFAULT_CORES = 8 + +# L2 single-buffer kernel tests +# Format: dict of {test_name: [L1_sizes]} +L2_SINGLEBUFFER_KERNELS = { + 
"Kernels/FP32/GEMM/Regular": [8000], + "Kernels/FP32/GEMM/TransB": [8000], +} + +# L2 double-buffer kernel tests +L2_DOUBLEBUFFER_KERNELS = { + "Kernels/FP32/GEMM/Regular": [8000], +} diff --git a/Makefile b/Makefile index f007f105c1..423c3b8d70 100644 --- a/Makefile +++ b/Makefile @@ -49,7 +49,7 @@ PULP_SDK_COMMIT_HASH ?= 7f4f22516157a1b7c55bcbbc72ca81326180b3b4 MEMPOOL_COMMIT_HASH ?= affd45d94e05e375a6966af6a762deeb182a7bd6 SNITCH_COMMIT_HASH ?= e02cc9e3f24b92d4607455d5345caba3eb6273b2 SOFTHIER_COMMIT_HASH ?= 0 # bowwang: to be updated -GVSOC_COMMIT_HASH ?= edfcd8398840ceb1e151711befa06678b05f06a0 +GVSOC_COMMIT_HASH ?= 35d00d15d7249daaac0de61bd8485fba128e5959 MINIMALLOC_COMMMIT_HASH ?= e9eaf54094025e1c246f9ec231b905f8ef42a29d CHIMERA_SDK_COMMIT_HASH ?= b2392f6efcff75c03f4c65eaf3e12104442b22ea XTL_VERSION ?= 0.7.5 @@ -465,7 +465,7 @@ snitch_runtime: ${SNITCH_INSTALL_DIR} ${TOOLCHAIN_DIR}/gvsoc: cd ${TOOLCHAIN_DIR} && \ - git clone https://github.com/gvsoc/gvsoc.git && \ + git clone https://github.com/runwangdl/gvsoc.git && \ cd ${TOOLCHAIN_DIR}/gvsoc && git checkout ${GVSOC_COMMIT_HASH} && \ git submodule update --init --recursive && \ pip install -r core/requirements.txt && pip install -r gapy/requirements.txt diff --git a/TargetLibraries/PULPOpen/CMakeLists.txt b/TargetLibraries/PULPOpen/CMakeLists.txt index 1a510c945b..d136ed3507 100644 --- a/TargetLibraries/PULPOpen/CMakeLists.txt +++ b/TargetLibraries/PULPOpen/CMakeLists.txt @@ -10,7 +10,7 @@ if(NOT DEFINED ENV{PULP_SDK_HOME}) message(FATAL_ERROR "Environment variable PULP_SDK_HOME not set.") endif() -if(platform STREQUAL "Siracusa" OR platform STREQUAL "Siracusa_w_neureka") +if(platform STREQUAL "Siracusa" OR platform STREQUAL "Siracusa_w_neureka" OR platform STREQUAL "Siracusa_w_redmule") include(cmake/pulp-sdk-siracusa.cmake) elseif(platform STREQUAL "PULPOpen") include(cmake/pulp-sdk-pulp-open.cmake) diff --git a/TargetLibraries/PULPOpen/src/Matmul_fp32_Redmule.c 
b/TargetLibraries/PULPOpen/src/Matmul_fp32_Redmule.c new file mode 100644 index 0000000000..ad33b66b43 --- /dev/null +++ b/TargetLibraries/PULPOpen/src/Matmul_fp32_Redmule.c @@ -0,0 +1,160 @@ +// SPDX-FileCopyrightText: 2026 ETH Zurich and University of Bologna +// +// SPDX-License-Identifier: Apache-2.0 + +#include "DeeployBasicMath.h" + +#define REDMULE_BASE_ADDR 0x10201C00 + +#define REG_MNK_M 0x00 +#define REG_MNK_N 0x04 +#define REG_MNK_K 0x08 +#define REG_X_ADDR 0x0C +#define REG_Y_ADDR 0x10 +#define REG_Z_ADDR 0x14 +#define REG_W_ADDR 0x18 +#define REG_COMPUTE_MODE 0x1C +#define REG_TRIGGER 0x20 +#define REG_WAIT 0x28 + +void MatMul_fp32_fp32_fp32_Redmule(const float32_t *__restrict__ pSrcA, + const float32_t *__restrict__ pSrcB, + float32_t *__restrict__ pDstY, uint32_t M, + uint32_t N, uint32_t O) { + + uint32_t total_elements = M * O; + for (uint32_t i = 0; i < total_elements; i++) { + pDstY[i] = 0.0f; + } + + volatile uint16_t *mnk_m = + (volatile uint16_t *)(REDMULE_BASE_ADDR + REG_MNK_M); + volatile uint16_t *mnk_n = + (volatile uint16_t *)(REDMULE_BASE_ADDR + REG_MNK_N); + volatile uint16_t *mnk_k = + (volatile uint16_t *)(REDMULE_BASE_ADDR + REG_MNK_K); + + *mnk_m = (uint16_t)M; + *mnk_n = (uint16_t)N; + *mnk_k = (uint16_t)O; + + volatile uint32_t *x_addr = + (volatile uint32_t *)(REDMULE_BASE_ADDR + REG_X_ADDR); + volatile uint32_t *y_addr = + (volatile uint32_t *)(REDMULE_BASE_ADDR + REG_Y_ADDR); + volatile uint32_t *z_addr = + (volatile uint32_t *)(REDMULE_BASE_ADDR + REG_Z_ADDR); + volatile uint32_t *w_addr = + (volatile uint32_t *)(REDMULE_BASE_ADDR + REG_W_ADDR); + + *x_addr = (uint32_t)((uintptr_t)pSrcA); + *y_addr = (uint32_t)((uintptr_t)pDstY); + *z_addr = (uint32_t)((uintptr_t)pDstY); + *w_addr = (uint32_t)((uintptr_t)pSrcB); + + volatile uint32_t *compute_mode = + (volatile uint32_t *)(REDMULE_BASE_ADDR + REG_COMPUTE_MODE); + *compute_mode = 4; // FP32 mode + + volatile uint32_t *trigger = + (volatile uint32_t *)(REDMULE_BASE_ADDR + 
REG_TRIGGER); + *trigger; + + volatile uint32_t *wait_reg = + (volatile uint32_t *)(REDMULE_BASE_ADDR + REG_WAIT); + uint32_t result = *wait_reg; +} + +void MatMul_fp32_fp32_fp32_Redmule_Async(const float32_t *__restrict__ pSrcA, + const float32_t *__restrict__ pSrcB, + float32_t *__restrict__ pDstY, + uint32_t M, uint32_t N, uint32_t O) { + + uint32_t total_elements = M * O; + for (uint32_t i = 0; i < total_elements; i++) { + pDstY[i] = 0.0f; + } + + volatile uint16_t *mnk_m = + (volatile uint16_t *)(REDMULE_BASE_ADDR + REG_MNK_M); + volatile uint16_t *mnk_n = + (volatile uint16_t *)(REDMULE_BASE_ADDR + REG_MNK_N); + volatile uint16_t *mnk_k = + (volatile uint16_t *)(REDMULE_BASE_ADDR + REG_MNK_K); + + *mnk_m = (uint16_t)M; + *mnk_n = (uint16_t)N; + *mnk_k = (uint16_t)O; + + volatile uint32_t *x_addr = + (volatile uint32_t *)(REDMULE_BASE_ADDR + REG_X_ADDR); + volatile uint32_t *y_addr = + (volatile uint32_t *)(REDMULE_BASE_ADDR + REG_Y_ADDR); + volatile uint32_t *z_addr = + (volatile uint32_t *)(REDMULE_BASE_ADDR + REG_Z_ADDR); + volatile uint32_t *w_addr = + (volatile uint32_t *)(REDMULE_BASE_ADDR + REG_W_ADDR); + + *x_addr = (uint32_t)((uintptr_t)pSrcA); + *y_addr = (uint32_t)((uintptr_t)pDstY); + *z_addr = (uint32_t)((uintptr_t)pDstY); + *w_addr = (uint32_t)((uintptr_t)pSrcB); + + volatile uint32_t *compute_mode = + (volatile uint32_t *)(REDMULE_BASE_ADDR + REG_COMPUTE_MODE); + *compute_mode = 4; // FP32 mode + + volatile uint32_t *trigger = + (volatile uint32_t *)(REDMULE_BASE_ADDR + REG_TRIGGER); + *trigger; // Trigger without waiting +} + +uint32_t MatMul_fp32_fp32_fp32_Redmule_Wait() { + volatile uint32_t *wait_reg = + (volatile uint32_t *)(REDMULE_BASE_ADDR + REG_WAIT); + return *wait_reg; +} + +void Gemm_fp32_fp32_fp32_fp32_Redmule(const float32_t *__restrict__ pSrcA, + const float32_t *__restrict__ pSrcB, + const float32_t *__restrict__ pBias, + float32_t *__restrict__ pDstY, uint32_t M, + uint32_t N, uint32_t O) { + + volatile uint16_t *mnk_m = + 
(volatile uint16_t *)(REDMULE_BASE_ADDR + REG_MNK_M); + volatile uint16_t *mnk_n = + (volatile uint16_t *)(REDMULE_BASE_ADDR + REG_MNK_N); + volatile uint16_t *mnk_k = + (volatile uint16_t *)(REDMULE_BASE_ADDR + REG_MNK_K); + + *mnk_m = (uint16_t)M; + *mnk_n = (uint16_t)N; + *mnk_k = (uint16_t)O; + + volatile uint32_t *x_addr = + (volatile uint32_t *)(REDMULE_BASE_ADDR + REG_X_ADDR); + volatile uint32_t *y_addr = + (volatile uint32_t *)(REDMULE_BASE_ADDR + REG_Y_ADDR); + volatile uint32_t *z_addr = + (volatile uint32_t *)(REDMULE_BASE_ADDR + REG_Z_ADDR); + volatile uint32_t *w_addr = + (volatile uint32_t *)(REDMULE_BASE_ADDR + REG_W_ADDR); + + *x_addr = (uint32_t)((uintptr_t)pSrcA); + *y_addr = (uint32_t)((uintptr_t)pBias); + *z_addr = (uint32_t)((uintptr_t)pDstY); + *w_addr = (uint32_t)((uintptr_t)pSrcB); + + volatile uint32_t *compute_mode = + (volatile uint32_t *)(REDMULE_BASE_ADDR + REG_COMPUTE_MODE); + *compute_mode = 4; // FP32 mode + + volatile uint32_t *trigger = + (volatile uint32_t *)(REDMULE_BASE_ADDR + REG_TRIGGER); + *trigger; + + volatile uint32_t *wait_reg = + (volatile uint32_t *)(REDMULE_BASE_ADDR + REG_WAIT); + uint32_t result = *wait_reg; +}