[otbn,sim] Add masking accelerator interface simulator implementation

etterli · etterli · commit 3cb66f5116ae · 2026-01-14T16:52:53.000+01:00
This adds the simulator implementation of the masking accelerator interface
(MAI). The MAI allows the OTBN to offload A2B, B2A or secAdd computations to
hardened accelerators.

Signed-off-by: Pascal Etterli <pascal.etterli@lowrisc.org>
diff --git a/hw/ip/otbn/dv/otbnsim/sim/constants.py b/hw/ip/otbn/dv/otbnsim/sim/constants.py
@@ -32,6 +32,7 @@ class ErrBits(IntEnum):
     KEY_INVALID = 1 << 5
     RND_REP_CHK_FAIL = 1 << 6
     RND_FIPS_CHK_FAIL = 1 << 7
+    MAI_ERROR = 1 << 8
     IMEM_INTG_VIOLATION = 1 << 16
     DMEM_INTG_VIOLATION = 1 << 17
     REG_INTG_VIOLATION = 1 << 18
diff --git a/hw/ip/otbn/dv/otbnsim/sim/insn.py b/hw/ip/otbn/dv/otbnsim/sim/insn.py
@@ -434,11 +434,18 @@ def execute(self, state: OTBNState) -> Optional[Iterator[None]]:
                 # There's a pending EDN request. Stall for a cycle.
                 yield None
 
-        # At this point, the CSR is ready. Read, update and write back to grs1.
+        # At this point, the CSR is ready. Read it to grd.
         old_val = state.read_csr(self.csr)
-        new_val = old_val | bits_to_set
         state.gprs.get_reg(self.grd).write_unsigned(old_val)
+
+        # If the CSR should be updated, compute the new value, check that the
+        # update is allowed and write it back.
         if self.grs1 != 0:
+            new_val = old_val | bits_to_set
+            if self.csr == 0x7f0:
+                if not state.mai.is_valid_ctrl_change(new_val):
+                    state.stop_at_end_of_cycle(ErrBits.MAI_ERROR)
+                    return None
             state.write_csr(self.csr, new_val)
 
         return None
@@ -479,6 +486,12 @@ def execute(self, state: OTBNState) -> Optional[Iterator[None]]:
             old_val = state.read_csr(self.csr)
             state.gprs.get_reg(self.grd).write_unsigned(old_val)
 
+        # Check if the write to MAI_CTRL is allowed.
+        if self.csr == 0x7f0:
+            if not state.mai.is_valid_ctrl_change(new_val):
+                state.stop_at_end_of_cycle(ErrBits.MAI_ERROR)
+                return None
+
         state.write_csr(self.csr, new_val)
         return None
 
@@ -1271,6 +1284,13 @@ def execute(self, state: OTBNState) -> None:
             state.stop_at_end_of_cycle(ErrBits.ILLEGAL_INSN)
             return None
 
+        # Check if the MAI is ready to accept new inputs. If not, stop with
+        # a MAI error.
+        if self.wsr in [12, 13, 14, 15]:
+            if not state.mai.ready_for_inputs():
+                state.stop_at_end_of_cycle(ErrBits.MAI_ERROR)
+                return None
+
         val = state.wdrs.get_reg(self.wrs).read_unsigned()
         state.wsrs.write_at_idx(self.wsr, val)
 
diff --git a/hw/ip/otbn/dv/otbnsim/sim/mai.py b/hw/ip/otbn/dv/otbnsim/sim/mai.py
@@ -0,0 +1,313 @@
+# Copyright lowRISC contributors (OpenTitan project).
+# Licensed under the Apache License, Version 2.0, see LICENSE for details.
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Optional, Tuple
+from collections import deque
+
+from .csr import CSRFile, MaiOperation
+from .wsr import WSRFile, DumbWSR
+
+# The masking accelerator interface (MAI) emulates the behavior of the interface and the actual
+# accelerators.
+
+# Enable or disable assertions which check that the inputs and outputs of the accelerators
+# meet certain constraints (e.g., being smaller than the modulus).
+CHECK_ACCELERATOR_CONSTRAINTS = False
+
+
+class MaskingAccelerator:
+    '''Models a masking accelerator which has a simple pipeline.
+    New operations can be pushed to the accelerator, and results can be popped from it.
+    Each step of the simulation advances the pipeline by one stage.
+    '''
+
+    def __init__(self, latency: int, mod_wsr: DumbWSR) -> None:
+        # Latency of the masking accelerator in cycles.
+        self.latency = latency
+
+        # The MOD WSR is used to get the current modulus for operations.
+        self.mod_wsr = mod_wsr
+
+        # The pipeline contains the two result shares and is modeled with a deque where None
+        # indicates an empty slot.
+        self.pipeline: deque[Optional[Tuple[int, int]]]
+        self.pipeline = deque([None] * self.latency, self.latency)
+
+    def push(self, in0_s0: int, in0_s1: int, in1_s0: int, in1_s1: int) -> bool:
+        '''Try to push an operation to the masking accelerator pipeline.
+
+        Returns True if the accelerator can accept it (free pipeline slot), False otherwise.
+        '''
+        # This accelerator implementation features no backpressure, so we always accept new
+        # operations. Pop the leftmost pipeline slot and replace it with the new operation result.
+        # The result is computed immediately but will only be available after the full pipeline
+        # latency.
+        self.pipeline.popleft()
+        self.pipeline.appendleft(self._compute(in0_s0, in0_s1, in1_s0, in1_s1))
+        return True
+
+    def pop(self) -> Optional[Tuple[int, int]]:
+        '''Read the current output of the masking accelerator pipeline.'''
+        # We only peek at the pipeline output here; advancing the pipeline is modelled in step().
+        return self.pipeline[-1]
+
+    def step(self) -> None:
+        '''Advance the pipeline by one stage if possible.'''
+        # This accelerator implementation features no backpressure, so we always advance the
+        # pipeline. We insert an unused pipeline slot which is replaced in case a new item is
+        # pushed. appendleft() will drop the rightmost item automatically.
+        self.pipeline.appendleft(None)
+
+    def is_busy(self) -> bool:
+        '''Return True if the accelerator is busy (has pending operations), False otherwise.'''
+        # The accelerator is busy if there is at least one non-None item in the pipeline.
+        return any(slot is not None for slot in self.pipeline)
+
+    def _modulus(self) -> int:
+        '''Return the current 32-bit modulus from the modulus WSR.'''
+        return self.mod_wsr.read_unsigned() & ((1 << 32) - 1)
+
+    def _compute(self, in0_s0: int, in0_s1: int, in1_s0: int, in1_s1: int) -> Tuple[int, int]:
+        '''Compute the result of the masking operation.'''
+        raise NotImplementedError
+
+
+class A2BAccelerator(MaskingAccelerator):
+    def __init__(self, mod_wsr: DumbWSR):
+        super().__init__(8, mod_wsr)
+
+    def _compute(self, in0_s0: int, in0_s1: int, in1_s0: int, in1_s1: int) -> Tuple[int, int]:
+        # The current placeholder implementation removes the arithmetic mask and adds a new boolean
+        # mask. We use a fixed mask until the exact design is known.
+        #
+        # Input: (x - s mod q, s),  (x - s) + s mod q = x,  0 <= s < q
+        # Output: (x XOR r, r),  x XOR r XOR r = x, 0 <= x, r < q <  2^k
+
+        # in1_s0 and in1_s1 are not used by the A2B accelerator
+
+        s = in0_s1
+        # We take a zero mask for simplicity and to avoid assertion errors due to modulus being
+        # smaller than this fixed constant.
+        r = 0
+        secret = (in0_s0 + s) % self._modulus()
+        masked_secret = (secret ^ r)
+
+        # Optionally, we crash if the constraints are not met.
+        if CHECK_ACCELERATOR_CONSTRAINTS:
+            assert self._modulus() < 2**32
+            assert 0 <= s < self._modulus()
+            assert 0 <= r < self._modulus()
+            assert 0 <= secret < self._modulus()
+
+        # Limit results to 32 bits
+        masked_secret &= ((1 << 32) - 1)
+        r &= ((1 << 32) - 1)
+        return (masked_secret, r)
+
+
+class B2AAccelerator(MaskingAccelerator):
+    def __init__(self, mod_wsr: DumbWSR):
+        super().__init__(7, mod_wsr)
+
+    def _compute(self, in0_s0: int, in0_s1: int, in1_s0: int, in1_s1: int) -> Tuple[int, int]:
+        # The current placeholder implementation removes the boolean mask and adds a new arithmetic
+        # mask. We use a fixed mask until the exact design is known.
+        #
+        # Input: (x XOR r, r), 0 <= x, r < q <  2^k
+        # Output: (x - s mod q, s),  (x - s) + s mod q = x,  0 <= s < q
+
+        # in1_s0 and in1_s1 are not used by the B2A accelerator
+
+        # We take a zero mask for simplicity and to avoid assertion errors due to modulus being
+        # smaller than this fixed constant.
+        s = 0
+        r = in0_s1
+
+        secret = in0_s0 ^ r
+        masked_secret = (secret - s) % self._modulus()
+
+        # Optionally, we crash if the constraints are not met.
+        if CHECK_ACCELERATOR_CONSTRAINTS:
+            assert self._modulus() < 2**32
+            assert 0 <= in0_s0 < self._modulus()
+            assert 0 <= r < self._modulus()
+            assert 0 <= s < self._modulus()
+
+        # Limit results to 32 bits
+        masked_secret &= ((1 << 32) - 1)
+        s &= ((1 << 32) - 1)
+        return (masked_secret, s)
+
+
+class SecAddModqAccelerator(MaskingAccelerator):
+    def __init__(self, mod_wsr: DumbWSR):
+        super().__init__(9, mod_wsr)
+
+    def _compute(self, in0_s0: int, in0_s1: int, in1_s0: int, in1_s1: int) -> Tuple[int, int]:
+        # The current placeholder implementation removes the boolean masks, adds the secrets
+        # modulo q, and adds a new boolean mask. We use a fixed mask until the exact design is
+        # known.
+        #
+        # Input: (x XOR r1, r1), (y XOR s1, s1), 0 <= x, y, r1, s1 < q < 2^k
+        # Output: ((x + y mod q) XOR t, t)
+        r1 = in0_s1
+        s1 = in1_s1
+        # We take a zero mask for simplicity and to avoid assertion errors due to modulus being
+        # smaller than this fixed constant.
+        t = 0
+
+        x = in0_s0 ^ r1
+        y = in1_s0 ^ s1
+        sum = (x + y) % self._modulus()
+        masked_sum = sum ^ t
+
+        if CHECK_ACCELERATOR_CONSTRAINTS:
+            assert self._modulus() < 2**32
+            assert 0 <= x < self._modulus()
+            assert 0 <= y < self._modulus()
+            assert 0 <= r1 < self._modulus()
+            assert 0 <= s1 < self._modulus()
+            assert 0 <= t < self._modulus()
+
+        # Limit results to 32 bits
+        masked_sum &= ((1 << 32) - 1)
+        t &= ((1 << 32) - 1)
+        return (masked_sum, t)
+
+
+class MaskingAcceleratorInterface:
+    def __init__(self, csrs: CSRFile, wsrs: WSRFile) -> None:
+
+        # The CSRs and WSRs
+        self.csrs = csrs
+        self.wsrs = wsrs
+        self.mai_ctrl = self.csrs.MaiCtrl
+        self.mai_status = self.csrs.MaiStatus
+        self.mai_res_s0 = self.wsrs.MaiResS0
+        self.mai_res_s1 = self.wsrs.MaiResS1
+        self.mai_in0_s0 = self.wsrs.MaiIn0S0
+        self.mai_in0_s1 = self.wsrs.MaiIn0S1
+        self.mai_in1_s0 = self.wsrs.MaiIn1S0
+        self.mai_in1_s1 = self.wsrs.MaiIn1S1
+
+        # All available accelerators are instantiated here in a dictionary.
+        # The currently active accelerator is selected based on the operation field in MAI_CTRL.
+        # Changing the operation while an operation is ongoing is not allowed (see
+        # is_valid_ctrl_change). Thus, the step() method can simply read the operation field each
+        # cycle to get the current accelerator like this:
+        # self._all_accelerators[self.mai_ctrl.read_operation()]
+        self._all_accelerators = {
+            MaiOperation.A2B: A2BAccelerator(self.wsrs.MOD),
+            MaiOperation.B2A: B2AAccelerator(self.wsrs.MOD),
+            MaiOperation.SECADD: SecAddModqAccelerator(self.wsrs.MOD),
+        }
+
+        # Dispatch related variables
+        # The dispatch logic is responsible for pushing inputs into the accelerator.
+        self._dispatch_idx = 0
+        self.is_dispatching = False
+
+        # Writeback related variables
+        # The writeback logic is responsible for receiving results from the accelerator into the
+        # output WSRs.
+        self._writeback_idx = 0
+
+    def _accelerator(self) -> MaskingAccelerator:
+        '''Return the currently selected masking accelerator based on the operation field.'''
+        return self._all_accelerators[self.mai_ctrl.read_operation()]
+
+    def step(self) -> None:
+        '''Advance the MAI simulation by one cycle.
+
+        This is expected to be called before the current instruction executes / steps.
+        '''
+        ###################
+        # Writeback logic #
+        ###################
+        # Get the newest result and write it into the output WSRs. This is done before
+        # advancing the pipeline to model the fact that the result is available at
+        # the start of the cycle.
+        results = self._accelerator().pop()
+        if results is not None:
+            # Write to the output WSRs
+            self.mai_res_s0.set_32bit_unsigned(results[0], self._writeback_idx)
+            self.mai_res_s1.set_32bit_unsigned(results[1], self._writeback_idx)
+            self._writeback_idx += 1
+
+        # Detect if we finished writing back
+        if self._writeback_idx >= 8:
+            self._writeback_idx = 0
+            # If we are finishing the writeback, reset the busy bit. The write method updates
+            # the bit only when the changes are committed, so the current instruction still
+            # reads the old value.
+            self.mai_status.write_busy_bit(False)
+
+        ######################
+        # Accelerator update #
+        ######################
+        # Advance the accelerator pipeline.
+        self._accelerator().step()
+
+        #################
+        # Start logic   #
+        #################
+        # Start a new operation if the start bit was set in the last cycle.
+        if self.mai_ctrl.read_start_bit():
+            # Begin pushing inputs in the dispatch logic
+            self.is_dispatching = True
+            # Immediately set the busy bit such that the current instruction reads it as set.
+            self.mai_status.set_busy_bit(True)
+            # Immediately reset the ready bit such that the current instruction reads it as reset
+            # and any configuration change check does not allow changing the operation type.
+            self.mai_status.set_ready_bit(False)
+            # Immediately reset the start bit such that it always reads zero.
+            self.mai_ctrl.set_start_bit(False)
+
+        ##################
+        # Dispatch logic #
+        ##################
+        if self.is_dispatching:
+            self._accelerator().push(self.mai_in0_s0.read_32bit_unsigned(self._dispatch_idx),
+                                     self.mai_in0_s1.read_32bit_unsigned(self._dispatch_idx),
+                                     self.mai_in1_s0.read_32bit_unsigned(self._dispatch_idx),
+                                     self.mai_in1_s1.read_32bit_unsigned(self._dispatch_idx))
+            self._dispatch_idx += 1
+
+        # Detect if we have finished dispatching
+        if self._dispatch_idx >= 8:
+            self._dispatch_idx = 0
+            self.is_dispatching = False
+            # Set the ready bit at the end of this cycle. This indicates that new inputs can be
+            # accepted.
+            self.mai_status.write_ready_bit(True)
+
+    def is_busy(self) -> bool:
+        '''Returns whether the MAI is currently busy processing an operation.'''
+        return self.mai_status.read_busy_bit()
+
+    def is_ready(self) -> bool:
+        '''Returns whether the MAI is ready to accept new inputs.'''
+        return self.mai_status.read_ready_bit()
+
+    def ready_for_inputs(self) -> bool:
+        return self.is_ready()
+
+    def ready_to_start(self) -> bool:
+        return not self.is_busy()
+
+    def is_valid_ctrl_change(self, value: int) -> bool:
+        '''Return whether writing value to the MAI_CTRL CSR is currently allowed.'''
+        # Starting is only allowed if the MAI is not busy (see ready_to_start()).
+        if self.mai_ctrl.would_set_start_bit(value) and not self.ready_to_start():
+            return False
+
+        # We only allow setting the operation to valid options.
+        if not self.mai_ctrl.is_valid_operation(value):
+            return False
+
+        # Changing the operation is only allowed if MAI is not busy / no operation is ongoing.
+        if self.mai_ctrl.would_change_op(value) and self.is_busy():
+            return False
+
+        return True
diff --git a/hw/ip/otbn/dv/otbnsim/sim/state.py b/hw/ip/otbn/dv/otbnsim/sim/state.py
@@ -15,6 +15,7 @@
 from .flags import FlagReg
 from .gpr import GPRs
 from .loop import LoopStack
+from .mai import MaskingAcceleratorInterface
 from .reg import RegFile
 from .trace import Trace, TracePC
 from .wsr import WSRFile
@@ -200,6 +201,9 @@ def __init__(self) -> None:
         # random data).
         self.edn_seen_running = False
 
+        # The masking accelerator interface (MAI) handles the accelerators
+        self.mai = MaskingAcceleratorInterface(self.csrs, self.wsrs)
+
     def get_next_pc(self) -> int:
         if self._pc_next_override is not None:
             return self._pc_next_override
@@ -308,6 +312,7 @@ def step(self, handle_injected_error: bool) -> None:
             self.take_injected_err_bits()
         self.ext_regs.step()
         self._urnd_client.step()
+        self.mai.step()
 
     def commit(self, sim_stalled: bool) -> None:
         if self._time_to_imem_invalidation is not None: