From 985562a9700f7fce461ef68435fbde7903fd1ce1 Mon Sep 17 00:00:00 2001
From: Damien Pretet
Date: Wed, 20 Dec 2023 22:07:56 +0100
Subject: [PATCH] Document atomic operation support

---
 doc/atomic_ops.md               | 177 ++++++++++++++++
 doc/axi_id_ordering.md          |  71 +++++++
 doc/friscv.drawio               | 359 ++++++++++++++++----------------
 doc/ordering_cache.md           |  31 +++
 doc/project_mgt_hw.md           |  10 +-
 rtl/friscv_dcache.sv            |   1 +
 rtl/friscv_rv32i_core.sv        |  15 +-
 test/common/friscv_testbench.sv |   1 +
 8 files changed, 478 insertions(+), 187 deletions(-)
 create mode 100644 doc/atomic_ops.md
 create mode 100644 doc/axi_id_ordering.md
 create mode 100644 doc/ordering_cache.md

diff --git a/doc/atomic_ops.md b/doc/atomic_ops.md
new file mode 100644
index 0000000..44cea0a
--- /dev/null
+++ b/doc/atomic_ops.md
@@ -0,0 +1,177 @@

## Overview

The aim of this development (started from v1.6.1) is to support the atomic operation instructions.
Atomic operations bring the synchronization techniques required by kernels. The goal for FRISCV is
to be able to boot a kernel like FreeRTOS or Linux (without MMU) and make the core a platform for
real-world use cases.

From [OS dev wiki](https://wiki.osdev.org/Atomic_operation):

    An atomic operation is an operation that will always be executed without any other process being
    able to read or change state that is read or changed during the operation. It is effectively
    executed as a single step, and is an important quality in a number of algorithms that deal with
    multiple independent processes, both in synchronization and algorithms that update shared data
    without requiring synchronization.

For a single-core system:

    If an operation requires multiple CPU instructions, then it may be interrupted in the middle of
    executing. If this results in a context switch (or if the interrupt handler refers to data that
    was being used) then atomicity could be compromised. It is possible to use any standard locking
    technique (e.g. a spinlock) to prevent this, but may be inefficient. If it is possible, disabling
    interrupts may be the most efficient method of ensuring atomicity (although note that this may
    increase the worst-case interrupt latency, which could be problematic if it becomes too long).

For a multi-core system:

    In multiprocessor systems, ensuring atomicity exists is a little harder. It is still possible to
    use a lock (e.g. a spinlock) the same as on single processor systems, but merely using a single
    instruction or disabling interrupts will not guarantee atomic access. You must also ensure that
    no other processor or core in the system attempts to access the data you are working with.

[Wiki Linearizability](https://en.m.wikipedia.org/wiki/Linearizability)

[Wiki Load-link/Store-Conditional](https://en.wikipedia.org/wiki/Load-link/store-conditional)

In summary, an atomic operation can be useful to:
- synchronize threads within a core
- synchronize cores in a SoC
- ensure a memory location can be read then updated in any situation, including exception
  handling, and avoid any hazard

Atomic operations will be implemented in the load/store stage (`memfy`). The dCache stage will also
be updated to better support `ACACHE`, slightly change `AID` handling and put in place exclusive
access support (a new special routing). Finally, the AXI memory model needs to support this new
access type.
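
To illustrate the kind of synchronization primitive the "A" extension makes possible, here is a
minimal, hypothetical spinlock sketch in C using the GCC/Clang `__atomic` builtins. It is not part
of FRISCV or of any kernel mentioned above; the `lock_word` variable and function names are
placeholders. On an RV32 toolchain built with the "A" extension, the exchange typically lowers to
an `amoswap.w.aq` loop.

```c
#include <stdint.h>

/* Hypothetical lock word: 0 = free, 1 = taken */
static volatile uint32_t lock_word = 0;

static inline void spin_lock(volatile uint32_t *lock)
{
    /* Atomically swap 1 into the lock and loop while the previous value was 1,
       i.e. while another hart or thread already owns it. */
    while (__atomic_exchange_n(lock, 1u, __ATOMIC_ACQUIRE) != 0u)
        ;
}

static inline void spin_unlock(volatile uint32_t *lock)
{
    /* Release the lock; release ordering typically maps to a fence before the store. */
    __atomic_store_n(lock, 0u, __ATOMIC_RELEASE);
}

/* Example usage around a critical section */
void update_shared_counter(volatile uint32_t *counter)
{
    spin_lock(&lock_word);
    *counter += 1;              /* protected read-modify-write */
    spin_unlock(&lock_word);
}
```

Primitives of exactly this shape are what a FreeRTOS or Linux port relies on, which is why the
`memfy` and dCache changes described below matter.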

## Implementation

From [Y-Combinator](https://news.ycombinator.com/item?id=27674238)

LR/SC stands for load-reserved/store-conditional, also called load-linked/store-conditional.
In a traditional atomic implementation using Compare-and-Swap, the order of execution is as follows:

1. Read value X into register A.
2. Do computation using register A, creating a new value in register B.
3. Do a compare-and-swap on value X: If X == A, then set X to B. The operation was successful. If X
   != A, another thread changed X while we were using it, so the operation failed. Rollback and
   retry.

This suffers from the ABA problem: it does not detect the case where another thread changes X to a
new value C, but then changes it back to A before the compare-and-swap happens. LR/SC avoids this
because the store-conditional fails whenever the reservation taken by the load-reserved has been
lost, whatever value the location holds.

[Google Group](https://groups.google.com/a/groups.riscv.org/g/isa-dev/c/bdiZ9QANeQM?pli=1a)


## RISC-V Specification v1.0 - Chapter 8 - “A” Standard Extension

Instructions that atomically read-modify-write memory to support synchronization between multiple
RISC-V harts running in the same memory space.

The two forms of atomic instruction provided are load-reserved/store-conditional instructions and
atomic fetch-and-op memory instructions.

### Ordering

The base RISC-V ISA has a relaxed memory model, with the FENCE instruction used to impose additional
ordering constraints. The address space is divided by the execution environment into memory and I/O
domains, and the FENCE instruction provides options to order accesses to one or both of these two
address domains.

To provide more efficient support for release consistency, each atomic instruction has two bits,
aq and rl, used to specify additional memory ordering constraints as viewed by other RISC-V harts.

If both bits are clear, no additional ordering constraints are imposed on the atomic memory
operation.

If only the aq bit is set, the atomic memory operation is treated as an acquire access,
i.e., no following memory operations on this RISC-V hart can be observed to take place before the
acquire memory operation.

=> All later memory instructions must be executed after the AMO.

If only the rl bit is set, the atomic memory operation is treated as a release access, i.e., the
release memory operation cannot be observed to take place before any earlier memory operations on
this RISC-V hart.

=> All earlier memory instructions must be executed before the AMO.

If both the aq and rl bits are set, the atomic memory operation is sequentially consistent and
cannot be observed to happen before any earlier memory operations or after any later memory
operations in the same RISC-V hart and to the same address domain.

=> Earlier memory instructions must be executed before the AMO, and later ones after it.


## Design Plan

- Document and list all AXI usage and limitations in the IP.
- The core, `memfy` and `dCache` stages, will be updated regarding `AID` usage. Please refer
  to the [AMBA spec notes](./axi_id_ordering.md) for further details on `AID` usage and the
  ordering model.


### Global Update

- Add ALOCK across the core & the platform
- Resize ALOCK to 1 bit in the interconnect

### Processing Unit

Nothing expected to change.

### Memfy Unit

When the `memfy` unit receives an atomic operation:
- it reserves its `rs1`/`rs2`/`rd` registers in the processing scheduler
- it issues a read request to a memory register with:
  - a specific `AID` (e.g. `0x50`), dedicated to exclusive access
  - `ALOCK=0x1`, making the request an `exclusive access`
  - `ACACHE=0x0`, making the request `non-cacheable` and `non-bufferable`, i.e. a `device` access
- it executes the atomic operation
- it issues to memory a request with the same attributes as the read operation:
  - a write request to update the memory register
  - a read request to release the memory register

### dCache Unit

Needs to support exclusive access:
- Exclusive access is a `device` access (`non-cacheable` and `non-bufferable`), with a
  read/write-through policy
- Don't replace the ID for exclusive access
- Invalidate the cache line if an exclusive access hits in the cache. Even if the memory map should
  already assign proper attributes to a memory cell, this eases software design without requiring
  hardware knowledge
- The dCache will not be responsible for concurrency between exclusive and regular accesses;
  memfy needs to handle the requests correctly

### AXI Memory

- Upgrade to AXI4
- Support exclusive access, managed by a dedicated LUT
  - Reserve on the first access
  - Release on a second one (either read or write)
  - Based on ID and address
  - Release exclusivity if a non-exclusive write targets a reserved-exclusive location
- Correctly support in-order completion if the same ID is issued multiple times

### Core

- Upgrade interfaces to AXI4

### Platform

- Upgrade to AXI interconnect


## Test Plan

- An atomic operation can't be interrupted while the control unit manages async/sync exceptions
- Check ordering with all aq & rl bit combinations
- Use an unaligned address to raise an exception
- Read-exclusive followed by a non-exclusive write, to check exclusivity in RAM
- Concurrent exclusive accesses, to check exclusivity in RAM
- Write applications
  - https://begriffs.com/posts/2020-03-23-concurrent-programming.html
  - see the books / PDFs about OS and semaphore topics

diff --git a/doc/axi_id_ordering.md b/doc/axi_id_ordering.md
new file mode 100644
index 0000000..d4cc1ea
--- /dev/null
+++ b/doc/axi_id_ordering.md
@@ -0,0 +1,71 @@
# AMBA AXI ID & Ordering

## AXI Transaction Identifier

### Overview

The AXI protocol includes AXI ID transaction identifiers. A Manager can use these to identify
separate transactions that must be returned in order. All transactions with a given AXI ID value
must remain ordered, but there is no restriction on the ordering of transactions with different ID
values.

A single physical port can support out-of-order transactions by acting as a number of logical ports,
each handling its transactions in order.

By using AXI IDs, a Manager can issue transactions without waiting for earlier transactions to
complete. This can improve system performance, because it enables parallel processing of
transactions.

There is no requirement for Subordinates or Managers to use AXI transaction IDs. Managers and
Subordinates can process one transaction at a time. Transactions are processed in the order they are
issued.

Subordinates are required to reflect back, on the appropriate BID or RID response, the AXI ID
received from a Manager.

### Read Data Ordering

The Subordinate must ensure that the RID value of any returned data matches the ARID value of the
address that it is responding to.

The interconnect must ensure that the read data from a sequence of transactions with the same ARID
value targeting different Subordinates is received by the Manager in the order that it issued the
addresses.
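
To make this rule concrete, here is a minimal, hypothetical C sketch of a scoreboard-style checker.
It is not part of the FRISCV testbench; `NB_ID`, `DEPTH` and the `tag` encoding are placeholders.
It verifies that read data with the same ID returns in issue order, while placing no constraint
across different IDs.

```c
#include <assert.h>
#include <stdint.h>

#define NB_ID 16   /* assumed number of ARID values in play   */
#define DEPTH  8   /* assumed max outstanding reads per ID    */

/* One FIFO of expected transaction tags per ARID value */
typedef struct {
    uint32_t tag[DEPTH];
    int      head, tail, count;
} id_fifo_t;

static id_fifo_t fifo[NB_ID];

/* Called when the Manager issues a read address: remember the issue order */
void on_ar_issue(uint8_t arid, uint32_t tag)
{
    id_fifo_t *f = &fifo[arid % NB_ID];
    assert(f->count < DEPTH);
    f->tag[f->tail] = tag;
    f->tail = (f->tail + 1) % DEPTH;
    f->count++;
}

/* Called when read data returns: same-ID responses must pop in issue order */
void on_r_return(uint8_t rid, uint32_t tag)
{
    id_fifo_t *f = &fifo[rid % NB_ID];
    assert(f->count > 0);            /* response without a pending address          */
    assert(f->tag[f->head] == tag);  /* out-of-order within one ID: ordering broken */
    f->head = (f->head + 1) % DEPTH;
    f->count--;
}
```

A real verification component would also track the expected data and the targeted Subordinate, but
the per-ID FIFO is the essence of the ordering model.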

The read data reordering depth is the number of addresses pending in the Subordinate that can be
reordered. A Subordinate that processes all transactions in order has a read data reordering depth
of one. The read data reordering depth is a static value that must be specified by the designer of
the Subordinate.

There is no mechanism that a Manager can use to determine the read data reordering depth of a
Subordinate.

### Write Data Ordering

A Manager must issue write data in the same order that it issues the transaction addresses.

An interconnect that combines write transactions from different Managers must ensure that it
forwards the write data in address order.


### Interconnect Use of Transaction Identifiers

When a Manager is connected to an interconnect, the interconnect appends additional bits to the
ARID, AWID and WID identifiers that are unique to that Manager port. This has two effects:

- Managers do not have to know what ID values are used by other Managers because the interconnect
  makes the ID values used by each Manager unique by appending the Manager number to the original
  identifier.
- The ID identifier at a Subordinate interface is wider than the ID identifier at a Manager
  interface.

For responses, the interconnect uses the additional bits of the xID identifier to determine which
Manager port the response is destined for. The interconnect removes these bits of the xID
identifier before passing the xID value to the correct Manager port.


#### Master

#### Slave

#### Interconnect

diff --git a/doc/friscv.drawio b/doc/friscv.drawio
index 78cfd75..d0c11fa 100644
--- a/doc/friscv.drawio
+++ b/doc/friscv.drawio
(drawio XML diagram changes omitted)

diff --git a/doc/ordering_cache.md b/doc/ordering_cache.md
new file mode 100644
index 0000000..76caff0
--- /dev/null
+++ b/doc/ordering_cache.md
@@ -0,0 +1,31 @@
## Future Enhancement: Ordering

Rework AXI support for ID management and cache

MPU:
- support cacheability of a region
- support shareability of a region
- IO & memory type of a region (replace the old implementation)

Memfy:
- Manage multiple requests with a LUT (see memfy_opt branch)
- Should be able to manage completion reordering (possible enhancement)

dCache:
- The cache should be able to manage different IDs and not substitute them all the time, for better
  performance. Reordering should be done only across different IDs.
- Use WRAP mode for read requests, INCR for write requests

For better ID management, the Cortex-M7 AXI ID usage can serve as an example:

Read IDs:
- ID 0: Normal Non-Cacheable, Device and Strongly-ordered reads
- ID 2: Data cache linefills from linefill buffer 0
- ID 3: Data cache linefills from linefill buffer 1
- ID 4: Any instruction fetch

Write IDs:
- ID 0: Normal non-cacheable memory and all store-exclusive transactions
- ID 1: Stores to Normal cacheable memory
- ID 2: Writes to Device or Strongly-ordered memory
- ID 3: Evictions to Normal cacheable Write-Back memory

diff --git a/doc/project_mgt_hw.md b/doc/project_mgt_hw.md
index d0f7ff4..f5563c8 100644
--- a/doc/project_mgt_hw.md
+++ b/doc/project_mgt_hw.md
@@ -5,9 +5,6 @@
 - [X] Support U-mode
 - [X] Support PMP/PMA
 - [X] https://github.com/eembc/coremark
-- [ ] Advanced Interrupt controller
-- [ ] AXI ERR handling
-- [ ] AXI EXOKAY handling
 - [ ] Atomic operations
     - stage to execute the instruction, controlling ldst Stages
     - memfy exposes two interfaces for requests.
@@ -26,6 +23,12 @@ Any new features should be carefully study to ensure a proper exception and inte
 ## Memory

+- [ ] Bus fault to be routed to exceptions https://lists.riscv.org/g/tech-privileged/topic/80351141
+- [ ] Support posted write
+    - memfy issues write requests without waiting for BRESP
+    - Manage concurrency with read requests
+    - Manage hazards
+    - https://medium.com/@ryoungsunpark/why-axi-has-a-write-response-channel-a0fe26eebf7d
 - [ ] Better manage ACACHE attribute
     - [ ] Correct value driven from memfy
     - [ ] Use it correctly across the cache
@@ -55,6 +58,7 @@ Any new features should be carefully study to ensure a proper exception and inte
 ## Cache Stages

+- [ ] Add a dedicated RAM for the cache, not connected through the AXI interconnect
 - [ ] AXI4 + Wrap mode for read
 - [ ] Support datapath adaptation from memory controller
 - [ ] Narrow transfer support?
diff --git a/rtl/friscv_dcache.sv b/rtl/friscv_dcache.sv
index e7f417a..b1ccfc4 100644
--- a/rtl/friscv_dcache.sv
+++ b/rtl/friscv_dcache.sv
@@ -651,6 +651,7 @@ module friscv_dcache
         .aresetn        (aresetn),
         .srst           (srst),
         .ready          (cache_ready),
+        // unconnected while dcache can't be sw-flushed
         .flush_blocks   (1'b0),
         .flush_ack      (),
         .flushing       (flushing),
diff --git a/rtl/friscv_rv32i_core.sv b/rtl/friscv_rv32i_core.sv
index 327fbae..7ec4c4e 100644
--- a/rtl/friscv_rv32i_core.sv
+++ b/rtl/friscv_rv32i_core.sv
@@ -86,11 +86,12 @@ module friscv_rv32i_core
     parameter PMPADDR13_INIT = 32'h0,
     parameter PMPADDR14_INIT = 32'h0,
     parameter PMPADDR15_INIT = 32'h0,
+    // Virtual memory support
     parameter MMU_SUPPORT = 0,
-    // IO regions for direct read/write access
-    parameter IO_MAP_NB = 0,
+    // Memory IO regions for device, non-cacheable, read/write access
+    parameter IO_MAP_NB = 1,
     // IO address ranges, organized by memory region as END-ADDR_START-ADDR:
     // > 0xEND-MEM2_START-MEM2_END-MEM1_START-MEM1_END-MEM0_START-MEM0
     // IO mapping can be contiguous or sparse, no restriction on the number,
@@ -204,7 +205,6 @@ module friscv_rv32i_core
     logic [5    -1:0]             ctrl_rd_addr;
     logic [XLEN -1:0]             ctrl_rd_val;
-    // ISA registers interface
     logic [NB_ALU_UNIT*5    -1:0] proc_rs1_addr;
     logic [NB_ALU_UNIT*XLEN -1:0] proc_rs1_val;
     logic [NB_ALU_UNIT*5    -1:0] proc_rs2_addr;
@@ -279,11 +279,10 @@ module friscv_rv32i_core
     logic [`CSR_SB_W  -1:0] csr_sb;
     logic [`CTRL_SB_W -1:0] ctrl_sb;
-
-    logic [AXI_ADDR_W -1:0] mpu_imem_addr;
-    logic [AXI_ADDR_W -1:0] mpu_dmem_addr;
-    logic [4          -1:0] mpu_imem_allow;
-    logic [4          -1:0] mpu_dmem_allow;
+    logic [AXI_ADDR_W -1:0] mpu_imem_addr;
+    logic [AXI_ADDR_W -1:0] mpu_dmem_addr;
+    logic [4          -1:0] mpu_imem_allow;
+    logic [4          -1:0] mpu_dmem_allow;

     //////////////////////////////////////////////////////////////////////////
     // Check parameters setup consistency and break up if not supported
diff --git a/test/common/friscv_testbench.sv b/test/common/friscv_testbench.sv
index a424e1c..699eee7 100644
--- a/test/common/friscv_testbench.sv
+++ b/test/common/friscv_testbench.sv
@@ -423,6 +423,7 @@ module friscv_testbench(
        .DCACHE_BLOCK_W     (DCACHE_BLOCK_W),
        .DCACHE_PREFETCH_EN (DCACHE_PREFETCH_EN),
        .DCACHE_DEPTH       (DCACHE_DEPTH),
+       .IO_MAP_NB          (0),
        .MPU_SUPPORT        (MPU_SUPPORT),
        .PMPCFG0_INIT       (PMPCFG0_INIT),
        .PMPCFG1_INIT       (PMPCFG1_INIT),