From 3583a6a2d6cefbb4d2930970c54770abcff62ef9 Mon Sep 17 00:00:00 2001 From: Damien Pretet Date: Tue, 5 Sep 2023 22:01:26 +0200 Subject: [PATCH] Populate IO / parameters chapter --- doc/ios_params.md | 183 +++++++++++++++++++++++++++++++- doc/project_mgt_hw.md | 68 ++++++------ rtl/friscv_processing.sv | 2 - rtl/friscv_rv32i_core.sv | 5 +- rtl/friscv_rv32i_platform.sv | 3 - test/common/friscv_testbench.sv | 4 - 6 files changed, 221 insertions(+), 44 deletions(-) diff --git a/doc/ios_params.md b/doc/ios_params.md index 053eafc..09a599b 100644 --- a/doc/ios_params.md +++ b/doc/ios_params.md @@ -1 +1,182 @@ -# Inputs / Outputs / Parameters +# Parameters + +## Core + +- ILEN: + - length of an instruction + - 32 (bits) + - can't be changed + - default: 32 + +- XLEN: + - data bus width + - 32 or 64 (bits) + - architecture dependent + - default: 32 + +- BOOT_ADDR: + - address the core will boot from + - any value (byte) + - platform dependent + - default: 0 + +- INST_OSTDREQ_NUM + - number of outstanding instruction read requests the control unit can issue + - any value from 1 + - default: 8 + +- DATA_OSTDREQ_NUM + - number of outstanding read/write requests the load/store module can issue + - any value from 1 + - default: 8 + +- HART_ID + - RISCV core identifier (MHARTID CSR) + - any value from 0 + - default: 0 + +- RV32E + - use only 16 registers (RV32E extension activation, MISA CSR [4]) + - 0 or 1 + - default: 0, 32 registers available + +- F_EXTENSION + - activate floating point extension (MISA CSR [5]) + - 0 or 1 + - default: 0, no floating point support + +- M_EXTENSION + - activate multiply/divide extension (MISA CSR [12]) + - 0 or 1 + - default: 0, no multiply/divide support + +- PROCESSING_BUS_PIPELINE + - insert a pipeline on the processing unit input bus + - 0 or 1 + - default: 0, no pipeline + +- AXI_ADDR_W + - width of any AXI address bus + - any value (bits) + - default: XLEN, 32 bits + +- AXI_ID_W + - width of any AXI ID bus + - any value greater than 1 (bits) + - default: 8 bits 
+ +- AXI_IMEM_W + - width of the AXI data bus for instructions + - 32 (bits) + - default: XLEN, 32 bits + +- AXI_DMEM_W + - width of the AXI data bus for data + - 32 or 64 (bits) + - default: XLEN, 32 bits + +- AXI_IMEM_MASK + - mask applied to instruction AXI ID bus to identify it + - any value matching AXI_ID_W + - default: 0x10 + +- AXI_DMEM_MASK + - mask applied to data AXI ID bus to identify it + - any value matching AXI_ID_W + - default: 0x20 + +- CACHE_EN + - enable both instruction and data cache stages + - 0 or 1 + - default: 0, disabled + +- ICACHE_PREFETCH_EN + - enable next instruction prefetch on continuous address parsing + - 0 or 1 + - default: 0, disabled + +- ICACHE_BLOCK_W + - width of a cache block, holding a whole number of instructions + - any multiple of ILEN (bits) + - default: 4*ILEN + +- ICACHE_DEPTH + - number of cache blocks + - any value greater than 1 + - default: 512 + +- DCACHE_PREFETCH_EN + - enable next data prefetch on continuous address parsing + - 0 or 1 + - default: 0, disabled + +- DCACHE_BLOCK_W + - width of a cache block, holding a whole number of data words + - any multiple of XLEN (bits) + - default: 4*XLEN + +- DCACHE_DEPTH + - number of cache blocks + - any value greater than 1 + - default: 512 + +- IO_MAP_NB + - number of I/O (device) memory maps (to bypass data cache fetch) + - any value equal to or greater than 0 + - default: 0, no device mapped + +- IO_MAP + - Start / End address of the IO (device) memory map + - any value mapped in the memory, organized like END-ADDR_START-ADDR, matching + AXI_ADDR_W*2*IO_MAP_NB (bits) + - default: 64'h001000FF_00100000 + +## Platform + +All parameters listed in the [core](#core) section apply here + +# Inputs / Outputs + +## Core + +- aclk + - the main clock of the core + - input + +- aresetn + - the main asynchronous active low reset + - input + - don't use it if already using srst + +- srst + - the main synchronous active high reset + - input + - don't use it if already using aresetn + +- ext_irq + - external interrupt, from any hardware 
source + - input + +- sw_irq + - software interrupt, from any other hart or PLIC controller + - input + +- timer_irq + - timer interrupt, from CLINT controller + - input + +- status + - debug bus + - output + +- dbg_regs + - all the ISA registers + - output + +- imem_* + - AXI4-lite instruction bus (read channels only) + - input/output + +- dmem_* + - AXI4-lite data bus + - input/output diff --git a/doc/project_mgt_hw.md b/doc/project_mgt_hw.md index 2ab485f..b99aebb 100644 --- a/doc/project_mgt_hw.md +++ b/doc/project_mgt_hw.md @@ -2,16 +2,14 @@ - [ ] Revoir tous les paramètres de chaque instance et les documenter - [ ] Review readme files +- [ ] Rework logging by using SVLogger everywhere - [ ] Revoir la RAM AXI pour les temps de réponses write compliance et speed # BACKLOG -N.B. : -- Counters and timers should be reworked for multi hart architecture, and probably adapted - for platform specific configurations -- Any new feature and ISA should be carefully study to ensure a proper - exception and interrupt handling +N.B. : Any new feature and ISA should be carefully studied to ensure a proper + exception and interrupt handling Memory - [ ] Better manage ACACHE attribute @@ -22,17 +20,18 @@ Memory - [ ] IO map bufferable / non-bufferable - [ ] Make memory mapping of the core with: - [ ] Normal vs device - - [ ] Inst vs data zone for cacheability + - [ ] Inst vs data zone for cacheability / executability - [ ] Sharable for L2 cache - [ ] Support exception code for memory access error - [ ] Manage write response from cache or interco, don’t wait endpoint - [ ] Raise exception also from cache -Cache Stages: +Cache Stages + - [ ] AXI4 + Wrap mode for read - [ ] Support datapath adaptation from memory controller - - Narrow transfer support? - - Gather/merge multiple continuous transactions? + - [ ] Narrow transfer support? + - [ ] Gather/merge multiple continuous transactions? 
- [ ] Bien définir la politique write through no allocate - [ ] Write thru n’a pas besoin de n’a pas besoin d’eviction buffer https://stackoverflow.com/questions/23635284/what-is-the-difference-between-eviction-buffer-and-merging-store-buffer-on-arm-c - [ ] Renommer le write stage pour merging store buffer et essayer de merger les acces au besoin @@ -40,14 +39,17 @@ Cache Stages: - [ ] Write back policy permet de sauver de la BW mais rend la structure plus évoluée - [ ] New cache associativity (2 / 4 / 8 ways configurable) - [ ] OoO read: miss could be stacked and served later waiting for cache fill and continue reading the next address +- [ ] Fully concurrent read / write access (Issue #1) Misc. -- [ ] Create a HW platform + +- [ ] Create a HW test platform + - [ ] Cloud + - [ ] Analogue pocket - [ ] Add registers to configure the core in platform - [ ] Support completly a profile - [ ] 64 bits support -- [ ] Atomic operations -- [ ] Zicond +- [ ] Atomic operations for single core - [ ] Support privileged instructions, supervisor mode & user mode - voir les CSRs dans la privileged mode, implementer les compteurs par mode - https://danielmangum.com/posts/risc-v-bytes-privilege-levels/ @@ -60,17 +62,21 @@ Misc. 
- https://tomverbeure.github.io/2021/07/18/VexRiscv-OpenOCD-and-Traps.html - https://tomverbeure.github.io/2022/02/20/GDBWave-Post-Simulation-RISCV-SW-Debugging.html - https://github.com/BLangOS/VexRiscV_with_HW-GDB_Server -- [ ] Super scaler arch - - https://en.m.wikipedia.org/wiki/Instruction-level_parallelism - - https://en.m.wikipedia.org/wiki/Data_dependency - - https://www.youtube.com/channel/UCPSsA8oxlSBjidJsSPdpjsQ/videos -- [ ] Support PLIC (only for multi-core) +- [ ] Zicond extension - [ ] Support CLIC controller - [ ] Random peripheral - [ ] UART: Support 9/10 bits & parity +- [ ] AXI platform +- [ ] Multi-core platform: + - [ ] Counters and timers should be reworked + - [ ] Nb core configurable + - [ ] Support PLIC + - [ ] Extended atomic operation support + - [ ] Implement a L2 cache stage AXI4 Infrastructure + - [ ] Detect address collision in memfy for better performance - support concurrent r/w in dCache - merge memfy_opt for memfy core udpate @@ -82,13 +88,13 @@ AXI4 Infrastructure - [ ] Rework IO APB interconnect - Fix IO subsystem misrouted - Fix IO subsystem bridge -- [ ] Implement a L2 cache stage - [ ] Out of order support in AXI (memfy if not using cache) -Control: -- [ ] Detect IO requests to forward info for FENCE execution +Control + - [ ] Preload jal even if processing is busy +- [ ] Detect IO requests to forward info for FENCE execution - [ ] Move LUI into processing to prepare future extension support - [ ] Read ASM to be sure its used for processing and not control - [ ] Benchmark waveform doesn’t reveal high usage @@ -101,13 +107,12 @@ Control: - [ ] Rewind pipeline (L0 local cache) -Processing: +Processing - -- [ ] Parameter in processing to deactivate hazard detection, save logic and measure gain +- [ ] Processing: parameter to deactivate hazard detection, save logic and measure gain - [ ] Memfy: - If not ready, and request present, the FSM can’t drive further data - - Manage RRESP/BRESP in an exception bus + - Manage RRESP/BRESP 
in the exception bus - [ ] Support F extension - [ ] Division - [ ] Save bandwidth by removing dead cycles @@ -122,10 +127,9 @@ Processing: Verification/Validation: - [ ] Drop lxt2 waveform -- [ ] SV Testbench: Assert flush without ARVALID - [ ] Create app per benchmark -- [ ] Testcase C ASM stress de cache -- [ ] Print des tests qui ne marchent pas dans le bash et svut_h.sv pour verilator +- [ ] Testcase C ASM cache stress +- [ ] Print des tests qui ne marchent pas dans le bash - [ ] Update synthesis flow - [ ] Standard cells library for Yosys - [ ] https://github.com/dpretet/ascend-freepdk45/tree/master/lib @@ -138,13 +142,14 @@ Verification/Validation: - [ ] Support cache disable in testbench - [ ] Error Logger Interface - [ ] Shared bus des CSRs, privilege mode, event, … - - [ ] stream the event like a write memory error + - [ ] Stream the event like a write memory error - [ ] log error in a file - [ ] Support GDB: https://tomverbeure.github.io/2021/07/18/VexRiscv-OpenOCD-and-Traps.html - [ ] Update RISCV testsuite sources - [ ] SV Testbench: be able to assert or not a flush req along a new request on the same cycle Hardware Test: + - [ ] Support LiteX: https://github.com/litex-hub/litex-boards, https://pcotret.gitlab.io/blog/processor_in_litex/ - [ ] Azure: https://www.xilinx.com/products/boards-and-kits/alveo/cloud-solutions/microsoft-azure.html - [ ] AWS: https://www.xilinx.com/products/design-tools/acceleration-zone/aws.html @@ -154,7 +159,10 @@ Hardware Test: # Ideas / Applications - [ ] Next CPU architecture: - - super scalar architecture + - Super scalar architecture + - https://en.m.wikipedia.org/wiki/Instruction-level_parallelism + - https://en.m.wikipedia.org/wiki/Data_dependency + - https://www.youtube.com/channel/UCPSsA8oxlSBjidJsSPdpjsQ/videos - SIMD architecture - Vector architecture - Application to GPGPU area @@ -166,7 +174,7 @@ Hardware Test: # DONE -- [X] Mesure et amélioration des performances +- [X] v1.5.0: Mesure et amélioration des 
performances - [X] Print et save des registres CSRs pour chaque test, garde la trace des performances dans Git - [X] IP point de mesure des différents bus en bandwidth - [X] CPI measure in benchmark diff --git a/rtl/friscv_processing.sv b/rtl/friscv_processing.sv index 62aa36a..8c7e051 100644 --- a/rtl/friscv_processing.sv +++ b/rtl/friscv_processing.sv @@ -31,8 +31,6 @@ module friscv_processing parameter MAX_UNIT = 4, // Insert a pipeline on instruction bus coming from the controller parameter INST_BUS_PIPELINE = 0, - // Internal FIFO depth, buffering the instruction to execute (UNUSED) - parameter INST_QUEUE_DEPTH = 0, // Number of outstanding requests used by the LOAD/STORE unit parameter DATA_OSTDREQ_NUM = 8, // Reorder read completion internally in Memfy, not in dCache diff --git a/rtl/friscv_rv32i_core.sv b/rtl/friscv_rv32i_core.sv index e9e616b..48c77bd 100644 --- a/rtl/friscv_rv32i_core.sv +++ b/rtl/friscv_rv32i_core.sv @@ -42,8 +42,6 @@ module friscv_rv32i_core parameter M_EXTENSION = 0, // Insert a pipeline on instruction bus coming from the controller parameter PROCESSING_BUS_PIPELINE = 0, - // FIFO depth of processing unit, buffering the instruction to execute - parameter PROCESSING_QUEUE_DEPTH = 0, //////////////////////////////////////////////////////////////////////// // AXI4 / AXI4-lite interface setup @@ -71,7 +69,7 @@ module friscv_rv32i_core parameter ICACHE_PREFETCH_EN = 0, // Block width defining only the data payload, in bits, must an // integer multiple of XLEN (power of two) - parameter ICACHE_BLOCK_W = XLEN*4, + parameter ICACHE_BLOCK_W = ILEN*4, // Number of blocks in the cache parameter ICACHE_DEPTH = 512, @@ -595,7 +593,6 @@ module friscv_rv32i_core .MAX_UNIT (MAX_ALU_UNIT), .DATA_OSTDREQ_NUM (DATA_OSTDREQ_NUM), .INST_BUS_PIPELINE (PROCESSING_BUS_PIPELINE), - .INST_QUEUE_DEPTH (PROCESSING_QUEUE_DEPTH), .IO_MAP_NB (IO_MAP_NB), .IO_MAP (IO_MAP) ) diff --git a/rtl/friscv_rv32i_platform.sv b/rtl/friscv_rv32i_platform.sv index 
6a1a2e8..ffb8449 100644 --- a/rtl/friscv_rv32i_platform.sv +++ b/rtl/friscv_rv32i_platform.sv @@ -43,8 +43,6 @@ module friscv_rv32i_platform parameter M_EXTENSION = 0, // Insert a pipeline on instruction bus coming from the controller parameter PROCESSING_BUS_PIPELINE = 0, - // FIFO depth of processing unit, buffering the instruction to execute - parameter PROCESSING_QUEUE_DEPTH = 0, //////////////////////////////////////////////////////////////////////// // AXI4 / AXI4-lite interface setup @@ -319,7 +317,6 @@ module friscv_rv32i_platform .XLEN (XLEN), .M_EXTENSION (M_EXTENSION), .F_EXTENSION (F_EXTENSION), - .PROCESSING_QUEUE_DEPTH (PROCESSING_QUEUE_DEPTH), .PROCESSING_BUS_PIPELINE (PROCESSING_BUS_PIPELINE), .BOOT_ADDR (BOOT_ADDR), .INST_OSTDREQ_NUM (INST_OSTDREQ_NUM), diff --git a/test/common/friscv_testbench.sv b/test/common/friscv_testbench.sv index 041ebe1..ccc0d44 100644 --- a/test/common/friscv_testbench.sv +++ b/test/common/friscv_testbench.sv @@ -99,8 +99,6 @@ module friscv_testbench( parameter M_EXTENSION = 1; // Insert a pipeline on instruction bus coming from the controller parameter PROCESSING_BUS_PIPELINE = 1; - // FIFO depth of processing unit, buffering the instruction to execute - parameter PROCESSING_QUEUE_DEPTH = 0; // Address buses width parameter AXI_ADDR_W = XLEN; @@ -293,7 +291,6 @@ module friscv_testbench( .RV32E (RV32E), .M_EXTENSION (M_EXTENSION), .F_EXTENSION (F_EXTENSION), - .PROCESSING_QUEUE_DEPTH (PROCESSING_QUEUE_DEPTH), .PROCESSING_BUS_PIPELINE (PROCESSING_BUS_PIPELINE), .AXI_ADDR_W (AXI_ADDR_W), .AXI_ID_W (AXI_ID_W), @@ -480,7 +477,6 @@ module friscv_testbench( .RV32E (RV32E), .M_EXTENSION (M_EXTENSION), .F_EXTENSION (F_EXTENSION), - .PROCESSING_QUEUE_DEPTH (PROCESSING_QUEUE_DEPTH), .PROCESSING_BUS_PIPELINE (PROCESSING_BUS_PIPELINE), .AXI_ADDR_W (AXI_ADDR_W), .AXI_ID_W (AXI_ID_W),