Skip to content

Commit ae23431

Browse files
author
Amir Kiamarzi amirhossein.kiamarz2@unibo.it
committed
Fix strided load and store instruction
1 parent 70bc183 commit ae23431

File tree

5 files changed

+374
-4
lines changed

5 files changed

+374
-4
lines changed

hw/ip/spatz/src/spatz.sv

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -215,6 +215,7 @@ module spatz import spatz_pkg::*; import rvv_pkg::*; import fpnew_pkg::*; #(
215215
logic [NrReadPorts-1:0] vrf_re;
216216
vrf_data_t [NrReadPorts-1:0] vrf_rdata;
217217
logic [NrReadPorts-1:0] vrf_rvalid;
218+
logic [NrWritePorts-1:0] vrf_wvalid_vlsu;
218219

219220
spatz_vrf #(
220221
.NrReadPorts (NrReadPorts ),
@@ -229,6 +230,8 @@ module spatz import spatz_pkg::*; import rvv_pkg::*; import fpnew_pkg::*; #(
229230
.we_i (vrf_we ),
230231
.wbe_i (vrf_wbe_buf ),
231232
.wvalid_o (vrf_wvalid ),
233+
.wvalid_vlsu_o (vrf_wvalid_vlsu),
234+
232235
`ifdef BUF_FPU
233236
.fpu_buf_usage_i (vfu_buf_usage ),
234237
`endif
@@ -424,7 +427,8 @@ module spatz import spatz_pkg::*; import rvv_pkg::*; import fpnew_pkg::*; #(
424427
.vrf_wdata_o (vrf_wdata[VLSU_VD_WD] ),
425428
.vrf_we_o (sb_we[VLSU_VD_WD] ),
426429
.vrf_wbe_o (vrf_wbe[VLSU_VD_WD] ),
427-
.vrf_wvalid_i (vrf_wvalid[VLSU_VD_WD] ),
430+
// .vrf_wvalid_i (vrf_wvalid[VLSU_VD_WD] ),
431+
.vrf_wvalid_i (vrf_wvalid_vlsu[VLSU_VD_WD] ),//vrf_wvalid[1]
428432
.vrf_raddr_o (vrf_raddr[VLSU_VD_RD:VLSU_VS2_RD] ),
429433
.vrf_re_o (sb_re[VLSU_VD_RD:VLSU_VS2_RD] ),
430434
.vrf_rdata_i (vrf_rdata[VLSU_VD_RD:VLSU_VS2_RD] ),

hw/ip/spatz/src/spatz_vrf.sv

Lines changed: 30 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,8 @@ module spatz_vrf
2222
input logic [NrWritePorts-1:0] we_i,
2323
input vrf_be_t [NrWritePorts-1:0] wbe_i,
2424
output logic [NrWritePorts-1:0] wvalid_o,
25+
output logic [NrWritePorts-1:0] wvalid_vlsu_o,
26+
output logic [NrWritePorts-1:0] wbe_o,
2527
`ifdef BUF_FPU
2628
// Signal to track if result can be buffered or not
2729
input logic [$clog2(FpuBufDepth)-1:0] fpu_buf_usage_i,
@@ -90,13 +92,32 @@ module spatz_vrf
9092
end
9193
end: gen_write_request
9294

95+
96+
vrf_be_t [NrVRFBanks-1:0] wbe_d,wbe_q;
97+
98+
always_ff @(posedge clk_i or negedge rst_ni) begin
99+
if(~rst_ni) begin
100+
wbe_q <= '0;
101+
end else begin
102+
for (int bank = 0; bank < NrVRFBanks; bank++) begin
103+
if(!write_request[bank][VLSU_VD_WD])begin
104+
wbe_q[bank] <= '0;
105+
end else begin
106+
wbe_q[bank] <= wbe_d[bank];
107+
end
108+
end
109+
end
110+
end
111+
112+
93113
always_comb begin : proc_write
94114
waddr = '0;
95115
wdata = '0;
96116
we = '0;
97117
wbe = '0;
98118
wvalid_o = '0;
99-
119+
wvalid_vlsu_o = '0;
120+
wbe_d = wbe_q;
100121
// For each bank, we have a priority based access scheme. First priority always has the VFU,
101122
// second priority has the LSU, and third priority has the slide unit.
102123
for (int unsigned bank = 0; bank < NrVRFBanks; bank++) begin
@@ -123,7 +144,10 @@ module spatz_vrf
123144
wdata[bank] = wdata_i[VFU_VD_WD];
124145
we[bank] = 1'b1;
125146
wbe[bank] = wbe_i[VFU_VD_WD];
126-
wvalid_o[VFU_VD_WD] = 1'b1;
147+
// wvalid_o[VFU_VD_WD] = 1'b1;
148+
wbe_d[bank] = wbe_q[bank]|wbe_i[VLSU_VD_WD];
149+
wvalid_o[VLSU_VD_WD] = &(wbe_q[bank]|wbe_i[VLSU_VD_WD]);//1
150+
wvalid_vlsu_o[VLSU_VD_WD] = 1'b1;//1
127151
end else if (write_request[bank][VSLDU_VD_WD]) begin
128152
waddr[bank] = f_vreg(waddr_i[VSLDU_VD_WD]);
129153
wdata[bank] = wdata_i[VSLDU_VD_WD];
@@ -144,7 +168,10 @@ module spatz_vrf
144168
wdata[bank] = wdata_i[VLSU_VD_WD];
145169
we[bank] = 1'b1;
146170
wbe[bank] = wbe_i[VLSU_VD_WD];
147-
wvalid_o[VLSU_VD_WD] = 1'b1;
171+
// wvalid_o[VLSU_VD_WD] = 1'b1;
172+
wbe_d[bank] = wbe_q[bank]|wbe_i[VLSU_VD_WD];
173+
wvalid_o[VLSU_VD_WD] = &(wbe_q[bank]|wbe_i[VLSU_VD_WD]);//1
174+
wvalid_vlsu_o[VLSU_VD_WD] = 1'b1;//1
148175
end else if (write_request[bank][VSLDU_VD_WD]) begin
149176
waddr[bank] = f_vreg(waddr_i[VSLDU_VD_WD]);
150177
wdata[bank] = wdata_i[VSLDU_VD_WD];

sw/riscvTests/CMakeLists.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -128,3 +128,6 @@ add_snitch_test(vfcvt isa/rv64uv/vfcvt.c)
128128
add_snitch_test(vfncvt isa/rv64uv/vfncvt.c)
129129

130130
add_snitch_test(vfmv isa/rv64uv/vfmv.c)
131+
132+
add_snitch_test(vls isa/rv64uv/vls.c)
133+
add_snitch_test(vss isa/rv64uv/vss.c)

sw/riscvTests/isa/rv64uv/vls.c

Lines changed: 190 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,190 @@
1+
// Copyright 2021 ETH Zurich and University of Bologna.
2+
// Solderpad Hardware License, Version 0.51, see LICENSE for details.
3+
// SPDX-License-Identifier: SHL-0.51
4+
//
5+
// Author: Matteo Perotti <mperotti@iis.ee.ethz.ch>
6+
7+
#include "vector_macros.h"
8+
9+
// Positive-stride tests
10+
// Strided load of 8-bit elements with a positive stride.
// Issues vlse8.v into v1 with the stride (3) passed in a register, then
// compares v1 against INP1[0], INP1[3], INP1[6] and INP1[9]
// (0x9f, 0x20, 0x05, 0xaa), i.e. the stride is counted in bytes.
// NOTE(review): VSET and VCMP_U8 come from vector_macros.h, which is not
// visible here; presumably VSET(4, e8, m1) selects vl=4, SEW=8, LMUL=1 and the
// first VCMP_U8 argument is the test-case id -- confirm against the header.
void TEST_CASE1(void) {
11+
VSET(4, e8, m1);
12+
volatile uint8_t INP1[] = {0x9f, 0xe4, 0x19, 0x20, 0x8f, 0x2e, 0x05, 0xe0,
13+
0xf9, 0xaa, 0x71, 0xf0, 0xc3, 0x94, 0xbb, 0xd3};
14+
uint64_t stride = 3;
15+
asm volatile("vlse8.v v1, (%0), %1" ::"r"(INP1), "r"(stride));
16+
VCMP_U8(1, v1, 0x9f, 0x20, 0x05, 0xaa);
17+
}
18+
19+
// Strided load of 16-bit elements with a positive stride.
// A stride of 4 bytes skips every other uint16_t, so the expected result is
// INP1[0], INP1[2], INP1[4], INP1[6].
// NOTE(review): VSET / VCMP_U16 semantics are assumed from their names and
// arguments (vector_macros.h is not visible here) -- confirm.
void TEST_CASE2(void) {
20+
VSET(4, e16, m1);
21+
volatile uint16_t INP1[] = {0x9fe4, 0x1920, 0x8f2e, 0x05e0,
22+
0xf9aa, 0x71f0, 0xc394, 0xbbd3};
23+
uint64_t stride = 4;
24+
asm volatile("vlse16.v v1, (%0), %1" ::"r"(INP1), "r"(stride));
25+
VCMP_U16(2, v1, 0x9fe4, 0x8f2e, 0xf9aa, 0xc394);
26+
}
27+
28+
// Strided load of 32-bit elements with a positive stride.
// A stride of 8 bytes skips every other uint32_t, so the expected result is
// INP1[0], INP1[2], INP1[4], INP1[6].
// NOTE(review): VSET / VCMP_U32 semantics are assumed from their names and
// arguments (vector_macros.h is not visible here) -- confirm.
void TEST_CASE3(void) {
29+
VSET(4, e32, m1);
30+
volatile uint32_t INP1[] = {0x9fe41920, 0x8f2e05e0, 0xf9aa71f0, 0xc394bbd3,
31+
0xa11a9384, 0xa7163840, 0x99991348, 0xa9f38cd1};
32+
uint64_t stride = 8;
33+
asm volatile("vlse32.v v1, (%0), %1" ::"r"(INP1), "r"(stride));
34+
VCMP_U32(3, v1, 0x9fe41920, 0xf9aa71f0, 0xa11a9384, 0x99991348);
35+
}
36+
37+
// Strided load of 64-bit elements where the stride equals the element size.
// With a stride of 8 bytes the load touches consecutive uint64_t entries, so
// the expected result is simply INP1[0..3] in order.
// NOTE(review): VSET / VCMP_U64 semantics are assumed from their names and
// arguments (vector_macros.h is not visible here) -- confirm.
void TEST_CASE4(void) {
38+
VSET(4, e64, m1);
39+
volatile uint64_t INP1[] = {0x9fe419208f2e05e0, 0xf9aa71f0c394bbd3,
40+
0xa11a9384a7163840, 0x99991348a9f38cd1};
41+
uint64_t stride = 8;
42+
asm volatile("vlse64.v v1, (%0), %1" ::"r"(INP1), "r"(stride));
43+
VCMP_U64(4, v1, 0x9fe419208f2e05e0, 0xf9aa71f0c394bbd3, 0xa11a9384a7163840,
44+
0x99991348a9f38cd1);
45+
}
46+
47+
// Zero-stride tests
48+
// The implementation must perform all the memory accesses
49+
// Zero-stride load with the stride supplied in a general-purpose register.
// The stride value is 0, so all 16 requested elements are expected to equal
// the single byte INP1[0] (0x9f).
// NOTE(review): VSET / VCMP_U8 semantics are assumed from their names and
// arguments (vector_macros.h is not visible here) -- confirm.
void TEST_CASE5(void) {
50+
VSET(16, e8, m1);
51+
volatile uint8_t INP1[] = {0x9f};
52+
uint64_t stride = 0;
53+
asm volatile("vlse8.v v1, (%0), %1" ::"r"(INP1), "r"(stride));
54+
VCMP_U8(5, v1, 0x9f, 0x9f, 0x9f, 0x9f, 0x9f, 0x9f, 0x9f, 0x9f, 0x9f, 0x9f,
55+
0x9f, 0x9f, 0x9f, 0x9f, 0x9f, 0x9f);
56+
}
57+
58+
// The implementation can also perform fewer accesses
59+
// Zero-stride load with the stride operand hard-coded as register x0.
// Unlike the register-supplied zero stride, the stride operand here is x0
// written directly in the assembly string. All 16 elements are expected to
// equal INP1[0] (0x9f).
// NOTE(review): VSET / VCMP_U8 semantics are assumed from their names and
// arguments (vector_macros.h is not visible here) -- confirm.
void TEST_CASE6(void) {
60+
VSET(16, e8, m1);
61+
volatile uint8_t INP1[] = {0x9f};
62+
asm volatile("vlse8.v v1, (%0), x0" ::"r"(INP1));
63+
VCMP_U8(6, v1, 0x9f, 0x9f, 0x9f, 0x9f, 0x9f, 0x9f, 0x9f, 0x9f, 0x9f, 0x9f,
64+
0x9f, 0x9f, 0x9f, 0x9f, 0x9f, 0x9f);
65+
}
66+
67+
// Different LMUL
68+
// Zero-stride (x0) load of 64-bit elements using the m2 setting.
// The destination is v2 and 8 elements are requested; every element is
// expected to equal the single entry INP1[0].
// NOTE(review): presumably VSET(8, e64, m2) selects vl=8, SEW=64, LMUL=2 so
// the result spans a two-register group starting at v2 -- confirm against
// vector_macros.h, which is not visible here.
void TEST_CASE7(void) {
69+
VSET(8, e64, m2);
70+
volatile uint64_t INP1[] = {0x9fa831c7a11a9384};
71+
asm volatile("vlse64.v v2, (%0), x0" ::"r"(INP1));
72+
VCMP_U64(7, v2, 0x9fa831c7a11a9384, 0x9fa831c7a11a9384, 0x9fa831c7a11a9384,
73+
0x9fa831c7a11a9384, 0x9fa831c7a11a9384, 0x9fa831c7a11a9384,
74+
0x9fa831c7a11a9384, 0x9fa831c7a11a9384);
75+
}
76+
77+
// Others
78+
// Negative-stride test
79+
// Strided load of 16-bit elements with a negative stride.
// The base address is the last array entry (&INP1[7]) and the stride is -4
// bytes, so the load walks backwards two uint16_t at a time; the expected
// result is INP1[7], INP1[5], INP1[3], INP1[1].
// NOTE(review): VSET / VCMP_U16 semantics are assumed from their names and
// arguments (vector_macros.h is not visible here) -- confirm.
void TEST_CASE8(void) {
80+
VSET(4, e16, m1);
81+
volatile uint16_t INP1[] = {0x9fe4, 0x1920, 0x8f2e, 0x05e0,
82+
0xf9aa, 0x71f0, 0xc394, 0xbbd3};
83+
// -4 converted to uint64_t keeps the two's-complement bit pattern of -4.
uint64_t stride = -4;
84+
asm volatile("vlse16.v v1, (%0), %1" ::"r"(&INP1[7]), "r"(stride));
85+
VCMP_U16(8, v1, 0xbbd3, 0x71f0, 0x05e0, 0x1920);
86+
}
87+
88+
// Stride greater than default Ara AXI width == 128-bit (4 lanes)
89+
// Strided load of 64-bit elements with a large stride.
// The stride is 40 bytes, i.e. five uint64_t entries, and two elements are
// requested; the expected result is INP1[0] and INP1[5].
// NOTE(review): VSET / VCMP_U64 semantics are assumed from their names and
// arguments (vector_macros.h is not visible here) -- confirm.
void TEST_CASE9(void) {
90+
VSET(2, e64, m1);
91+
volatile uint64_t INP1[] = {0x99991348a9f38cd1, 0x9fa831c7a11a9384,
92+
0x9fa831c7a11a9384, 0x9fa831c7a11a9384,
93+
0x9fa831c7a11a9384, 0x01015ac1309bb678};
94+
uint64_t stride = 40;
95+
asm volatile("vlse64.v v1, (%0), %1" ::"r"(INP1), "r"(stride));
96+
VCMP_U64(9, v1, 0x99991348a9f38cd1, 0x01015ac1309bb678);
97+
}
98+
99+
// Fill Ara internal Load Buffer
100+
// Strided load of eight 64-bit elements from a 16-entry array.
// The stride is 16 bytes (two uint64_t entries), so the expected result is
// the even-indexed entries INP1[0], INP1[2], ..., INP1[14].
// NOTE(review): VSET / VCMP_U64 semantics are assumed from their names and
// arguments (vector_macros.h is not visible here) -- confirm.
void TEST_CASE10(void) {
101+
VSET(8, e64, m1);
102+
volatile uint64_t INP1[] = {
103+
0x9fe419208f2e05e0, 0xf9aa71f0c394bbd3, 0xa11a9384a7163840,
104+
0x99991348a9f38cd1, 0x9fa831c7a11a9384, 0x3819759853987548,
105+
0x1893179501093489, 0x81937598aa819388, 0x1874754791888188,
106+
0x3eeeeeeee33111ae, 0x9013930148815808, 0xab8b914891484891,
107+
0x9031850931584902, 0x3189759837598759, 0x8319599991911111,
108+
0x8913984898951989};
109+
uint64_t stride = 16;
110+
asm volatile("vlse64.v v1, (%0), %1" ::"r"(INP1), "r"(stride));
111+
VCMP_U64(10, v1, 0x9fe419208f2e05e0, 0xa11a9384a7163840, 0x9fa831c7a11a9384,
112+
0x1893179501093489, 0x1874754791888188, 0x9013930148815808,
113+
0x9031850931584902, 0x8319599991911111);
114+
}
115+
116+
// Masked stride loads
117+
// Masked strided load of 8-bit elements (the instruction carries v0.t).
// Uses a 3-byte stride. The mask byte 0xAA has bits 1 and 3 set within the
// first four positions, so the expected result holds loaded data only at
// elements 1 and 3 (INP1[3] = 0x20, INP1[9] = 0xaa) and 0x00 at elements 0
// and 2.
// NOTE(review): presumably VLOAD_8(v0, 0xAA) writes the mask byte into v0 and
// VCLEAR(v1) zeroes v1, so the inactive elements read back as 0 -- confirm
// against vector_macros.h, which is not visible here.
// The call to this test is commented out in main() in this file.
void TEST_CASE11(void) {
118+
VSET(4, e8, m1);
119+
volatile uint8_t INP1[] = {0x9f, 0xe4, 0x19, 0x20, 0x8f, 0x2e, 0x05, 0xe0,
120+
0xf9, 0xaa, 0x71, 0xf0, 0xc3, 0x94, 0xbb, 0xd3};
121+
uint64_t stride = 3;
122+
VLOAD_8(v0, 0xAA);
123+
VCLEAR(v1);
124+
asm volatile("vlse8.v v1, (%0), %1, v0.t" ::"r"(INP1), "r"(stride));
125+
VCMP_U8(11, v1, 0x00, 0x20, 0x00, 0xaa);
126+
}
127+
128+
// Masked strided load of 16-bit elements (the instruction carries v0.t).
// Uses a 4-byte stride with mask byte 0xAA: the expected result holds loaded
// data at elements 1 and 3 (INP1[2] = 0x8f2e, INP1[6] = 0xc394) and 0 at
// elements 0 and 2.
// NOTE(review): presumably VLOAD_8(v0, 0xAA) writes the mask byte into v0 and
// VCLEAR(v1) zeroes v1, so the inactive elements read back as 0 -- confirm
// against vector_macros.h, which is not visible here.
// The call to this test is commented out in main() in this file.
void TEST_CASE12(void) {
129+
VSET(4, e16, m1);
130+
volatile uint16_t INP1[] = {0x9fe4, 0x1920, 0x8f2e, 0x05e0,
131+
0xf9aa, 0x71f0, 0xc394, 0xbbd3};
132+
uint64_t stride = 4;
133+
VLOAD_8(v0, 0xAA);
134+
VCLEAR(v1);
135+
asm volatile("vlse16.v v1, (%0), %1, v0.t" ::"r"(INP1), "r"(stride));
136+
VCMP_U16(12, v1, 0, 0x8f2e, 0, 0xc394);
137+
}
138+
139+
// Masked strided load of 32-bit elements (the instruction carries v0.t).
// Uses an 8-byte stride with mask byte 0xAA: the expected result holds loaded
// data at elements 1 and 3 (INP1[2] = 0xf9aa71f0, INP1[6] = 0x99991348) and 0
// at elements 0 and 2.
// NOTE(review): presumably VLOAD_8(v0, 0xAA) writes the mask byte into v0 and
// VCLEAR(v1) zeroes v1, so the inactive elements read back as 0 -- confirm
// against vector_macros.h, which is not visible here.
// The call to this test is commented out in main() in this file.
void TEST_CASE13(void) {
140+
VSET(4, e32, m1);
141+
volatile uint32_t INP1[] = {0x9fe41920, 0x8f2e05e0, 0xf9aa71f0, 0xc394bbd3,
142+
0xa11a9384, 0xa7163840, 0x99991348, 0xa9f38cd1};
143+
uint64_t stride = 8;
144+
VLOAD_8(v0, 0xAA);
145+
VCLEAR(v1);
146+
asm volatile("vlse32.v v1, (%0), %1, v0.t" ::"r"(INP1), "r"(stride));
147+
VCMP_U32(13, v1, 0, 0xf9aa71f0, 0, 0x99991348);
148+
}
149+
150+
// Masked strided load of eight 64-bit elements (the instruction carries v0.t).
// Uses a 16-byte stride with mask byte 0xAA (bits 1, 3, 5, 7 set): the
// expected result holds loaded data at the odd elements (INP1[2], INP1[6],
// INP1[10], INP1[14]) and 0 at the even elements.
// NOTE(review): presumably VLOAD_8(v0, 0xAA) writes the mask byte into v0 and
// VCLEAR(v1) zeroes v1, so the inactive elements read back as 0 -- confirm
// against vector_macros.h, which is not visible here.
// The call to this test is commented out in main() in this file.
void TEST_CASE14(void) {
151+
VSET(8, e64, m1);
152+
volatile uint64_t INP1[] = {
153+
0x9fe419208f2e05e0, 0xf9aa71f0c394bbd3, 0xa11a9384a7163840,
154+
0x99991348a9f38cd1, 0x9fa831c7a11a9384, 0x3819759853987548,
155+
0x1893179501093489, 0x81937598aa819388, 0x1874754791888188,
156+
0x3eeeeeeee33111ae, 0x9013930148815808, 0xab8b914891484891,
157+
0x9031850931584902, 0x3189759837598759, 0x8319599991911111,
158+
0x8913984898951989};
159+
uint64_t stride = 16;
160+
VLOAD_8(v0, 0xAA);
161+
VCLEAR(v1);
162+
asm volatile("vlse64.v v1, (%0), %1, v0.t" ::"r"(INP1), "r"(stride));
163+
VCMP_U64(14, v1, 0, 0xa11a9384a7163840, 0, 0x1893179501093489, 0,
164+
0x9013930148815808, 0, 0x8319599991911111);
165+
}
166+
167+
// Test entry point: runs the unmasked strided-load cases (1-10) in order.
// NOTE(review): INIT_CHECK(), enable_vec() and EXIT_CHECK() are defined
// outside this file; presumably they set up the result bookkeeping, enable the
// vector unit, and report/return the final status -- confirm. There is no
// explicit return statement here, so EXIT_CHECK() is assumed to provide it.
int main(void) {
168+
INIT_CHECK();
169+
enable_vec();
170+
171+
TEST_CASE1();
172+
TEST_CASE2();
173+
TEST_CASE3();
174+
TEST_CASE4();
175+
176+
TEST_CASE5();
177+
TEST_CASE6();
178+
TEST_CASE7();
179+
180+
TEST_CASE8();
181+
TEST_CASE9();
182+
TEST_CASE10();
183+
// The masked strided-load cases (11-14) are disabled below; the reason is not
// stated in this file -- TODO confirm whether masked loads are unsupported.
184+
// TEST_CASE11();
185+
// TEST_CASE12();
186+
// TEST_CASE13();
187+
// TEST_CASE14();
188+
189+
EXIT_CHECK();
190+
}

0 commit comments

Comments
 (0)