Skip to content

Commit 25b8730

Browse files
add legacy code
1 parent 65bd047 commit 25b8730

File tree

15 files changed

+800
-107
lines changed

15 files changed

+800
-107
lines changed

docs/info.md

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,12 +9,12 @@ You can also include images in this folder and reference them in the markdown. E
99

1010
## How it works
1111

12-
Explain how your project works
12+
The project is an AI chip inspired by Google's TPU. It multiplies matrices of 8-bit floating-point values. It does so by tiling into 2x2 blocks to fit on the chip's tiny area, so expect performance degradation compared to regular chips. However, the chip's I/O bandwidth will be fully utilized and saturated.
1313

1414
## How to test
1515

16-
Explain how to use your project
16+
Run the testbench with cocotb and [pyuvm](https://github.com/pyuvm/pyuvm), so that verification follows the UVM methodology of [IEEE-1800.2](https://blogs.sw.siemens.com/verificationhorizons/2015/07/30/uvm-the-next-ieee-standard-1800-2/).
1717

1818
## External hardware
1919

20-
List external hardware used in your project (e.g. PMOD, LED display, etc), if any
20+
Connect the PCB carrying the Tiny Tapeout chips (a.k.a. a Raspberry Pi) to a personal computer via USB.

info.yaml

Lines changed: 39 additions & 33 deletions
Original file line numberDiff line numberDiff line change
@@ -1,56 +1,62 @@
11
# Tiny Tapeout project information
22
project:
3-
title: "" # Project title
4-
author: "" # Your name
5-
discord: "" # Your discord username, for communication and automatically assigning you a Tapeout role (optional)
6-
description: "" # One line description of what your project does
3+
title: "Tensor Processing Unit v2" # Project title
4+
author: "William Zhang" # Your name
5+
discord: "walrus_23" # Your discord username, for communication and automatically assigning you a Tapeout role (optional)
6+
description: "multiplies fp8 matrices" # One line description of what your project does
77
language: "Verilog" # other examples include SystemVerilog, Amaranth, VHDL, etc
8-
clock_hz: 0 # Clock frequency in Hz (or 0 if not applicable)
8+
clock_hz: 50000000 # Clock frequency in Hz (or 0 if not applicable)
99

1010
# How many tiles your design occupies? A single tile is about 167x108 uM.
11-
tiles: "1x1" # Valid values: 1x1, 1x2, 2x2, 3x2, 4x2, 6x2 or 8x2
11+
tiles: "1x2" # Valid values: 1x1, 1x2, 2x2, 3x2, 4x2, 6x2 or 8x2
1212

1313
# Your top module name must start with "tt_um_". Make it unique by including your github username:
14-
top_module: "tt_um_example"
14+
top_module: "tt_um_tpu"
1515

1616
# List your project's source files here.
1717
# Source files must be in ./src and you must list each source file separately, one per line.
1818
# Don't forget to also update `PROJECT_SOURCES` in test/Makefile.
1919
source_files:
20-
- "project.v"
20+
- "tpu.v"
21+
- "systolic_array_2x2.v"
22+
- "PE.v"
23+
- "memory.v"
24+
- "control_unit.v"
25+
- "mmu_feeder.v"
26+
- "delay_cell.v"
2127

2228
# The pinout of your project. Leave unused pins blank. DO NOT delete or add any pins.
2329
# This section is for the datasheet/website. Use descriptive names (e.g., RX, TX, MOSI, SCL, SEG_A, etc.).
2430
pinout:
2531
# Inputs
26-
ui[0]: ""
27-
ui[1]: ""
28-
ui[2]: ""
29-
ui[3]: ""
30-
ui[4]: ""
31-
ui[5]: ""
32-
ui[6]: ""
33-
ui[7]: ""
32+
ui[0]: "IN0"
33+
ui[1]: "IN1"
34+
ui[2]: "IN2"
35+
ui[3]: "IN3"
36+
ui[4]: "IN4"
37+
ui[5]: "IN5"
38+
ui[6]: "IN6"
39+
ui[7]: "IN7"
3440

3541
# Outputs
36-
uo[0]: ""
37-
uo[1]: ""
38-
uo[2]: ""
39-
uo[3]: ""
40-
uo[4]: ""
41-
uo[5]: ""
42-
uo[6]: ""
43-
uo[7]: ""
42+
uo[0]: "OUT0"
43+
uo[1]: "OUT1"
44+
uo[2]: "OUT2"
45+
uo[3]: "OUT3"
46+
uo[4]: "OUT4"
47+
uo[5]: "OUT5"
48+
uo[6]: "OUT6"
49+
uo[7]: "OUT7"
4450

4551
# Bidirectional pins
46-
uio[0]: ""
47-
uio[1]: ""
48-
uio[2]: ""
49-
uio[3]: ""
50-
uio[4]: ""
51-
uio[5]: ""
52-
uio[6]: ""
53-
uio[7]: ""
52+
uio[0]: "LOAD_EN (input)"
53+
uio[1]: "TRANSPOSE (input)"
54+
uio[2]: "ACTIVATION (input)"
55+
uio[3]: "Unused"
56+
uio[4]: "Unused"
57+
uio[5]: "Unused"
58+
uio[6]: "Unused"
59+
uio[7]: "DONE (output)"
5460

5561
# Do not change!
56-
yaml_version: 6
62+
yaml_version: 6

src/PE.v

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,30 @@
1+
module PE #(
2+
parameter WIDTH = 8
3+
)(
4+
input wire clk,
5+
input wire rst,
6+
input wire clear,
7+
input wire signed [WIDTH-1:0] a_in,
8+
input wire signed [WIDTH-1:0] b_in,
9+
10+
output reg signed [WIDTH-1:0] a_out,
11+
output reg signed [WIDTH-1:0] b_out,
12+
13+
output reg signed [WIDTH*2-1:0] c_out
14+
);
15+
16+
always @(posedge clk) begin
17+
a_out <= a_in;
18+
b_out <= b_in;
19+
if (rst) begin
20+
c_out <= 0;
21+
a_out <= 0;
22+
b_out <= 0;
23+
end else if (clear) begin
24+
c_out <= a_in * b_in;
25+
end else begin
26+
c_out <= c_out + (a_in * b_in);
27+
end
28+
end
29+
30+
endmodule

src/config.json

Lines changed: 7 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -13,15 +13,15 @@
1313

1414
"//": "PL_TARGET_DENSITY_PCT - You can increase this if Global Placement fails with error GPL-0302.",
1515
"//": "Users have reported that values up to 80 worked well for them.",
16-
"PL_TARGET_DENSITY_PCT": 60,
16+
"PL_TARGET_DENSITY_PCT": 80,
1717

1818
"//": "CLOCK_PERIOD - Increase this in case you are getting setup time violations.",
1919
"//": "The value is in nanoseconds, so 20ns == 50MHz.",
2020
"CLOCK_PERIOD": 20,
2121

2222
"//": "Hold slack margin - Increase them in case you are getting hold violations.",
23-
"PL_RESIZER_HOLD_SLACK_MARGIN": 0.1,
24-
"GRT_RESIZER_HOLD_SLACK_MARGIN": 0.05,
23+
"PL_RESIZER_HOLD_SLACK_MARGIN": 0.5,
24+
"GRT_RESIZER_HOLD_SLACK_MARGIN": 0.4,
2525

2626
"//": "RUN_LINTER, LINTER_INCLUDE_PDK_MODELS - Disabling the linter is not recommended!",
2727
"RUN_LINTER": 1,
@@ -31,7 +31,7 @@
3131
"//": "https://tinytapeout.com/faq/#how-can-i-map-an-additional-external-clock-to-one-of-the-gpios",
3232
"CLOCK_PORT": "clk",
3333

34-
"//": "Configuration docs: https://librelane.readthedocs.io/en/latest/reference/configuration.html",
34+
"//": "Configuration docs: https://openlane.readthedocs.io/en/latest/reference/configuration.html",
3535

3636
"//": "!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!",
3737
"//": "!!! DO NOT CHANGE ANYTHING BELOW THIS POINT !!!",
@@ -63,12 +63,13 @@
6363
"//": "Clock",
6464
"RUN_CTS": 1,
6565

66-
"//": "Don't generate power rings",
66+
"//": "Don't use power rings or met5 layer",
6767
"FP_PDN_MULTILAYER": 0,
68+
"RT_MAX_LAYER": "met4",
6869

6970
"//": "MAGIC_DEF_LABELS may cause issues with LVS",
7071
"MAGIC_DEF_LABELS": 0,
7172

7273
"//": "Only export pin area in LEF (without any connected nets)",
7374
"MAGIC_WRITE_LEF_PINONLY": 1
74-
}
75+
}

src/control_unit.v

Lines changed: 129 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,129 @@
1+
`default_nettype none
2+
3+
module control_unit (
4+
input wire clk,
5+
input wire rst,
6+
input wire load_en,
7+
8+
// Memory interface
9+
output reg [2:0] mem_addr,
10+
11+
// MMU feeding control
12+
output reg mmu_en,
13+
output reg [2:0] mmu_cycle,
14+
15+
// For debugging
16+
output wire [1:0] state_out
17+
);
18+
19+
// STATES
20+
localparam [1:0] S_IDLE = 2'b00;
21+
localparam [1:0] S_LOAD_MATS = 2'b01;
22+
localparam [1:0] S_MMU_FEED_COMPUTE_WB = 2'b10;
23+
24+
reg [1:0] state, next_state;
25+
26+
assign state_out = state;
27+
28+
// Next state logic
29+
always @(*) begin
30+
next_state = state;
31+
32+
case (state)
33+
S_IDLE: begin
34+
if (load_en) begin
35+
next_state = S_LOAD_MATS;
36+
end
37+
end
38+
39+
S_LOAD_MATS: begin
40+
// All 8 elements loaded (4 for each matrix)
41+
if (mem_addr == 3'b111) begin
42+
next_state = S_MMU_FEED_COMPUTE_WB;
43+
end
44+
end
45+
46+
S_MMU_FEED_COMPUTE_WB:
47+
next_state = S_MMU_FEED_COMPUTE_WB;
48+
/* MMU CYCLE PATTERN
49+
* Cycle 0: Start feeding data (a00×b00 starts)
50+
* Cycle 1: First partial products computed, more data fed
51+
* Cycle 2: c00 ready (a00×b00 + a01×b10) and is output, while the next a00 is read in
52+
* Cycle 3: c01 and c10 ready simultaneously, read next a01:
53+
* c01 = a00×b01 + a01×b11
54+
* c10 = a10×b00 + a11×b10
55+
* Cycle 4: c11 ready (a10×b01 + a11×b11), read next a10
56+
* Cycle 5: All outputs remain valid, read next a11
57+
* Cycle 6: Keep outputting, read next b00
58+
* Cycle 7: Keep outputting, read next b01
59+
* Back to cycle 0: Start feeding data (a00×b00 starts), keep outputting, read next b10
60+
* Cycle 1: First partial products computed, keep outputting, read next b11
61+
* Cycle 2: c00 ready, begin output, take in next a00, pattern continues...
62+
*/
63+
64+
default: begin
65+
next_state = S_IDLE;
66+
end
67+
endcase
68+
end
69+
70+
// State Machine
71+
always @(posedge clk) begin
72+
if (rst) begin
73+
state <= S_IDLE;
74+
mmu_cycle <= 0;
75+
mmu_en <= 0;
76+
mem_addr <= 0;
77+
end else begin
78+
state <= next_state;
79+
mem_addr <= 0;
80+
case (state)
81+
S_IDLE: begin
82+
mmu_cycle <= 0;
83+
mmu_en <= 0;
84+
if (load_en) begin
85+
mem_addr <= mem_addr + 1;
86+
end
87+
end
88+
89+
S_LOAD_MATS: begin
90+
if (load_en) begin
91+
mem_addr <= mem_addr + 1;
92+
end
93+
94+
if (mem_addr == 3'b101) begin
95+
mmu_en <= 1;
96+
end else if (mem_addr >= 3'b110) begin
97+
mmu_en <= 1;
98+
mmu_cycle <= mmu_cycle + 1;
99+
if (mem_addr == 3'b111) begin
100+
mem_addr <= 0;
101+
end
102+
end
103+
end
104+
105+
S_MMU_FEED_COMPUTE_WB: begin
106+
// Now: the TPU will be forever stuck in this cycle...
107+
// Cycles through counter of 8...
108+
// In each cycle of 8 counts, it will: output 4 16-bit output elements the result of the previous matmul,
109+
// and take in 8 new 8-bit elements
110+
if (load_en) begin
111+
mem_addr <= mem_addr + 1;
112+
end
113+
mmu_cycle <= mmu_cycle + 1; // allow mmu_cycle to continue incrementing, permitting a pipeline flush
114+
if (mmu_cycle == 3'b111) begin
115+
mmu_cycle <= 0;
116+
end else if (mmu_cycle == 1) begin
117+
mem_addr <= 0;
118+
end
119+
end
120+
121+
default: begin
122+
mmu_cycle <= 0;
123+
mmu_en <= 0;
124+
end
125+
endcase
126+
end
127+
end
128+
129+
endmodule

src/delay_cell.v

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
module buffer (
2+
output wire X ,
3+
input wire A ,
4+
input wire VPWR,
5+
input wire VGND,
6+
input wire VPB ,
7+
input wire VNB
8+
);
9+
10+
assign X = A;
11+
12+
wire _unused;
13+
assign _unused = &{ 1'b0, VPWR, VGND, VPB, VNB };
14+
15+
endmodule

src/memory.v

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
`default_nettype none
2+
3+
module memory (
4+
input wire clk,
5+
input wire rst,
6+
input wire load_en,
7+
input wire [2:0] addr, // MSB selects matrix (0: weights, 1: inputs), [1:0] selects element
8+
input wire [7:0] in_data, // Fixed from reg to wire to match tt_um_tpu.v
9+
output wire [7:0] weight0, weight1, weight2, weight3, // 2x2 matrix A elements, 1 byte each
10+
output wire [7:0] input0, input1, input2, input3 // 2x2 matrix B elements, 1 byte each
11+
);
12+
13+
reg [7:0] sram [0:7]; // 8 locations: 0-3 for weights, 4-7 for inputs
14+
integer i;
15+
16+
always @(posedge clk) begin
17+
if (rst) begin
18+
for (i = 0; i < 8; i = i + 1) begin
19+
sram[i] <= 8'b0;
20+
end
21+
end else if (load_en) begin
22+
sram[addr] <= in_data;
23+
end
24+
end
25+
26+
// asynchronous read
27+
assign weight0 = sram[0];
28+
assign weight1 = sram[1];
29+
assign weight2 = sram[2];
30+
assign weight3 = sram[3];
31+
assign input0 = sram[4];
32+
assign input1 = sram[5];
33+
assign input2 = sram[6];
34+
assign input3 = sram[7];
35+
36+
endmodule

0 commit comments

Comments
 (0)