-
Notifications
You must be signed in to change notification settings - Fork 18
Expand file tree
/
Copy pathadd_basic.rs
More file actions
42 lines (39 loc) · 1.12 KB
/
add_basic.rs
File metadata and controls
42 lines (39 loc) · 1.12 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
/*
* SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
* SPDX-License-Identifier: Apache-2.0
*/
use cuda_async::device_operation::DeviceOperation;
use cutile::{
self, api,
tensor::Unpartition,
tile_kernel::{IntoDeviceOperationPartition, TensorDeviceOpToHostVec, TileKernel, Unzippable3},
};
use my_module::add_async as add;
#[cutile::module]
mod my_module {
use cutile::core::*;
#[cutile::entry(print_ir = true)]
fn add<const S: [i32; 1]>(
a: &Tensor<f32, { [-1] }>,
b: &Tensor<f32, { [-1] }>,
c: &mut Tensor<f32, S>,
) {
let pid = get_tile_block_id().0;
let tile_a = a.load_tile(const_shape!(S), [pid]);
let tile_b = b.load_tile(const_shape!(S), [pid]);
c.store(tile_a + tile_b);
}
}
fn main() -> () {
let a = api::ones([32]).arc();
let b = api::ones([32]).arc();
let c = api::zeros([32]).partition([4]);
let c_host_vec = add(a, b, c)
.grid((8, 1, 1))
.unzip()
.2
.unpartition()
.to_host_vec()
.sync();
println!("{:#?}", c_host_vec);
}