-
Notifications
You must be signed in to change notification settings - Fork 0
/
matrix.mojo
139 lines (107 loc) · 4.07 KB
/
matrix.mojo
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
from sys.info import simdwidthof
from random import rand
from memory.memory import memset_zero
from algorithm import vectorize, parallelize
from runtime.llcl import Runtime
# alias f32 = DType.float32
# Number of float32 lanes in one hardware SIMD register on the build target;
# used as the vectorization width by every kernel below.
alias nelts = simdwidthof[DType.float32]()
# TODO: try out the built-in Tensor type instead of a custom Matrix struct
# TODO: use generic FP datatype
struct Matrix:
    """Row-major float32 matrix backed by a raw heap allocation.

    Element (y, x) lives at flat offset y * cols + x.
    NOTE(review): __init__ allocates but does NOT initialize the buffer —
    call zero()/fill() (or write every element) before reading. __del__
    frees the buffer and no __copyinit__ is defined, so treat instances
    as move-only to avoid a double free.
    """
    var rows: Int  # number of rows
    var cols: Int  # number of columns
    var data: DTypePointer[DType.float32]  # rows*cols contiguous float32 values

    fn __init__(inout self, rows: Int, cols: Int):
        # Allocates uninitialized storage for rows*cols elements.
        self.rows = rows
        self.cols = cols
        self.data = DTypePointer[DType.float32].alloc(rows*cols)

    @always_inline
    fn __getitem__(self, y: Int, x: Int) -> Float32:
        # Scalar read at (row y, column x).
        return self.load[1](y, x)

    @always_inline
    fn __getitem__(self, i: Int) -> Float32:
        # Scalar read at flat row-major index i.
        return self.data.simd_load[1](i)

    @always_inline
    fn __setitem__(inout self, y: Int, x: Int, val: Float32):
        # Scalar write at (row y, column x).
        self.store[1](y, x, val)

    @always_inline
    fn __setitem__(inout self, i: Int, val: Float32):
        # Scalar write at flat row-major index i.
        self.data.simd_store[1](i, val)

    @always_inline
    fn zero(inout self):
        # Set every element to 0.0.
        memset_zero(self.data, self.size())

    @always_inline
    fn fill(inout self, val: Float32):
        # Set every element to val (one scalar store per element).
        for i in range(self.size()):
            self.data.simd_store[1](i, val)

    @always_inline
    fn load[nelts: Int](self, y: Int, x: Int) -> SIMD[DType.float32, nelts]:
        # Vector read of nelts consecutive elements of row y starting at
        # column x. No bounds checking; a read past x + nelts > cols spills
        # into the next row's storage.
        return self.data.simd_load[nelts](y * self.cols + x)

    @always_inline
    fn store[nelts: Int](self, y: Int, x: Int, val: SIMD[DType.float32, nelts]):
        # Vector write of nelts consecutive elements of row y starting at
        # column x. No bounds checking (see load).
        return self.data.simd_store[nelts](y * self.cols + x, val)

    @always_inline
    fn size(self) -> Int:
        # Total element count (rows * cols).
        return self.rows * self.cols

    fn dump(self):
        # Pretty-print the matrix as nested brackets, one row per line.
        print_no_newline("[")
        for y in range(self.rows):
            print_no_newline("[")
            for x in range(self.cols):
                print_no_newline(self[y, x], "")
            if y == self.rows - 1:
                print_no_newline("]")
            else:
                print("],")
        print("]")

    fn __del__(owned self):
        # Release the heap buffer allocated in __init__.
        self.data.free()
fn matmul(inout C: Matrix, A: Matrix, B: Matrix):
    """Accumulate the product A @ B into C (triple-loop reference version).

    C is not zeroed here: each dot-product term is added on top of C's
    current contents, so callers wanting a plain product must zero C first.
    The k-outer / n-inner ordering keeps the B accesses row-contiguous.
    """
    for row in range(A.rows):
        for inner in range(A.cols):
            # A[row, inner] is constant across the whole inner sweep.
            let a_val = A[row, inner]
            for col in range(C.cols):
                C[row, col] += a_val * B[inner, col]
fn matmul(C: Matrix, A: Matrix, B: Matrix, rt: Runtime):
    """C = A @ B. Convenience overload forwarding to the parallelized kernel."""
    matmul_parallelized(C, A, B, rt)
fn matmul_parallelized(C: Matrix, A: Matrix, B: Matrix, rt: Runtime):
    """Accumulate A @ B into C: rows run in parallel on rt's workers and the
    inner column loop is vectorized with nelts-wide SIMD.

    NOTE(review): like the naive matmul this ADDS into C (C.load + ...), so C
    should be zeroed first for a plain product.
    """
    @parameter
    fn calc_row(m: Int):
        # One parallel task per output row m.
        for k in range(A.cols):
            @parameter
            fn dot[nelts: Int](n: Int):
                # FMA over a vector of output columns; the scalar A[m, k]
                # broadcasts across the SIMD lanes.
                C.store[nelts](m, n, C.load[nelts](m, n) + A[m, k] * B.load[nelts](k, n))
            # vectorize covers the tail by invoking dot with a smaller width.
            vectorize[nelts, dot](C.cols)
    parallelize[calc_row](rt, C.rows)
fn matmul_transposed(inout C: Matrix, A: Matrix, B: Matrix):
    """Accumulate A @ B.T into C (triple-loop reference version).

    B is indexed as B[j, p]: rows of B are dotted with rows of A, which keeps
    both operands row-contiguous in the innermost loop. C is not cleared
    first; results add onto its existing contents.
    """
    for i in range(A.rows):
        for j in range(B.rows):
            for p in range(A.cols):
                C[i, j] += A[i, p] * B[j, p]
fn matmul_transposed(C: Matrix, A: Matrix, B: Matrix, rt: Runtime):
    """C = A @ B.T. Convenience overload forwarding to the parallelized kernel."""
    matmul_parallelized_transposed(C, A, B, rt)
fn matmul_parallelized_transposed(C: Matrix, A: Matrix, B: Matrix, rt: Runtime):
    """C = A @ B.T."""
    # NOTE(review): unlike matmul_parallelized (which accumulates into C) and
    # the naive matmul_transposed (which also accumulates), this kernel
    # OVERWRITES C[m, n] with the dot product. Confirm which contract callers
    # expect before relying on either behavior.
    @parameter
    fn calc_row(m: Int):
        # One parallel task per output row m.
        for n in range(C.cols):
            # Lane-wise partial sums for dot(A row m, B row n).
            var tmp = SIMD[DType.float32, nelts](0)
            @parameter
            fn dot[nelts_: Int](k: Int):
                if nelts_ < nelts: # take care of tail elements
                    # Tail chunk narrower than a full register: reduce it now
                    # and fold the scalar into lane 0.
                    tmp[0] += (A.load[nelts_](m, k) * B.load[nelts_](n, k)).reduce_add()
                else:
                    tmp += A.load[nelts](m, k) * B.load[nelts](n, k)
            vectorize[nelts, dot](A.cols)
            # Horizontal sum of the lane partials yields the final element.
            C.store[1](m, n, tmp.reduce_add())
    parallelize[calc_row](rt, C.rows)
fn transpose(out_m: Matrix, in_m: Matrix, rt: Runtime):
    """out_m = in_m^T (out_m must be shaped in_m.cols x in_m.rows).

    Bug fix: the previous version stored the vector loaded from
    in_m[m, n..n+w) contiguously at out_m[n, m..m+w) — i.e. along a ROW of
    out_m — but a transpose must scatter those elements down a COLUMN
    (in_m[m, n+j] -> out_m[n+j, m]). That was only correct for width 1.
    The vectorized row load is kept; the store now scatters lane by lane.
    """
    @parameter
    fn row_fn(m: Int):
        # One parallel task per input row m.
        @parameter
        fn col_fn[nelts_: Int](n: Int):
            # Read nelts_ consecutive elements of input row m ...
            let chunk = in_m.load[nelts_](m, n)
            # ... and scatter them to column m of the output, rows n..n+nelts_.
            for j in range(nelts_):
                out_m.store[1](n + j, m, chunk[j])
        # vectorize covers the row tail by calling col_fn with a smaller width.
        vectorize[nelts, col_fn](in_m.cols)
    parallelize[row_fn](rt, in_m.rows)