-
Notifications
You must be signed in to change notification settings - Fork 0
/
bmm_main.cu
executable file
·146 lines (124 loc) · 4.79 KB
/
bmm_main.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
//Do NOT MODIFY THIS FILE
#include <stdio.h>
#include <time.h>
#include <stdlib.h>
#include <math.h>
#include "cuda_runtime.h"
#include "device_launch_parameters.h"
#include "gputimer.h"
#include "gpuerrors.h"
#include "bmm.h"
// ===========================> Functions Prototype <===============================
// Writes `size` pseudo-random whole-number values in [-8, 8] into data[0..size).
void fill(float* data, int size);
// Returns the SUM (not the mean) of squared differences data1[i]-data2[i] over `size` elements.
double calc_mse(float* data1, float* data2, int size);
// CPU reference for a single output element (y, x): accumulates a(y,k)*b(k,x) for k in [0, n).
void cpuKernel_yx(const float* const a, const float* const b, float* c, const int m, const int n, const int y, const int x);
// CPU reference for one output row y: runs cpuKernel_yx for every x in [0, n).
void cpuKernel_y(const float* const a, const float* const b, float* c, const int m, const int n, const int y);
// CPU reference for the entire output: runs cpuKernel_yx for every (y, x) in [0, n) x [0, n).
void cpuKernel(const float* const a, const float* const b, float* c, const int m, const int n);
// Copies a and b to the device, launches kernelFunc, stores the timer reading in
// *gpu_kernel_time and copies the device result back into c.
void gpuKernel(const float* const a, const float* const b, float* c, const int m, const int n, double* gpu_kernel_time);
// =================================================================================
// Program entry point.
//
// Usage: <prog> m
//   m - log2 of the matrix dimension; the matrices are n x n with n = 2^m.
//       Accepted range is 0..15 so that n*n still fits in the `int` element
//       counts used by the helpers in this file.
//
// Steps: print the name of CUDA device 0, build two host matrices of small
// random whole numbers, compute a CPU reference (the entire product for
// m <= 10, only the first and last rows otherwise), run gpuKernel, and print
// the timings together with the accumulated squared error between the CPU and
// GPU results.
//
// Returns 0 on success and 1 on a bad command line or a failed host allocation.
int main(int argc, char** argv) {
    struct cudaDeviceProp p;
    // HANDLE_ERROR comes from gpuerrors.h (not visible here); it is used the
    // same way around every other CUDA runtime call in this file.
    HANDLE_ERROR(cudaGetDeviceProperties(&p, 0));
    printf("Device Name: %s\n", p.name);

    // get parameter from command line to build Matrix dimension
    // (10<=m<=13 is the intended range: m>=14 does not fit in the memory of
    // the 1GB GPU this was written for)
    if (argc < 2) {
        fprintf(stderr, "Usage: %s <m>   (matrix dimension n = 2^m, 0 <= m <= 15)\n",
                (argc > 0 && argv[0]) ? argv[0] : "bmm");
        return 1;
    }
    char* end = NULL;
    const long m_arg = strtol(argv[1], &end, 10);
    if (end == argv[1] || *end != '\0' || m_arg < 0 || m_arg > 15) {
        // m > 15 would overflow the int products n*n / n*(n-1) used below,
        // and a negative m would make the shift undefined.
        fprintf(stderr, "Invalid m '%s': expected an integer in [0, 15]\n", argv[1]);
        return 1;
    }
    const int m = (int)m_arg;
    const int n = (1 << m);
    const size_t bytes = (size_t)n * (size_t)n * sizeof(float);

    // allocate memory in CPU for calculation
    float* a        = (float*)malloc(bytes);
    float* b        = (float*)malloc(bytes);
    float* c_serial = (float*)malloc(bytes);
    float* c        = (float*)malloc(bytes);
    if (a == NULL || b == NULL || c_serial == NULL || c == NULL) {
        fprintf(stderr, "Host allocation of four %dx%d float matrices failed\n", n, n);
        free(a);          // free(NULL) is a no-op, so partial success is fine
        free(b);
        free(c_serial);
        free(c);
        return 1;
    }

    // fill a, b matrices with random whole-number values between -8.0f and 8.0f
    srand(0);
    fill(a, n*n);
    fill(b, n*n);

    // CPU calculations
    if (m<=10) {
        cpuKernel (a, b, c_serial, m, n);
    } else {
        // the full reference is too slow for large n: only spot-check two rows
        cpuKernel_y (a, b, c_serial, m, n, 0);   // first row
        cpuKernel_y (a, b, c_serial, m, n, n-1); // last row
    }

    // GPU calculations (clock() brackets allocation, copies and the kernel)
    double gpu_kernel_time = 0.0;
    clock_t t1 = clock();
    gpuKernel (a, b, c, m, n, &gpu_kernel_time);
    clock_t t2 = clock();

    // check correctness of GPU calculations against CPU
    double mse = 0.0;
    if (m<=10) {
        mse += calc_mse( c_serial, c, n*n );
    } else {
        mse += calc_mse( c_serial          , c          , n ); // first row
        mse += calc_mse( c_serial + n*(n-1), c + n*(n-1), n ); // last row
    }

    // clock ticks -> milliseconds; CLOCKS_PER_SEC is not 1e6 on every platform
    const double gpu_total_ms = (double)(t2 - t1) * 1000.0 / (double)CLOCKS_PER_SEC;
    printf("m=%d n=%d GPU=%g ms GPU-Kernel=%g ms mse=%g\n",
           m, n, gpu_total_ms, gpu_kernel_time, mse);

    // release the host buffers
    free(a);
    free(b);
    free(c_serial);
    free(c);
    return 0;
}
//-----------------------------------------------------------------------------
// Populates data[0..size) in ascending index order with pseudo-random whole
// numbers in the inclusive range [-8, 8], stored as float. One rand() call is
// consumed per element; nothing is written when size <= 0.
void fill(float* data, int size) {
    float* out = data;
    for (int remaining = size; remaining > 0; --remaining) {
        const int draw = rand() % 17;   // 0..16
        *out++ = (float) (draw - 8);    // shift to -8..8
    }
}
// Accumulates the squared element-wise difference between data1 and data2 over
// the first `size` elements and returns that total.
// Note: despite the name, the result is the SUM of squared errors; it is not
// divided by `size`. Returns 0.0 when size <= 0.
double calc_mse (float* data1, float* data2, int size) {
    double total = 0.0;
    const float* lhs = data1;
    const float* rhs = data2;
    for (int remaining = size; remaining > 0; --remaining) {
        // the subtraction is carried out in float, then widened to double
        const double diff = *lhs - *rhs;
        const double squared = diff * diff;
        total += squared;
        ++lhs;
        ++rhs;
    }
    return total;
}
//-----------------------------------------------------------------------------
// CPU reference for one output element (y, x).
// Zeroes the destination element of c, then adds a(y,k) * b(k,x) into it for
// every k in [0, n), in ascending k order.
// mem2d is declared outside this file (presumably in bmm.h); it is assumed to
// address element (row, col) of a matrix whose row length is 2^m -- confirm.
void cpuKernel_yx(const float* const a, const float* const b, float* c, const int m, const int n,
        const int y, const int x) { // one element: y,x
    float* const dst = &mem2d(c,m,y,x);
    *dst = 0.0f;
    int k = 0;
    while (k < n) {
        // accumulate straight into c, exactly one read-modify-write per term
        *dst += mem2d(a,m,y,k) * mem2d(b,m,k,x);
        ++k;
    }
}
// CPU reference for a single output row: evaluates cpuKernel_yx for row y at
// every column in [0, n), left to right.
void cpuKernel_y(const float* const a, const float* const b, float* c, const int m, const int n,
        const int y) { // one row: y
    int col = 0;
    while (col < n) {
        cpuKernel_yx(a, b, c, m, n, y, col);
        ++col;
    }
}
// CPU reference for the entire output: evaluates cpuKernel_yx for every
// (row, col) pair in [0, n) x [0, n), row by row, columns ascending.
void cpuKernel(const float* const a, const float* const b, float* c, const int m, const int n) { // entire matrix
    int row = 0;
    while (row < n) {
        int col = 0;
        while (col < n) {
            cpuKernel_yx(a, b, c, m, n, row, col);
            ++col;
        }
        ++row;
    }
}
//-----------------------------------------------------------------------------
// Host-side driver for the GPU matrix product.
//
//   a, b            - host input matrices, n*n floats each
//   c               - host output buffer, n*n floats, overwritten with the device result
//   m, n            - forwarded unchanged to getDimGrid/getDimBlock/kernelFunc
//   gpu_kernel_time - receives timer.Elapsed() for the kernel launch only
//                     (the caller prints it as milliseconds; GpuTimer is
//                     defined in gputimer.h, not visible here)
//
// Allocates three device buffers, uploads a and b, launches kernelFunc with the
// configuration returned by getDimGrid/getDimBlock, downloads the result into c
// and frees the device buffers. Every CUDA runtime call, including the launch
// itself, is checked through HANDLE_ERROR.
void gpuKernel(const float* const a, const float* const b, float* c, const int m, const int n, double* gpu_kernel_time) {
    // compute the byte count once, in size_t, so n*n is never evaluated in int
    const size_t bytes = (size_t)n * (size_t)n * sizeof(float);

    float* ad;
    float* bd;
    float* cd;

    HANDLE_ERROR(cudaMalloc((void**)&ad, bytes));
    HANDLE_ERROR(cudaMalloc((void**)&bd, bytes));
    HANDLE_ERROR(cudaMalloc((void**)&cd, bytes));

    HANDLE_ERROR(cudaMemcpy(ad, a, bytes, cudaMemcpyHostToDevice));
    HANDLE_ERROR(cudaMemcpy(bd, b, bytes, cudaMemcpyHostToDevice));

    dim3 dimGrid = getDimGrid(m,n); //modify this function in bmm.cu
    dim3 dimBlock = getDimBlock(m,n); //modify this function in bmm.cu

    GpuTimer timer;
    timer.Start();
    kernelFunc<<< dimGrid,dimBlock >>>(ad, bd, cd, m, n); //modify this function in bmm.cu
    timer.Stop();
    // A kernel launch returns no status of its own: an invalid grid/block
    // configuration is only reported through cudaGetLastError(). Checked after
    // timer.Stop() so the timed region is left exactly as it was.
    HANDLE_ERROR(cudaGetLastError());
    *gpu_kernel_time = timer.Elapsed();

    // faults raised while the kernel was running are reported by this copy
    HANDLE_ERROR(cudaMemcpy(c, cd, bytes, cudaMemcpyDeviceToHost));

    HANDLE_ERROR(cudaFree(ad));
    HANDLE_ERROR(cudaFree(bd));
    HANDLE_ERROR(cudaFree(cd));
}