This repository has been archived by the owner on Mar 3, 2020. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 57
/
Sleep.cu
81 lines (67 loc) · 2.31 KB
/
Sleep.cu
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
#include <cstdio>
#include <cuda_runtime_api.h>
#include <mutex>
// FIXME: nvcc and gcc 4.9 don't like std::unordered_map
#include <tr1/unordered_map>
#include "util/CachedDeviceProperties.h"
namespace facebook { namespace cuda {
namespace {
// Compile-time switch for the diagnostic printf in cudaSleep. It is a pure
// on/off flag, so it is typed as bool rather than int.
constexpr bool kDebug = false;
}  // namespace
/**
 * Busy-waits on the device until at least `waitCycles` ticks of clock64()
 * have elapsed since kernel entry, then reports the measured tick count.
 *
 * The `type` template parameter is not referenced in the body; each distinct
 * value only produces a separate instantiation of the same code.
 *
 * @param cycles      Output location, dereferenced from device code. Receives
 *                    the number of clock64() ticks between kernel entry and
 *                    the final clock read, converted to double.
 * @param waitCycles  Minimum number of clock64() ticks to spin for. A value
 *                    <= 0 makes the loop exit on its first check.
 */
template <int type>
__global__ void sleepKernel(double* cycles, int64_t waitCycles) {
  // Declared but never referenced inside this kernel.
  extern __shared__ int s[];
  const long long int start = clock64();

  // Every launched thread spins for the requested duration.
  while (clock64() - start < waitCycles) {
  }

  // Only one designated thread publishes the result. Letting every thread
  // store a different value to the same address non-atomically would leave
  // the surviving value up to whichever write happened to land last.
  const bool firstThreadInBlock =
      threadIdx.x == 0 && threadIdx.y == 0 && threadIdx.z == 0;
  const bool firstBlockInGrid =
      blockIdx.x == 0 && blockIdx.y == 0 && blockIdx.z == 0;
  if (firstThreadInBlock && firstBlockInGrid) {
    *cycles = double(clock64() - start);
  }
}
namespace {

// Logs a failed CUDA runtime call to stderr.
// Returns true iff `status` is cudaSuccess.
bool sleepCallSucceeded(cudaError_t status, const char* what) {
  if (status == cudaSuccess) {
    return true;
  }
  fprintf(stderr, "cudaSleep: %s failed: %s\n", what,
          cudaGetErrorString(status));
  return false;
}

// Launches sleepKernel<TYPE> with the given configuration, waits for it via a
// blocking device-to-host copy of the measured tick count, and prints the
// duration when kDebug is set.
template <int TYPE>
void launchSleepKernel(double* deviceTicks,
                       int64_t cycles,
                       int gridSize,
                       int blockSize,
                       double clockRate) {
  sleepKernel<TYPE><<<gridSize, blockSize>>>(deviceTicks, cycles);
  // Peek instead of get: the error is reported here but is not cleared, so a
  // caller that inspects the runtime's last error afterwards still sees it.
  if (!sleepCallSucceeded(cudaPeekAtLastError(), "sleepKernel launch")) {
    return;
  }

  double ticks = 0.0;
  // cudaMemcpy blocks the host, so this also waits for the kernel to finish.
  if (!sleepCallSucceeded(
          cudaMemcpy(&ticks, deviceTicks, sizeof(double),
                     cudaMemcpyDeviceToHost),
          "cudaMemcpy")) {
    return;
  }

  if (kDebug) {
    // ticks / clockRate * 1000; presumably clockRate is in kHz (as in
    // cudaDeviceProp::clockRate), which makes this microseconds -- confirm
    // against getCurrentDeviceProperties().
    unsigned long micros = (ticks / clockRate) * 1000;
    printf("cuda slept %lu us\n", micros);
  }
}

}  // namespace

/**
 * Occupies the current CUDA device with a busy-wait kernel for at least
 * `cycles` device clock ticks and blocks the calling host thread until the
 * kernel has finished.
 *
 * Calls are serialized through a function-local mutex. A one-double device
 * buffer is allocated per device on first use and kept in a static map for
 * later calls; it is never freed here.
 *
 * If any CUDA runtime call fails, a message is written to stderr and the
 * function returns without sleeping.
 *
 * @param cycles  Minimum number of device clock ticks the kernel spins for.
 * @param type    Selects the kernel instantiation: 1..5 map to
 *                sleepKernel<1>..sleepKernel<5>; any other value uses
 *                sleepKernel<99>.
 */
void cudaSleep(int64_t cycles, int type) {
  static std::mutex m;
  std::lock_guard<std::mutex> guard(m);

  int device = 0;
  if (!sleepCallSucceeded(cudaGetDevice(&device), "cudaGetDevice")) {
    return;
  }

  // Per-device scratch buffer that receives the kernel's measured tick count.
  static std::tr1::unordered_map<int, double*> clocks;
  auto entry = clocks.find(device);
  if (entry == clocks.end()) {
    double* buffer = nullptr;
    // Do not cache anything on failure, so a later call can retry the
    // allocation instead of reusing a bad pointer.
    if (!sleepCallSucceeded(
            cudaMalloc((void**)(&buffer), sizeof(double)), "cudaMalloc")) {
      return;
    }
    entry = clocks.insert(std::make_pair(device, buffer)).first;
  }
  double* deviceTicks = entry->second;

  // Bind by const reference so the properties object is not copied when the
  // helper hands back a reference; a returned temporary is lifetime-extended.
  const auto& props = getCurrentDeviceProperties();

  // Use all available SMs, smem to force kernel to eat up all resources.
  // Note: the occupancy query and the launches request 0 bytes of dynamic
  // shared memory.
  int minGridSize = 0;
  int blockSize = 0;
  if (!sleepCallSucceeded(
          cudaOccupancyMaxPotentialBlockSize(
              &minGridSize, &blockSize, sleepKernel<1>, 0, 0),
          "cudaOccupancyMaxPotentialBlockSize")) {
    return;
  }

  const double clockRate = static_cast<double>(props.clockRate);
  switch (type) {
    case 1:
      launchSleepKernel<1>(deviceTicks, cycles, minGridSize, blockSize,
                           clockRate);
      break;
    case 2:
      launchSleepKernel<2>(deviceTicks, cycles, minGridSize, blockSize,
                           clockRate);
      break;
    case 3:
      launchSleepKernel<3>(deviceTicks, cycles, minGridSize, blockSize,
                           clockRate);
      break;
    case 4:
      launchSleepKernel<4>(deviceTicks, cycles, minGridSize, blockSize,
                           clockRate);
      break;
    case 5:
      launchSleepKernel<5>(deviceTicks, cycles, minGridSize, blockSize,
                           clockRate);
      break;
    default:
      launchSleepKernel<99>(deviceTicks, cycles, minGridSize, blockSize,
                            clockRate);
      break;
  }
}
} }
// C-linkage entry point that forwards both arguments unchanged to
// facebook::cuda::cudaSleep.
extern "C" void cudaSleepFFI(int64_t cycles, int type) {
  namespace fbcuda = facebook::cuda;
  fbcuda::cudaSleep(cycles, type);
}