Skip to content

Commit

Permalink
Merge pull request #45 from daltonrpruitt/cross-platform-ify
Browse files Browse the repository at this point in the history
Cross-platform-ify
  • Loading branch information
daltonrpruitt authored Nov 2, 2022
2 parents 84c9462 + 5657d10 commit 2be2725
Show file tree
Hide file tree
Showing 32 changed files with 397 additions and 198 deletions.
46 changes: 45 additions & 1 deletion .vscode/c_cpp_properties.json
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,51 @@
"cStandard": "gnu17",
"cppStandard": "gnu++14",
"intelliSenseMode": "linux-gcc-x64",
"configurationProvider": "ms-vscode.cmake-tools"
"mergeConfigurations": false,
"browse": {
"path": [
"${workspaceFolder}/**",
"${workspaceFolder}/src",
"${CUDA_INCLUDE_DIRS}",
"/usr/include/linux",
"/apps/cuda-11.2.1/toolkit/targets/x86_64-linux/include",
"/apps/gcc-8.3.0/gcc-8.3.0/lib/gcc/x86_64-pc-linux-gnu/8.3.0/../../../../include/c++/8.3.0",
"/apps/gcc-8.3.0/gcc-8.3.0/lib/gcc/x86_64-pc-linux-gnu/8.3.0/../../../../include/c++/8.3.0/x86_64-pc-linux-gnu",
"/apps/gcc-8.3.0/gcc-8.3.0/lib/gcc/x86_64-pc-linux-gnu/8.3.0/../../../../include/c++/8.3.0/backward",
"/apps/gcc-8.3.0/gcc-8.3.0/lib/gcc/x86_64-pc-linux-gnu/8.3.0/include",
"/usr/local/include",
"/apps/gcc-8.3.0/gcc-8.3.0/include",
"/apps/gcc-8.3.0/gcc-8.3.0/lib/gcc/x86_64-pc-linux-gnu/8.3.0/include-fixed",
"/usr/include"
],
"limitSymbolsToIncludedHeaders": true
}
},
{
"name": "GPU Laptop",
"includePath": [
"${workspaceFolder}/**",
"C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v11.7\\include"
],
"defines": [
"_DEBUG",
"UNICODE",
"_UNICODE"
],
"windowsSdkVersion": "10.0.18362.0",
"compilerPath": "C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v11.7/bin/nvcc.exe",
"cStandard": "c17",
"cppStandard": "c++17",
"intelliSenseMode": "windows-msvc-x64",
"mergeConfigurations": false,
"browse": {
"path": [
"${workspaceFolder}/**",
"C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v11.7\\include"
],
"limitSymbolsToIncludedHeaders": true
},
"configurationProvider": "ms-vscode.cpptools"
}
],
"version": 4
Expand Down
18 changes: 17 additions & 1 deletion .vscode/settings.json
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,23 @@
"__functional_base": "cpp",
"__node_handle": "cpp",
"__memory": "cpp",
"filesystem": "cpp"
"filesystem": "cpp",
"xstring": "cpp",
"stack": "cpp",
"xfacet": "cpp",
"xhash": "cpp",
"xiosbase": "cpp",
"xlocale": "cpp",
"xlocbuf": "cpp",
"xlocinfo": "cpp",
"xlocmes": "cpp",
"xlocmon": "cpp",
"xlocnum": "cpp",
"xloctime": "cpp",
"xmemory": "cpp",
"xstddef": "cpp",
"xtr1common": "cpp",
"xutility": "cpp"
},
"cSpell.words": [
"Spmv",
Expand Down
26 changes: 22 additions & 4 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,25 +18,43 @@ include_directories(${CMAKE_SOURCE_DIR}/src/utilities/matrices)
# add_executable(driver src/driver.cu)
add_executable(main src/main.cu)
set_property(TARGET main PROPERTY CXX_STANDARD 17)
set_property(TARGET main PROPERTY CUDA_ARCHITECTURES 35 52 72 80) #35-K20m, 52-Quadro M2000, 72-V100, 80-A100
set_property(TARGET main PROPERTY CUDA_ARCHITECTURES 52 ) # 52-Quadro M2000 or GTX 980M (personal laptop)
# set_property(TARGET main PROPERTY CUDA_ARCHITECTURES 35 52 72 80) #35-K20m, 52-Quadro M2000, 72-V100, 80-A100
# set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --keep -Xptxas -dlcm=cg")
if (MSVC)
target_compile_options(main PRIVATE $<$<COMPILE_LANGUAGE:CUDA>: -Xcompiler "/bigobj" >)
# list(APPEND CUDA_NVCC_FLAGS "--compiler-options;/bigobj")
# else ()
# target_compile_options(main PRIVATE -Wa,-mbig-obj)
endif ()
target_compile_options(main PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:
--keep
-Xptxas -dlcm=cg
-Xcudafe "--diag_suppress=unsigned_compare_with_zero"
>)
>)

if (NOT MSVC)
add_executable(spmv src/spmv.cu)
set_property(TARGET spmv PROPERTY CXX_STANDARD 17)
set_property(TARGET spmv PROPERTY CUDA_ARCHITECTURES 35 52 72 80) #35-K20m, 52-Quadro M2000, 72-V100, 80-A100
set_property(TARGET main PROPERTY CUDA_ARCHITECTURES 52 ) # 52-Quadro M2000 or GTX 980M (personal laptop)
# set_property(TARGET spmv PROPERTY CUDA_ARCHITECTURES 35 52 72 80) #35-K20m, 52-Quadro M2000, 72-V100, 80-A100
# set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr --maxrregcount=255") # -dlcm=cg --expt-relaxed-constexpr
target_compile_options(spmv PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:
--expt-relaxed-constexpr
--maxrregcount=255
--keep
-Xptxas -dlcm=cg
-Xcudafe "--diag_suppress=set_but_not_used --diag_suppress=declared_but_not_referenced"
>)
>)

if (MSVC)
target_compile_options(spmv PRIVATE $<$<COMPILE_LANGUAGE:CUDA>: -Xcompiler "/bigobj" >)
# else ()
# target_compile_options(spmv PRIVATE -Wa,-mbig-obj)
endif ()

endif ()

# message(STATUS "python exec: ${Python_EXECUTABLE}")

# add_custom_target(
Expand Down
76 changes: 76 additions & 0 deletions src/clock_gettime_windows.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
/**
* @file clock_gettime_windows.h
* @author Dalton Winans-Pruitt ([email protected])
* @brief Uses a SO post about porting the clock_gettime() function
* @version 0.1
* @date 2022-09-17
*
* Based on https://stackoverflow.com/questions/5404277/porting-clock-gettime-to-windows,
* which seems to be the established answer to this problem.
* Also, this is only relevant due to trying to run on Windows.
*
* @copyright Copyright (c) 2022
*
*/
#include <windows.h>
#include <time.h>

LARGE_INTEGER
getFILETIMEoffset()
{
SYSTEMTIME s;
FILETIME f;
LARGE_INTEGER t;

s.wYear = 1970;
s.wMonth = 1;
s.wDay = 1;
s.wHour = 0;
s.wMinute = 0;
s.wSecond = 0;
s.wMilliseconds = 0;
SystemTimeToFileTime(&s, &f);
t.QuadPart = f.dwHighDateTime;
t.QuadPart <<= 32;
t.QuadPart |= f.dwLowDateTime;
return (t);
}

int
clock_gettime(int X, struct timespec *tv)
{
LARGE_INTEGER t;
FILETIME f;
double microseconds;
static LARGE_INTEGER offset;
static double frequencyToMicroseconds;
static int initialized = 0;
static BOOL usePerformanceCounter = 0;

if (!initialized) {
LARGE_INTEGER performanceFrequency;
initialized = 1;
usePerformanceCounter = QueryPerformanceFrequency(&performanceFrequency);
if (usePerformanceCounter) {
QueryPerformanceCounter(&offset);
frequencyToMicroseconds = (double)performanceFrequency.QuadPart / 1000000.;
} else {
offset = getFILETIMEoffset();
frequencyToMicroseconds = 10.;
}
}
if (usePerformanceCounter) QueryPerformanceCounter(&t);
else {
GetSystemTimeAsFileTime(&f);
t.QuadPart = f.dwHighDateTime;
t.QuadPart <<= 32;
t.QuadPart |= f.dwLowDateTime;
}

t.QuadPart -= offset.QuadPart;
microseconds = (double)t.QuadPart / frequencyToMicroseconds;
t.QuadPart = microseconds;
tv->tv_sec = t.QuadPart / 1000000;
tv->tv_nsec = t.QuadPart % 1000000 * 1000;
return (0);
}
2 changes: 1 addition & 1 deletion src/device_props.h
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ class device_context {
* 1000 / (1024*1024*1024); // 1000 cycles/kcycle * 1 GB/1024^3 B
std::streamsize ss = cout.precision();
cout << "Device '" << props_.name << "' : Max Bandwidth = " << std::fixed << std::setprecision(1) << theoretical_bw_ << " GB/s" << endl;
cout << std::setprecision(ss) << resetiosflags( std::ios::fixed | std::ios::showpoint );
cout << std::setprecision(ss) << std::resetiosflags( std::ios::fixed | std::ios::showpoint );
}
cudaPrintLastError();
return pass;
Expand Down
6 changes: 5 additions & 1 deletion src/indices_generation.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
// Index array generation functions
#pragma once
#include <iostream>
#include <random>
#include <algorithm>
#include <vector>
#include <string>
Expand Down Expand Up @@ -258,8 +259,11 @@ int random_indices(it* indxs, unsigned long long N, int block_size, int shuffle_
for(int i=0; i < N; i++) {
indxs[i] = i;
}
std::random_device rd;
std::mt19937 generator(rd());

for(int i=0; i < N; i+=shuffle_size) {
std::random_shuffle(indxs+i, indxs + i + shuffle_size);
std::shuffle(indxs+i, indxs + i + shuffle_size, generator);
}

for(int i=0; i < N; i++) {
Expand Down
6 changes: 3 additions & 3 deletions src/kernel_context.cu
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ using std::vector;
template<typename kernel_ctx_t>
__global__
void compute_kernel(unsigned long long N, kernel_ctx_t ctx) {
uint idx = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
if(idx >= N) return;
ctx(idx);
}
Expand Down Expand Up @@ -94,8 +94,8 @@ float local_execute_template(int N, int Gsz, int Bsz, int shdmem_usage, device_c
template<typename vt, typename it>
struct KernelCPUContext {
public:
uint vt_size = sizeof(vt);
uint it_size = sizeof(it);
unsigned int vt_size = sizeof(vt);
unsigned int it_size = sizeof(it);
typedef it IT;
string name;
unsigned long long N=0;
Expand Down
8 changes: 4 additions & 4 deletions src/kernels/burst_mode/interleaved_copy.cu
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ using std::vector;

template<typename vt, typename it, int block_life, int elements>
__forceinline__ __host__ __device__
void interleaved_kernel(uint idx, vt* gpu_in, vt* gpu_out, unsigned long long N){
void interleaved_kernel(unsigned int idx, vt* gpu_in, vt* gpu_out, unsigned long long N){

unsigned long long b_idx = blockIdx.x;
unsigned long long t_idx = threadIdx.x;
Expand All @@ -47,7 +47,7 @@ void interleaved_kernel(uint idx, vt* gpu_in, vt* gpu_out, unsigned long long N)

template<typename vt, typename it, int block_life, int elements>
__global__
void uncoalesced_reuse_kernel_for_regs(uint idx, vt* gpu_in, vt* gpu_out, unsigned long long N){
void uncoalesced_reuse_kernel_for_regs(unsigned int idx, vt* gpu_in, vt* gpu_out, unsigned long long N){
extern __shared__ int dummy[];
interleaved_kernel<vt, it, block_life, elements>(idx, gpu_in, gpu_out, N);
}
Expand Down Expand Up @@ -75,7 +75,7 @@ struct InterleavedCopyContext : public KernelCPUContext<vt, it> {
unsigned long long N;

__device__
void operator() (uint idx){
void operator() (unsigned int idx){
extern __shared__ int dummy[];
interleaved_kernel<vt, it, block_life, elements>(idx, gpu_in, gpu_out, N);
}
Expand Down Expand Up @@ -146,7 +146,7 @@ struct InterleavedCopyContext : public KernelCPUContext<vt, it> {
void local_compute_register_usage(bool& pass) override {
// Kernel Registers
struct cudaFuncAttributes funcAttrib;
cudaErrChk(cudaFuncGetAttributes(&funcAttrib, *uncoalesced_reuse_kernel_for_regs<vt,it,block_life,elements>), "getting function attributes (for # registers)", pass);
cudaErrChk(cudaFuncGetAttributes(&funcAttrib, uncoalesced_reuse_kernel_for_regs<vt,it,block_life,elements>), "getting function attributes (for # registers)", pass);
if(!pass) {
this->okay = false;
return;
Expand Down
10 changes: 5 additions & 5 deletions src/kernels/burst_mode/interleaved_copy_full_life.cu
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ using std::vector;

template<typename vt, typename it, int elements>
__forceinline__ __host__ __device__
void interleaved_full_life_kernel(uint idx, vt* gpu_in, vt* gpu_out, unsigned long long N){
void interleaved_full_life_kernel(unsigned int idx, vt* gpu_in, vt* gpu_out, unsigned long long N){

// unsigned long long b_idx = blockIdx.x;
// unsigned long long t_idx = threadIdx.x;
Expand All @@ -37,7 +37,7 @@ void interleaved_full_life_kernel(uint idx, vt* gpu_in, vt* gpu_out, unsigned lo

// int block_life = N / gridDim.x / elements;
unsigned long long start_idx = blockIdx.x * blockDim.x * elements + threadIdx.x;
uint cycle_offset = gridDim.x * blockDim.x * elements;
unsigned int cycle_offset = gridDim.x * blockDim.x * elements;

for(int x=0; x < N / ( gridDim.x * blockDim.x * elements); ++x) {
for(int y=0; y < elements; ++y) {
Expand All @@ -51,7 +51,7 @@ void interleaved_full_life_kernel(uint idx, vt* gpu_in, vt* gpu_out, unsigned lo

template<typename vt, typename it, int elements>
__global__
void uncoalesced_reuse_kernel_for_regs(uint idx, vt* gpu_in, vt* gpu_out, unsigned long long N){
void uncoalesced_reuse_kernel_for_regs(unsigned int idx, vt* gpu_in, vt* gpu_out, unsigned long long N){
extern __shared__ int dummy[];
interleaved_full_life_kernel<vt, it, elements>(idx, gpu_in, gpu_out, N);
}
Expand Down Expand Up @@ -79,7 +79,7 @@ struct InterleavedCopyFullLifeContext : public KernelCPUContext<vt, it> {
unsigned long long N;

__device__
void operator() (uint idx){
void operator() (unsigned int idx){
extern __shared__ int dummy[];
interleaved_full_life_kernel<vt, it, elements>(idx, gpu_in, gpu_out, N);
}
Expand Down Expand Up @@ -153,7 +153,7 @@ struct InterleavedCopyFullLifeContext : public KernelCPUContext<vt, it> {
void local_compute_register_usage(bool& pass) override {
// Kernel Registers
struct cudaFuncAttributes funcAttrib;
cudaErrChk(cudaFuncGetAttributes(&funcAttrib, *uncoalesced_reuse_kernel_for_regs<vt,it,elements>), "getting function attributes (for # registers)", pass);
cudaErrChk(cudaFuncGetAttributes(&funcAttrib, uncoalesced_reuse_kernel_for_regs<vt,it,elements>), "getting function attributes (for # registers)", pass);
if(!pass) {
this->okay = false;
return;
Expand Down
Loading

0 comments on commit 2be2725

Please sign in to comment.