Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Cross-platform-ify #45

Merged
merged 17 commits into from
Nov 2, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 45 additions & 1 deletion .vscode/c_cpp_properties.json
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,51 @@
"cStandard": "gnu17",
"cppStandard": "gnu++14",
"intelliSenseMode": "linux-gcc-x64",
"configurationProvider": "ms-vscode.cmake-tools"
"mergeConfigurations": false,
"browse": {
"path": [
"${workspaceFolder}/**",
"${workspaceFolder}/src",
"${CUDA_INCLUDE_DIRS}",
"/usr/include/linux",
"/apps/cuda-11.2.1/toolkit/targets/x86_64-linux/include",
"/apps/gcc-8.3.0/gcc-8.3.0/lib/gcc/x86_64-pc-linux-gnu/8.3.0/../../../../include/c++/8.3.0",
"/apps/gcc-8.3.0/gcc-8.3.0/lib/gcc/x86_64-pc-linux-gnu/8.3.0/../../../../include/c++/8.3.0/x86_64-pc-linux-gnu",
"/apps/gcc-8.3.0/gcc-8.3.0/lib/gcc/x86_64-pc-linux-gnu/8.3.0/../../../../include/c++/8.3.0/backward",
"/apps/gcc-8.3.0/gcc-8.3.0/lib/gcc/x86_64-pc-linux-gnu/8.3.0/include",
"/usr/local/include",
"/apps/gcc-8.3.0/gcc-8.3.0/include",
"/apps/gcc-8.3.0/gcc-8.3.0/lib/gcc/x86_64-pc-linux-gnu/8.3.0/include-fixed",
"/usr/include"
],
"limitSymbolsToIncludedHeaders": true
}
},
{
"name": "GPU Laptop",
"includePath": [
"${workspaceFolder}/**",
"C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v11.7\\include"
],
"defines": [
"_DEBUG",
"UNICODE",
"_UNICODE"
],
"windowsSdkVersion": "10.0.18362.0",
"compilerPath": "C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v11.7/bin/nvcc.exe",
"cStandard": "c17",
"cppStandard": "c++17",
"intelliSenseMode": "windows-msvc-x64",
"mergeConfigurations": false,
"browse": {
"path": [
"${workspaceFolder}/**",
"C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v11.7\\include"
],
"limitSymbolsToIncludedHeaders": true
},
"configurationProvider": "ms-vscode.cpptools"
}
],
"version": 4
Expand Down
18 changes: 17 additions & 1 deletion .vscode/settings.json
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,23 @@
"__functional_base": "cpp",
"__node_handle": "cpp",
"__memory": "cpp",
"filesystem": "cpp"
"filesystem": "cpp",
"xstring": "cpp",
"stack": "cpp",
"xfacet": "cpp",
"xhash": "cpp",
"xiosbase": "cpp",
"xlocale": "cpp",
"xlocbuf": "cpp",
"xlocinfo": "cpp",
"xlocmes": "cpp",
"xlocmon": "cpp",
"xlocnum": "cpp",
"xloctime": "cpp",
"xmemory": "cpp",
"xstddef": "cpp",
"xtr1common": "cpp",
"xutility": "cpp"
},
"cSpell.words": [
"Spmv",
Expand Down
26 changes: 22 additions & 4 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -18,25 +18,43 @@ include_directories(${CMAKE_SOURCE_DIR}/src/utilities/matrices)
# add_executable(driver src/driver.cu)
add_executable(main src/main.cu)
set_property(TARGET main PROPERTY CXX_STANDARD 17)
set_property(TARGET main PROPERTY CUDA_ARCHITECTURES 35 52 72 80) #35-K20m, 52-Quadro M2000, 72-V100, 80-A100
set_property(TARGET main PROPERTY CUDA_ARCHITECTURES 52 ) # 52-Quadro M2000 or GTX 980M (personal laptop)
# set_property(TARGET main PROPERTY CUDA_ARCHITECTURES 35 52 72 80) #35-K20m, 52-Quadro M2000, 72-V100, 80-A100
# set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --keep -Xptxas -dlcm=cg")
if (MSVC)
target_compile_options(main PRIVATE $<$<COMPILE_LANGUAGE:CUDA>: -Xcompiler "/bigobj" >)
# list(APPEND CUDA_NVCC_FLAGS "--compiler-options;/bigobj")
# else ()
# target_compile_options(main PRIVATE -Wa,-mbig-obj)
endif ()
target_compile_options(main PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:
--keep
-Xptxas -dlcm=cg
-Xcudafe "--diag_suppress=unsigned_compare_with_zero"
>)
>)

if (NOT MSVC)
add_executable(spmv src/spmv.cu)
set_property(TARGET spmv PROPERTY CXX_STANDARD 17)
# FIX: the original line set this property on target `main` (copy/paste slip from the
# main-target section above); it must apply to `spmv`, otherwise spmv silently keeps
# the default CUDA architectures instead of the intended one.
set_property(TARGET spmv PROPERTY CUDA_ARCHITECTURES 52) # 52-Quadro M2000 or GTX 980M (personal laptop)
# set_property(TARGET spmv PROPERTY CUDA_ARCHITECTURES 35 52 72 80) #35-K20m, 52-Quadro M2000, 72-V100, 80-A100
# set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr --maxrregcount=255") # -dlcm=cg --expt-relaxed-constexpr
target_compile_options(spmv PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:
--expt-relaxed-constexpr
--maxrregcount=255
--keep
-Xptxas -dlcm=cg
-Xcudafe "--diag_suppress=set_but_not_used --diag_suppress=declared_but_not_referenced"
>)

# NOTE(review): this inner `if (MSVC)` can never be true inside the enclosing
# `if (NOT MSVC)` block — the /bigobj option below is dead code. Kept for parity
# with the main-target section; either hoist the spmv target out of the outer
# guard or delete this branch.
if (MSVC)
target_compile_options(spmv PRIVATE $<$<COMPILE_LANGUAGE:CUDA>: -Xcompiler "/bigobj" >)
# else ()
# target_compile_options(spmv PRIVATE -Wa,-mbig-obj)
endif ()

endif ()

# message(STATUS "python exec: ${Python_EXECUTABLE}")

# add_custom_target(
Expand Down
76 changes: 76 additions & 0 deletions src/clock_gettime_windows.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
/**
* @file clock_gettime_windows.h
* @author Dalton Winans-Pruitt ([email protected])
* @brief Uses a SO post about porting the clock_gettime() function
* @version 0.1
* @date 2022-09-17
*
* Based on https://stackoverflow.com/questions/5404277/porting-clock-gettime-to-windows,
* which seems to be the established answer to this problem.
* Also, this is only relevant due to trying to run on Windows.
*
* @copyright Copyright (c) 2022
*
*/
#include <windows.h>
#include <time.h>

/**
 * @brief Returns the Unix epoch (1970-01-01 00:00:00 UTC) expressed as a
 *        Windows FILETIME tick count (100-ns intervals since 1601-01-01).
 *
 * Used by the GetSystemTimeAsFileTime() fallback path in clock_gettime()
 * to rebase system time from the Windows epoch to the Unix epoch.
 *
 * FIX: marked `inline` — this function is *defined* in a header, so without
 * inline every translation unit that includes the header emits an external
 * definition, causing multiple-definition link errors.
 */
inline LARGE_INTEGER
getFILETIMEoffset()
{
    SYSTEMTIME s = {0}; // zero-init: SystemTimeToFileTime ignores wDayOfWeek, but don't leave it indeterminate
    FILETIME f;
    LARGE_INTEGER t;

    s.wYear = 1970;
    s.wMonth = 1;
    s.wDay = 1;
    s.wHour = 0;
    s.wMinute = 0;
    s.wSecond = 0;
    s.wMilliseconds = 0;
    SystemTimeToFileTime(&s, &f);
    // Pack the two 32-bit halves of the FILETIME into one 64-bit tick count.
    t.QuadPart = f.dwHighDateTime;
    t.QuadPart <<= 32;
    t.QuadPart |= f.dwLowDateTime;
    return (t);
}

int
clock_gettime(int X, struct timespec *tv)
{
LARGE_INTEGER t;
FILETIME f;
double microseconds;
static LARGE_INTEGER offset;
static double frequencyToMicroseconds;
static int initialized = 0;
static BOOL usePerformanceCounter = 0;

if (!initialized) {
LARGE_INTEGER performanceFrequency;
initialized = 1;
usePerformanceCounter = QueryPerformanceFrequency(&performanceFrequency);
if (usePerformanceCounter) {
QueryPerformanceCounter(&offset);
frequencyToMicroseconds = (double)performanceFrequency.QuadPart / 1000000.;
} else {
offset = getFILETIMEoffset();
frequencyToMicroseconds = 10.;
}
}
if (usePerformanceCounter) QueryPerformanceCounter(&t);
else {
GetSystemTimeAsFileTime(&f);
t.QuadPart = f.dwHighDateTime;
t.QuadPart <<= 32;
t.QuadPart |= f.dwLowDateTime;
}

t.QuadPart -= offset.QuadPart;
microseconds = (double)t.QuadPart / frequencyToMicroseconds;
t.QuadPart = microseconds;
tv->tv_sec = t.QuadPart / 1000000;
tv->tv_nsec = t.QuadPart % 1000000 * 1000;
return (0);
}
2 changes: 1 addition & 1 deletion src/device_props.h
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ class device_context {
* 1000 / (1024*1024*1024); // 1000 cycles/kcycle * 1 GB/1024^3 B
std::streamsize ss = cout.precision();
cout << "Device '" << props_.name << "' : Max Bandwidth = " << std::fixed << std::setprecision(1) << theoretical_bw_ << " GB/s" << endl;
cout << std::setprecision(ss) << resetiosflags( std::ios::fixed | std::ios::showpoint );
cout << std::setprecision(ss) << std::resetiosflags( std::ios::fixed | std::ios::showpoint );
}
cudaPrintLastError();
return pass;
Expand Down
6 changes: 5 additions & 1 deletion src/indices_generation.h
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
// Index array generation functions
#pragma once
#include <iostream>
#include <random>
#include <algorithm>
#include <vector>
#include <string>
Expand Down Expand Up @@ -258,8 +259,11 @@ int random_indices(it* indxs, unsigned long long N, int block_size, int shuffle_
for(int i=0; i < N; i++) {
indxs[i] = i;
}
std::random_device rd;
std::mt19937 generator(rd());

for(int i=0; i < N; i+=shuffle_size) {
std::random_shuffle(indxs+i, indxs + i + shuffle_size);
std::shuffle(indxs+i, indxs + i + shuffle_size, generator);
}

for(int i=0; i < N; i++) {
Expand Down
6 changes: 3 additions & 3 deletions src/kernel_context.cu
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ using std::vector;
template<typename kernel_ctx_t>
__global__
void compute_kernel(unsigned long long N, kernel_ctx_t ctx) {
uint idx = blockIdx.x * blockDim.x + threadIdx.x;
unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
if(idx >= N) return;
ctx(idx);
}
Expand Down Expand Up @@ -94,8 +94,8 @@ float local_execute_template(int N, int Gsz, int Bsz, int shdmem_usage, device_c
template<typename vt, typename it>
struct KernelCPUContext {
public:
uint vt_size = sizeof(vt);
uint it_size = sizeof(it);
unsigned int vt_size = sizeof(vt);
unsigned int it_size = sizeof(it);
typedef it IT;
string name;
unsigned long long N=0;
Expand Down
8 changes: 4 additions & 4 deletions src/kernels/burst_mode/interleaved_copy.cu
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ using std::vector;

template<typename vt, typename it, int block_life, int elements>
__forceinline__ __host__ __device__
void interleaved_kernel(uint idx, vt* gpu_in, vt* gpu_out, unsigned long long N){
void interleaved_kernel(unsigned int idx, vt* gpu_in, vt* gpu_out, unsigned long long N){

unsigned long long b_idx = blockIdx.x;
unsigned long long t_idx = threadIdx.x;
Expand All @@ -47,7 +47,7 @@ void interleaved_kernel(uint idx, vt* gpu_in, vt* gpu_out, unsigned long long N)

template<typename vt, typename it, int block_life, int elements>
__global__
void uncoalesced_reuse_kernel_for_regs(uint idx, vt* gpu_in, vt* gpu_out, unsigned long long N){
void uncoalesced_reuse_kernel_for_regs(unsigned int idx, vt* gpu_in, vt* gpu_out, unsigned long long N){
extern __shared__ int dummy[];
interleaved_kernel<vt, it, block_life, elements>(idx, gpu_in, gpu_out, N);
}
Expand Down Expand Up @@ -75,7 +75,7 @@ struct InterleavedCopyContext : public KernelCPUContext<vt, it> {
unsigned long long N;

__device__
void operator() (uint idx){
void operator() (unsigned int idx){
extern __shared__ int dummy[];
interleaved_kernel<vt, it, block_life, elements>(idx, gpu_in, gpu_out, N);
}
Expand Down Expand Up @@ -146,7 +146,7 @@ struct InterleavedCopyContext : public KernelCPUContext<vt, it> {
void local_compute_register_usage(bool& pass) override {
// Kernel Registers
struct cudaFuncAttributes funcAttrib;
cudaErrChk(cudaFuncGetAttributes(&funcAttrib, *uncoalesced_reuse_kernel_for_regs<vt,it,block_life,elements>), "getting function attributes (for # registers)", pass);
cudaErrChk(cudaFuncGetAttributes(&funcAttrib, uncoalesced_reuse_kernel_for_regs<vt,it,block_life,elements>), "getting function attributes (for # registers)", pass);
if(!pass) {
this->okay = false;
return;
Expand Down
10 changes: 5 additions & 5 deletions src/kernels/burst_mode/interleaved_copy_full_life.cu
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ using std::vector;

template<typename vt, typename it, int elements>
__forceinline__ __host__ __device__
void interleaved_full_life_kernel(uint idx, vt* gpu_in, vt* gpu_out, unsigned long long N){
void interleaved_full_life_kernel(unsigned int idx, vt* gpu_in, vt* gpu_out, unsigned long long N){

// unsigned long long b_idx = blockIdx.x;
// unsigned long long t_idx = threadIdx.x;
Expand All @@ -37,7 +37,7 @@ void interleaved_full_life_kernel(uint idx, vt* gpu_in, vt* gpu_out, unsigned lo

// int block_life = N / gridDim.x / elements;
unsigned long long start_idx = blockIdx.x * blockDim.x * elements + threadIdx.x;
uint cycle_offset = gridDim.x * blockDim.x * elements;
unsigned int cycle_offset = gridDim.x * blockDim.x * elements;

for(int x=0; x < N / ( gridDim.x * blockDim.x * elements); ++x) {
for(int y=0; y < elements; ++y) {
Expand All @@ -51,7 +51,7 @@ void interleaved_full_life_kernel(uint idx, vt* gpu_in, vt* gpu_out, unsigned lo

template<typename vt, typename it, int elements>
__global__
void uncoalesced_reuse_kernel_for_regs(uint idx, vt* gpu_in, vt* gpu_out, unsigned long long N){
void uncoalesced_reuse_kernel_for_regs(unsigned int idx, vt* gpu_in, vt* gpu_out, unsigned long long N){
extern __shared__ int dummy[];
interleaved_full_life_kernel<vt, it, elements>(idx, gpu_in, gpu_out, N);
}
Expand Down Expand Up @@ -79,7 +79,7 @@ struct InterleavedCopyFullLifeContext : public KernelCPUContext<vt, it> {
unsigned long long N;

__device__
void operator() (uint idx){
void operator() (unsigned int idx){
extern __shared__ int dummy[];
interleaved_full_life_kernel<vt, it, elements>(idx, gpu_in, gpu_out, N);
}
Expand Down Expand Up @@ -153,7 +153,7 @@ struct InterleavedCopyFullLifeContext : public KernelCPUContext<vt, it> {
void local_compute_register_usage(bool& pass) override {
// Kernel Registers
struct cudaFuncAttributes funcAttrib;
cudaErrChk(cudaFuncGetAttributes(&funcAttrib, *uncoalesced_reuse_kernel_for_regs<vt,it,elements>), "getting function attributes (for # registers)", pass);
cudaErrChk(cudaFuncGetAttributes(&funcAttrib, uncoalesced_reuse_kernel_for_regs<vt,it,elements>), "getting function attributes (for # registers)", pass);
if(!pass) {
this->okay = false;
return;
Expand Down
Loading