Merge pull request #45 from daltonrpruitt/cross-platform-ify

Cross-platform-ify
daltonrpruitt · Nov 2, 2022 · 2be2725 · 2be2725
2 parents 84c9462 + 5657d10
commit 2be2725
Show file tree

Hide file tree

Showing 32 changed files with 397 additions and 198 deletions.
diff --git a/.vscode/c_cpp_properties.json b/.vscode/c_cpp_properties.json
@@ -22,7 +22,51 @@
             "cStandard": "gnu17",
             "cppStandard": "gnu++14",
             "intelliSenseMode": "linux-gcc-x64",
-            "configurationProvider": "ms-vscode.cmake-tools"
+            "mergeConfigurations": false,
+            "browse": {
+                "path": [
+                    "${workspaceFolder}/**",
+                    "${workspaceFolder}/src",
+                    "${CUDA_INCLUDE_DIRS}",
+                    "/usr/include/linux",
+                    "/apps/cuda-11.2.1/toolkit/targets/x86_64-linux/include",
+                    "/apps/gcc-8.3.0/gcc-8.3.0/lib/gcc/x86_64-pc-linux-gnu/8.3.0/../../../../include/c++/8.3.0",
+                    "/apps/gcc-8.3.0/gcc-8.3.0/lib/gcc/x86_64-pc-linux-gnu/8.3.0/../../../../include/c++/8.3.0/x86_64-pc-linux-gnu",
+                    "/apps/gcc-8.3.0/gcc-8.3.0/lib/gcc/x86_64-pc-linux-gnu/8.3.0/../../../../include/c++/8.3.0/backward",
+                    "/apps/gcc-8.3.0/gcc-8.3.0/lib/gcc/x86_64-pc-linux-gnu/8.3.0/include",
+                    "/usr/local/include",
+                    "/apps/gcc-8.3.0/gcc-8.3.0/include",
+                    "/apps/gcc-8.3.0/gcc-8.3.0/lib/gcc/x86_64-pc-linux-gnu/8.3.0/include-fixed",
+                    "/usr/include"
+                ],
+                "limitSymbolsToIncludedHeaders": true
+            }
+        },
+        {
+            "name": "GPU Laptop",
+            "includePath": [
+                "${workspaceFolder}/**",
+                "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v11.7\\include"
+            ],
+            "defines": [
+                "_DEBUG",
+                "UNICODE",
+                "_UNICODE"
+            ],
+            "windowsSdkVersion": "10.0.18362.0",
+            "compilerPath": "C:/Program Files/NVIDIA GPU Computing Toolkit/CUDA/v11.7/bin/nvcc.exe",
+            "cStandard": "c17",
+            "cppStandard": "c++17",
+            "intelliSenseMode": "windows-msvc-x64",
+            "mergeConfigurations": false,
+            "browse": {
+                "path": [
+                    "${workspaceFolder}/**",
+                    "C:\\Program Files\\NVIDIA GPU Computing Toolkit\\CUDA\\v11.7\\include"
+                ],
+                "limitSymbolsToIncludedHeaders": true
+            },
+            "configurationProvider": "ms-vscode.cpptools"
         }
     ],
     "version": 4

diff --git a/.vscode/settings.json b/.vscode/settings.json
@@ -77,7 +77,23 @@
         "__functional_base": "cpp",
         "__node_handle": "cpp",
         "__memory": "cpp",
-        "filesystem": "cpp"
+        "filesystem": "cpp",
+        "xstring": "cpp",
+        "stack": "cpp",
+        "xfacet": "cpp",
+        "xhash": "cpp",
+        "xiosbase": "cpp",
+        "xlocale": "cpp",
+        "xlocbuf": "cpp",
+        "xlocinfo": "cpp",
+        "xlocmes": "cpp",
+        "xlocmon": "cpp",
+        "xlocnum": "cpp",
+        "xloctime": "cpp",
+        "xmemory": "cpp",
+        "xstddef": "cpp",
+        "xtr1common": "cpp",
+        "xutility": "cpp"
     },
     "cSpell.words": [
         "Spmv",

diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -18,25 +18,43 @@ include_directories(${CMAKE_SOURCE_DIR}/src/utilities/matrices)
 # add_executable(driver src/driver.cu)
 add_executable(main src/main.cu)
 set_property(TARGET main PROPERTY CXX_STANDARD 17)
-set_property(TARGET main PROPERTY CUDA_ARCHITECTURES 35 52 72 80) #35-K20m, 52-Quadro M2000, 72-V100, 80-A100  
+set_property(TARGET main PROPERTY CUDA_ARCHITECTURES  52 ) # 52-Quadro M2000 or GTX 980M (personal laptop)
+# set_property(TARGET main PROPERTY CUDA_ARCHITECTURES 35 52 72 80) #35-K20m, 52-Quadro M2000, 72-V100, 80-A100  
 # set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --keep -Xptxas -dlcm=cg")
+if (MSVC)
+    target_compile_options(main PRIVATE $<$<COMPILE_LANGUAGE:CUDA>: -Xcompiler "/bigobj" >)
+    # list(APPEND CUDA_NVCC_FLAGS  "--compiler-options;/bigobj")
+# else ()
+#   target_compile_options(main PRIVATE -Wa,-mbig-obj)
+endif ()
 target_compile_options(main PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:
                         --keep 
                         -Xptxas -dlcm=cg
                         -Xcudafe "--diag_suppress=unsigned_compare_with_zero"
-                        >)
+>)
 
+if (NOT MSVC)
 add_executable(spmv src/spmv.cu)
 set_property(TARGET spmv PROPERTY CXX_STANDARD 17)
-set_property(TARGET spmv PROPERTY CUDA_ARCHITECTURES 35 52 72 80) #35-K20m, 52-Quadro M2000, 72-V100, 80-A100  
+set_property(TARGET main PROPERTY CUDA_ARCHITECTURES  52 ) # 52-Quadro M2000 or GTX 980M (personal laptop)
+# set_property(TARGET spmv PROPERTY CUDA_ARCHITECTURES 35 52 72 80) #35-K20m, 52-Quadro M2000, 72-V100, 80-A100  
 # set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} --expt-relaxed-constexpr --maxrregcount=255") #  -dlcm=cg  --expt-relaxed-constexpr
 target_compile_options(spmv PRIVATE $<$<COMPILE_LANGUAGE:CUDA>:
                         --expt-relaxed-constexpr 
                         --maxrregcount=255
                         --keep 
                         -Xptxas -dlcm=cg
                         -Xcudafe "--diag_suppress=set_but_not_used --diag_suppress=declared_but_not_referenced"
-                       >)
+>)
+
+if (MSVC)
+    target_compile_options(spmv PRIVATE $<$<COMPILE_LANGUAGE:CUDA>: -Xcompiler "/bigobj" >)
+# else ()
+#   target_compile_options(spmv PRIVATE -Wa,-mbig-obj)
+endif ()
+
+endif ()
+
 # message(STATUS "python exec: ${Python_EXECUTABLE}")
 
 # add_custom_target(

diff --git a/src/clock_gettime_windows.h b/src/clock_gettime_windows.h
@@ -0,0 +1,76 @@
+/**
+ * @file clock_gettime_windows.h
+ * @author Dalton Winans-Pruitt ([email protected])
+ * @brief Uses a SO post about porting the clock_gettime() function
+ * @version 0.1
+ * @date 2022-09-17
+ * 
+ * Based on https://stackoverflow.com/questions/5404277/porting-clock-gettime-to-windows, 
+ * which seems to be the established answer to this problem. 
+ * Also, this is only relevant due to trying to run on Windows.
+ * 
+ * @copyright Copyright (c) 2022
+ * 
+ */
+#include <windows.h>
+#include <time.h>
+
+LARGE_INTEGER
+getFILETIMEoffset()
+{
+    SYSTEMTIME s;
+    FILETIME f;
+    LARGE_INTEGER t;
+
+    s.wYear = 1970;
+    s.wMonth = 1;
+    s.wDay = 1;
+    s.wHour = 0;
+    s.wMinute = 0;
+    s.wSecond = 0;
+    s.wMilliseconds = 0;
+    SystemTimeToFileTime(&s, &f);
+    t.QuadPart = f.dwHighDateTime;
+    t.QuadPart <<= 32;
+    t.QuadPart |= f.dwLowDateTime;
+    return (t);
+}
+
+int
+clock_gettime(int X, struct timespec *tv)
+{
+    LARGE_INTEGER           t;
+    FILETIME            f;
+    double                  microseconds;
+    static LARGE_INTEGER    offset;
+    static double           frequencyToMicroseconds;
+    static int              initialized = 0;
+    static BOOL             usePerformanceCounter = 0;
+
+    if (!initialized) {
+        LARGE_INTEGER performanceFrequency;
+        initialized = 1;
+        usePerformanceCounter = QueryPerformanceFrequency(&performanceFrequency);
+        if (usePerformanceCounter) {
+            QueryPerformanceCounter(&offset);
+            frequencyToMicroseconds = (double)performanceFrequency.QuadPart / 1000000.;
+        } else {
+            offset = getFILETIMEoffset();
+            frequencyToMicroseconds = 10.;
+        }
+    }
+    if (usePerformanceCounter) QueryPerformanceCounter(&t);
+    else {
+        GetSystemTimeAsFileTime(&f);
+        t.QuadPart = f.dwHighDateTime;
+        t.QuadPart <<= 32;
+        t.QuadPart |= f.dwLowDateTime;
+    }
+
+    t.QuadPart -= offset.QuadPart;
+    microseconds = (double)t.QuadPart / frequencyToMicroseconds;
+    t.QuadPart = microseconds;
+    tv->tv_sec = t.QuadPart / 1000000;
+    tv->tv_nsec = t.QuadPart % 1000000 * 1000;
+    return (0);
+}
diff --git a/src/device_props.h b/src/device_props.h
@@ -43,7 +43,7 @@ class device_context {
                             * 1000 / (1024*1024*1024);                     // 1000 cycles/kcycle * 1 GB/1024^3 B 
 			std::streamsize ss = cout.precision();
             cout << "Device '" << props_.name << "' : Max Bandwidth = " << std::fixed << std::setprecision(1) << theoretical_bw_ << " GB/s" << endl;
-			cout << std::setprecision(ss) << resetiosflags( std::ios::fixed | std::ios::showpoint );
+			cout << std::setprecision(ss) << std::resetiosflags( std::ios::fixed | std::ios::showpoint );
         }
         cudaPrintLastError();
         return pass;

diff --git a/src/indices_generation.h b/src/indices_generation.h
@@ -1,6 +1,7 @@
 // Index array generation functions
 #pragma once
 #include <iostream>
+#include <random>
 #include <algorithm>
 #include <vector>
 #include <string>
@@ -258,8 +259,11 @@ int random_indices(it* indxs, unsigned long long N, int block_size, int shuffle_
     for(int i=0; i < N; i++) {
         indxs[i] = i;                                                                    
     }
+    std::random_device rd;
+    std::mt19937 generator(rd());
+
     for(int i=0; i < N; i+=shuffle_size) {
-        std::random_shuffle(indxs+i, indxs + i + shuffle_size);
+        std::shuffle(indxs+i, indxs + i + shuffle_size, generator);
     }
 
     for(int i=0; i < N; i++) {

diff --git a/src/kernel_context.cu b/src/kernel_context.cu
@@ -33,7 +33,7 @@ using std::vector;
 template<typename kernel_ctx_t>
 __global__
 void compute_kernel(unsigned long long N, kernel_ctx_t ctx) {
-    uint idx = blockIdx.x * blockDim.x + threadIdx.x;
+    unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
     if(idx >= N) return;
     ctx(idx);
 }
@@ -94,8 +94,8 @@ float local_execute_template(int N, int Gsz, int Bsz, int shdmem_usage, device_c
 template<typename vt, typename it>
 struct KernelCPUContext {
     public:
-        uint vt_size = sizeof(vt);
-        uint it_size = sizeof(it);
+        unsigned int vt_size = sizeof(vt);
+        unsigned int it_size = sizeof(it);
         typedef it IT;
         string name;
         unsigned long long N=0;

diff --git a/src/kernels/burst_mode/interleaved_copy.cu b/src/kernels/burst_mode/interleaved_copy.cu
@@ -27,7 +27,7 @@ using std::vector;
 
 template<typename vt, typename it, int block_life, int elements>
 __forceinline__ __host__ __device__        
-void interleaved_kernel(uint idx, vt* gpu_in, vt* gpu_out, unsigned long long N){
+void interleaved_kernel(unsigned int idx, vt* gpu_in, vt* gpu_out, unsigned long long N){
 
     unsigned long long b_idx = blockIdx.x;
     unsigned long long t_idx = threadIdx.x;
@@ -47,7 +47,7 @@ void interleaved_kernel(uint idx, vt* gpu_in, vt* gpu_out, unsigned long long N)
 
 template<typename vt, typename it, int block_life, int elements>
 __global__        
-void uncoalesced_reuse_kernel_for_regs(uint idx, vt* gpu_in, vt* gpu_out, unsigned long long N){
+void uncoalesced_reuse_kernel_for_regs(unsigned int idx, vt* gpu_in, vt* gpu_out, unsigned long long N){
         extern __shared__ int dummy[];
         interleaved_kernel<vt, it, block_life, elements>(idx, gpu_in, gpu_out, N);
 }
@@ -75,7 +75,7 @@ struct InterleavedCopyContext : public KernelCPUContext<vt, it> {
             unsigned long long N;
 
             __device__        
-            void operator() (uint idx){
+            void operator() (unsigned int idx){
                 extern __shared__ int dummy[];
                 interleaved_kernel<vt, it, block_life, elements>(idx, gpu_in, gpu_out, N);
             }
@@ -146,7 +146,7 @@ struct InterleavedCopyContext : public KernelCPUContext<vt, it> {
         void local_compute_register_usage(bool& pass) override {   
             // Kernel Registers 
             struct cudaFuncAttributes funcAttrib;
-            cudaErrChk(cudaFuncGetAttributes(&funcAttrib, *uncoalesced_reuse_kernel_for_regs<vt,it,block_life,elements>), "getting function attributes (for # registers)", pass);
+            cudaErrChk(cudaFuncGetAttributes(&funcAttrib, uncoalesced_reuse_kernel_for_regs<vt,it,block_life,elements>), "getting function attributes (for # registers)", pass);
             if(!pass) {
                 this->okay = false; 
                 return;

diff --git a/src/kernels/burst_mode/interleaved_copy_full_life.cu b/src/kernels/burst_mode/interleaved_copy_full_life.cu
@@ -28,7 +28,7 @@ using std::vector;
 
 template<typename vt, typename it, int elements>
 __forceinline__ __host__ __device__        
-void interleaved_full_life_kernel(uint idx, vt* gpu_in, vt* gpu_out, unsigned long long N){
+void interleaved_full_life_kernel(unsigned int idx, vt* gpu_in, vt* gpu_out, unsigned long long N){
 
     // unsigned long long b_idx = blockIdx.x;
     // unsigned long long t_idx = threadIdx.x;
@@ -37,7 +37,7 @@ void interleaved_full_life_kernel(uint idx, vt* gpu_in, vt* gpu_out, unsigned lo
 
     // int block_life = N / gridDim.x / elements; 
     unsigned long long start_idx = blockIdx.x * blockDim.x * elements + threadIdx.x;
-    uint cycle_offset = gridDim.x * blockDim.x * elements;
+    unsigned int cycle_offset = gridDim.x * blockDim.x * elements;
 
     for(int x=0; x < N / ( gridDim.x * blockDim.x * elements); ++x) {
         for(int y=0; y < elements; ++y) {
@@ -51,7 +51,7 @@ void interleaved_full_life_kernel(uint idx, vt* gpu_in, vt* gpu_out, unsigned lo
 
 template<typename vt, typename it, int elements>
 __global__        
-void uncoalesced_reuse_kernel_for_regs(uint idx, vt* gpu_in, vt* gpu_out, unsigned long long N){
+void uncoalesced_reuse_kernel_for_regs(unsigned int idx, vt* gpu_in, vt* gpu_out, unsigned long long N){
         extern __shared__ int dummy[];
         interleaved_full_life_kernel<vt, it, elements>(idx, gpu_in, gpu_out, N);
 }
@@ -79,7 +79,7 @@ struct InterleavedCopyFullLifeContext : public KernelCPUContext<vt, it> {
             unsigned long long N;
 
             __device__        
-            void operator() (uint idx){
+            void operator() (unsigned int idx){
                 extern __shared__ int dummy[];
                 interleaved_full_life_kernel<vt, it, elements>(idx, gpu_in, gpu_out, N);
             }
@@ -153,7 +153,7 @@ struct InterleavedCopyFullLifeContext : public KernelCPUContext<vt, it> {
         void local_compute_register_usage(bool& pass) override {   
             // Kernel Registers 
             struct cudaFuncAttributes funcAttrib;
-            cudaErrChk(cudaFuncGetAttributes(&funcAttrib, *uncoalesced_reuse_kernel_for_regs<vt,it,elements>), "getting function attributes (for # registers)", pass);
+            cudaErrChk(cudaFuncGetAttributes(&funcAttrib, uncoalesced_reuse_kernel_for_regs<vt,it,elements>), "getting function attributes (for # registers)", pass);
             if(!pass) {
                 this->okay = false; 
                 return;