From dddc103fdbf1a8a72a9560aec73114cb1fbb9423 Mon Sep 17 00:00:00 2001 From: Connor Baker Date: Thu, 16 Feb 2023 14:50:24 -0500 Subject: [PATCH] cudaFlags: rewrite to capture all architectures and fix #215436 --- .../science/math/mxnet/default.nix | 2 +- .../compilers/cudatoolkit/flags.nix | 179 ++++++++++++------ .../compilers/cudatoolkit/gpus.nix | 110 +++++++++++ .../libraries/science/math/magma/default.nix | 2 +- .../python-modules/jaxlib/default.nix | 2 +- .../python-modules/tensorflow/default.nix | 2 +- .../python-modules/torch/default.nix | 47 ++++- .../python-modules/torchvision/default.nix | 6 +- 8 files changed, 281 insertions(+), 69 deletions(-) create mode 100644 pkgs/development/compilers/cudatoolkit/gpus.nix diff --git a/pkgs/applications/science/math/mxnet/default.nix b/pkgs/applications/science/math/mxnet/default.nix index a5c0ebc85b13b..c1a329c608864 100644 --- a/pkgs/applications/science/math/mxnet/default.nix +++ b/pkgs/applications/science/math/mxnet/default.nix @@ -50,7 +50,7 @@ stdenv.mkDerivation rec { "-DUSE_OLDCMAKECUDA=ON" # see https://github.com/apache/incubator-mxnet/issues/10743 "-DCUDA_ARCH_NAME=All" "-DCUDA_HOST_COMPILER=${cudatoolkit.cc}/bin/cc" - "-DMXNET_CUDA_ARCH=${cudaFlags.cudaCapabilitiesSemiColonString}" + "-DMXNET_CUDA_ARCH=${builtins.concatStringsSep ";" cudaFlags.cudaRealArches}" ] else [ "-DUSE_CUDA=OFF" ]) ++ lib.optional (!cudnnSupport) "-DUSE_CUDNN=OFF"; diff --git a/pkgs/development/compilers/cudatoolkit/flags.nix b/pkgs/development/compilers/cudatoolkit/flags.nix index 24f653ded29be..8e1e54723b2e4 100644 --- a/pkgs/development/compilers/cudatoolkit/flags.nix +++ b/pkgs/development/compilers/cudatoolkit/flags.nix @@ -2,7 +2,18 @@ , lib , cudatoolkit }: + +# Type aliases +# Gpu = { +# archName: String, # e.g., "Hopper" +# computeCapability: String, # e.g., "9.0" +# minCudaVersion: String, # e.g., "11.8" +# maxCudaVersion: String, # e.g., "12.0" +# } + let + inherit (lib) attrsets lists strings trivial versions; + 
cudaVersion = cudatoolkit.version; # Flags are determined based on your CUDA toolkit by default. You may benefit # from improved performance, reduced file size, or greater hardware suppport by @@ -13,66 +24,116 @@ let # # Please see the accompanying documentation or https://github.com/NixOS/nixpkgs/pull/205351 - defaultCudaCapabilities = rec { - cuda9 = [ - "3.0" - "3.5" - "5.0" - "5.2" - "6.0" - "6.1" - "7.0" - ]; - - cuda10 = cuda9 ++ [ - "7.5" - ]; - - cuda11 = [ - "3.5" - "5.0" - "5.2" - "6.0" - "6.1" - "7.0" - "7.5" - "8.0" - "8.6" - ]; - - }; - - cudaMicroarchitectureNames = { - "3" = "Kepler"; - "5" = "Maxwell"; - "6" = "Pascal"; - "7" = "Volta"; - "8" = "Ampere"; - "9" = "Hopper"; - }; - - defaultCudaArchList = defaultCudaCapabilities."cuda${lib.versions.major cudatoolkit.version}"; - cudaRealCapabilities = config.cudaCapabilities or defaultCudaArchList; - capabilitiesForward = "${lib.last cudaRealCapabilities}+PTX"; - - dropDot = ver: builtins.replaceStrings ["."] [""] ver; - - archMapper = feat: map (ver: "${feat}_${dropDot ver}"); - gencodeMapper = feat: map (ver: "-gencode=arch=compute_${dropDot ver},code=${feat}_${dropDot ver}"); - cudaRealArchs = archMapper "sm" cudaRealCapabilities; - cudaPTXArchs = archMapper "compute" cudaRealCapabilities; - cudaArchs = cudaRealArchs ++ [ (lib.last cudaPTXArchs) ]; - - cudaArchNames = lib.unique (map (v: cudaMicroarchitectureNames.${lib.versions.major v}) cudaRealCapabilities); - cudaCapabilities = cudaRealCapabilities ++ lib.optional (config.cudaForwardCompat or true) capabilitiesForward; - cudaGencode = gencodeMapper "sm" cudaRealCapabilities ++ lib.optionals (config.cudaForwardCompat or true) (gencodeMapper "compute" [ (lib.last cudaPTXArchs) ]); - - cudaCapabilitiesCommaString = lib.strings.concatStringsSep "," cudaCapabilities; - cudaCapabilitiesSemiColonString = lib.strings.concatStringsSep ";" cudaCapabilities; - cudaRealCapabilitiesCommaString = lib.strings.concatStringsSep "," cudaRealCapabilities; + # 
gpus :: List Gpu gpus = builtins.import ./gpus.nix; + + # isSupported :: Gpu -> Bool + isSupported = gpu: + let + inherit (gpu) minCudaVersion maxCudaVersion; + lowerBoundSatisfied = strings.versionAtLeast cudaVersion minCudaVersion; + upperBoundSatisfied = !(strings.versionOlder maxCudaVersion cudaVersion); + in + lowerBoundSatisfied && upperBoundSatisfied; + + # supportedGpus :: List Gpu + # GPUs which are supported by the provided CUDA version. + supportedGpus = builtins.filter isSupported gpus; + + # cudaArchNameToVersions :: AttrSet String (List String) + # Maps the name of a GPU architecture to different versions of that architecture. + # For example, "Ampere" maps to [ "8.0" "8.6" "8.7" ]. + cudaArchNameToVersions = + lists.groupBy' + (versions: gpu: versions ++ [ gpu.computeCapability ]) + [ ] + (gpu: gpu.archName) + supportedGpus; + + # cudaArchNames :: List String + # NOTE: It's important that we don't rely on builtins.attrNames cudaArchNameToVersions here; + # otherwise, we'll get the names sorted in alphabetical order. The Nix list we read them + # from is already sorted, so we'll preserve that order here. + cudaArchNames = lists.unique (lists.map (gpu: gpu.archName) supportedGpus); + + # cudaComputeCapabilityToName :: AttrSet String String + # Maps the version of a GPU architecture to the name of that architecture. + # For example, "8.0" maps to "Ampere". + cudaComputeCapabilityToName = builtins.listToAttrs ( + lists.map + (gpu: { + name = gpu.computeCapability; + value = gpu.archName; + }) + supportedGpus + ); + + # cudaComputeCapabilities :: List String + # NOTE: It's important that we don't rely on builtins.attrNames cudaComputeCapabilityToName here; + # otherwise, we'll get the versions sorted in alphabetical order. The Nix list we read them + # from is already sorted, so we'll preserve that order here. + # Use the user-provided list of CUDA capabilities if it's provided. 
+ cudaComputeCapabilities = config.cudaCapabilities + or (lists.map (gpu: gpu.computeCapability) supportedGpus); + + # cudaForwardComputeCapability :: String + cudaForwardComputeCapability = (lists.last cudaComputeCapabilities) + "+PTX"; + + # cudaComputeCapabilitiesAndForward :: List String + # The list of supported CUDA architectures, including the forward compatibility architecture. + # If forward compatibility is disabled, this will be the same as cudaComputeCapabilities. + cudaComputeCapabilitiesAndForward = cudaComputeCapabilities + ++ lists.optional (config.cudaForwardCompat or true) cudaForwardComputeCapability; + + # dropDot :: String -> String + dropDot = ver: builtins.replaceStrings [ "." ] [ "" ] ver; + + # archMapper :: String -> List String -> List String + # Maps a feature across a list of architecture versions to produce a list of architectures. + # For example, "sm" and [ "8.0" "8.6" "8.7" ] produces [ "sm_80" "sm_86" "sm_87" ]. + archMapper = feat: lists.map (computeCapability: "${feat}_${dropDot computeCapability}"); + + # gencodeMapper :: String -> List String -> List String + # Maps a feature across a list of architecture versions to produce a list of gencode arguments. + # For example, "sm" and [ "8.0" "8.6" "8.7" ] produces [ "-gencode=arch=compute_80,code=sm_80" + # "-gencode=arch=compute_86,code=sm_86" "-gencode=arch=compute_87,code=sm_87" ]. + gencodeMapper = feat: lists.map ( + computeCapability: + "-gencode=arch=compute_${dropDot computeCapability},code=${feat}_${dropDot computeCapability}" + ); + + # cudaRealArches :: List String + # The real architectures are physical architectures supported by the CUDA version. + # For example, "sm_80". + cudaRealArches = archMapper "sm" cudaComputeCapabilities; + + # cudaVirtualArches :: List String + # The virtual architectures are typically used for forward compatibility, when trying to support + # an architecture newer than the CUDA version allows. + # For example, "compute_80". 
+ cudaVirtualArches = archMapper "compute" cudaComputeCapabilities; + + # cudaArches :: List String + # By default, build for all supported architectures and forward compatibility via a virtual + # architecture for the newest supported architecture. + cudaArches = cudaRealArches ++ + lists.optional (config.cudaForwardCompat or true) (lists.last cudaVirtualArches); + + # cudaGencode :: List String + # A list of CUDA gencode arguments to pass to NVCC. + cudaGencode = + let + base = gencodeMapper "sm" cudaComputeCapabilities; + forwardCompat = gencodeMapper "compute" [ (lists.last cudaComputeCapabilities) ]; + in + base ++ lists.optionals (config.cudaForwardCompat or true) forwardCompat; in { - inherit cudaArchs cudaArchNames cudaCapabilities cudaCapabilitiesCommaString cudaCapabilitiesSemiColonString - cudaRealCapabilities cudaRealCapabilitiesCommaString cudaGencode cudaRealArchs cudaPTXArchs; + inherit + cudaArchNames + cudaArchNameToVersions cudaComputeCapabilityToName + cudaRealArches cudaVirtualArches cudaArches + cudaGencode; + cudaCapabilities = cudaComputeCapabilitiesAndForward; } diff --git a/pkgs/development/compilers/cudatoolkit/gpus.nix b/pkgs/development/compilers/cudatoolkit/gpus.nix new file mode 100644 index 0000000000000..e938e91297478 --- /dev/null +++ b/pkgs/development/compilers/cudatoolkit/gpus.nix @@ -0,0 +1,110 @@ +[ + { + archName = "Kepler"; + computeCapability = "3.0"; + minCudaVersion = "10.0"; + maxCudaVersion = "10.2"; + } + { + archName = "Kepler"; + computeCapability = "3.2"; + minCudaVersion = "10.0"; + maxCudaVersion = "10.2"; + } + { + archName = "Kepler"; + computeCapability = "3.5"; + minCudaVersion = "10.0"; + maxCudaVersion = "11.8"; + } + { + archName = "Kepler"; + computeCapability = "3.7"; + minCudaVersion = "10.0"; + maxCudaVersion = "11.8"; + } + { + archName = "Maxwell"; + computeCapability = "5.0"; + minCudaVersion = "10.0"; + maxCudaVersion = "12.0"; + } + { + archName = "Maxwell"; + computeCapability = "5.2"; + 
minCudaVersion = "10.0"; + maxCudaVersion = "12.0"; + } + { + archName = "Maxwell"; + computeCapability = "5.3"; + minCudaVersion = "10.0"; + maxCudaVersion = "12.0"; + } + { + archName = "Pascal"; + computeCapability = "6.0"; + minCudaVersion = "10.0"; + maxCudaVersion = "12.0"; + } + { + archName = "Pascal"; + computeCapability = "6.1"; + minCudaVersion = "10.0"; + maxCudaVersion = "12.0"; + } + { + archName = "Pascal"; + computeCapability = "6.2"; + minCudaVersion = "10.0"; + maxCudaVersion = "12.0"; + } + { + archName = "Volta"; + computeCapability = "7.0"; + minCudaVersion = "10.0"; + maxCudaVersion = "12.0"; + } + { + archName = "Volta"; + computeCapability = "7.2"; + minCudaVersion = "10.0"; + maxCudaVersion = "12.0"; + } + { + archName = "Turing"; + computeCapability = "7.5"; + minCudaVersion = "10.0"; + maxCudaVersion = "12.0"; + } + { + archName = "Ampere"; + computeCapability = "8.0"; + minCudaVersion = "11.2"; + maxCudaVersion = "12.0"; + } + { + archName = "Ampere"; + computeCapability = "8.6"; + minCudaVersion = "11.2"; + maxCudaVersion = "12.0"; + } + { + archName = "Ampere"; + computeCapability = "8.7"; + minCudaVersion = "11.5"; + maxCudaVersion = "12.0"; + } + { + archName = "Ada"; + computeCapability = "8.9"; + minCudaVersion = "11.8"; + maxCudaVersion = "12.0"; + } + { + archName = "Hopper"; + computeCapability = "9.0"; + minCudaVersion = "11.8"; + maxCudaVersion = "12.0"; + } +] diff --git a/pkgs/development/libraries/science/math/magma/default.nix b/pkgs/development/libraries/science/math/magma/default.nix index b7223690f4354..f70cbbcff5d82 100644 --- a/pkgs/development/libraries/science/math/magma/default.nix +++ b/pkgs/development/libraries/science/math/magma/default.nix @@ -52,7 +52,7 @@ in stdenv.mkDerivation (finalAttrs: { "-DCMAKE_C_COMPILER=${cudatoolkit.cc}/bin/gcc" "-DCMAKE_CXX_COMPILER=${cudatoolkit.cc}/bin/g++" "-DMAGMA_ENABLE_CUDA=ON" - "-DGPU_TARGET=${builtins.concatStringsSep "," cudaFlags.cudaRealArchs}" + 
"-DGPU_TARGET=${builtins.concatStringsSep "," cudaFlags.cudaRealArches}" ] ++ lib.optionals useROCM [ "-DCMAKE_C_COMPILER=${hip}/bin/hipcc" "-DCMAKE_CXX_COMPILER=${hip}/bin/hipcc" diff --git a/pkgs/development/python-modules/jaxlib/default.nix b/pkgs/development/python-modules/jaxlib/default.nix index baa9a0dc1abfc..2c13defe43838 100644 --- a/pkgs/development/python-modules/jaxlib/default.nix +++ b/pkgs/development/python-modules/jaxlib/default.nix @@ -164,7 +164,7 @@ let build --action_env TF_CUDA_PATHS="${cudatoolkit_joined},${cudnn},${nccl}" build --action_env TF_CUDA_VERSION="${lib.versions.majorMinor cudatoolkit.version}" build --action_env TF_CUDNN_VERSION="${lib.versions.major cudnn.version}" - build:cuda --action_env TF_CUDA_COMPUTE_CAPABILITIES="${cudaFlags.cudaRealCapabilitiesCommaString}" + build:cuda --action_env TF_CUDA_COMPUTE_CAPABILITIES="${builtins.concatStringsSep "," cudaFlags.cudaRealArches}" '' + '' CFG ''; diff --git a/pkgs/development/python-modules/tensorflow/default.nix b/pkgs/development/python-modules/tensorflow/default.nix index d41526552f4b4..f7d920c372217 100644 --- a/pkgs/development/python-modules/tensorflow/default.nix +++ b/pkgs/development/python-modules/tensorflow/default.nix @@ -301,7 +301,7 @@ let TF_CUDA_PATHS = lib.optionalString cudaSupport "${cudatoolkit_joined},${cudnn},${nccl}"; GCC_HOST_COMPILER_PREFIX = lib.optionalString cudaSupport "${cudatoolkit_cc_joined}/bin"; GCC_HOST_COMPILER_PATH = lib.optionalString cudaSupport "${cudatoolkit_cc_joined}/bin/gcc"; - TF_CUDA_COMPUTE_CAPABILITIES = builtins.concatStringsSep "," cudaFlags.cudaRealArchs; + TF_CUDA_COMPUTE_CAPABILITIES = builtins.concatStringsSep "," cudaFlags.cudaRealArches; postPatch = '' # bazel 3.3 should work just as well as bazel 3.1 diff --git a/pkgs/development/python-modules/torch/default.nix b/pkgs/development/python-modules/torch/default.nix index 8fe9e1fed629e..86889be4ee168 100644 --- a/pkgs/development/python-modules/torch/default.nix +++ 
b/pkgs/development/python-modules/torch/default.nix @@ -41,6 +41,7 @@ }: let + inherit (lib) lists strings trivial; inherit (cudaPackages) cudatoolkit cudaFlags cudnn nccl; in @@ -54,6 +55,45 @@ assert !cudaSupport || magma.cudatoolkit == cudatoolkit; let setBool = v: if v then "1" else "0"; + + # https://github.com/pytorch/pytorch/blob/v1.13.1/torch/utils/cpp_extension.py#L1751 + supportedTorchCudaCapabilities = + let + real = ["3.5" "3.7" "5.0" "5.2" "5.3" "6.0" "6.1" "6.2" "7.0" "7.2" "7.5" "8.0" "8.6"]; + ptx = lists.map (x: "${x}+PTX") real; + in + real ++ ptx; + + # NOTE: The lists.subtractLists function is perhaps a bit unintuitive. It subtracts the elements + # of the first list *from* the second list. That means: + # lists.subtractLists a b = b - a + + # For CUDA + supportedCudaCapabilities = lists.intersectLists cudaFlags.cudaCapabilities supportedTorchCudaCapabilities; + unsupportedCudaCapabilities = lists.subtractLists supportedCudaCapabilities cudaFlags.cudaCapabilities; + + # Use trivial.warnIf to print a warning if any unsupported GPU targets are specified. + gpuArchWarner = supported: unsupported: + trivial.throwIf (supported == [ ]) + ( + "No supported GPU targets specified. Requested GPU targets: " + + strings.concatStringsSep ", " unsupported + ) + supported; + + # Create the gpuTargetString. + gpuTargetString = strings.concatStringsSep ";" ( + if gpuTargets != [ ] then + # If gpuTargets is specified, it always takes priority. 
+ gpuTargets + else if cudaSupport then + gpuArchWarner supportedCudaCapabilities unsupportedCudaCapabilities + else if rocmSupport then + hip.gpuTargets + else + throw "No GPU targets specified" + ); + cudatoolkit_joined = symlinkJoin { name = "${cudatoolkit.name}-unsplit"; # nccl is here purely for semantic grouping it could be moved to nativeBuildInputs @@ -146,14 +186,14 @@ in buildPythonPackage rec { ''; preConfigure = lib.optionalString cudaSupport '' - export TORCH_CUDA_ARCH_LIST="${cudaFlags.cudaCapabilitiesSemiColonString}" + export TORCH_CUDA_ARCH_LIST="${gpuTargetString}" export CC=${cudatoolkit.cc}/bin/gcc CXX=${cudatoolkit.cc}/bin/g++ '' + lib.optionalString (cudaSupport && cudnn != null) '' export CUDNN_INCLUDE_DIR=${cudnn}/include '' + lib.optionalString rocmSupport '' export ROCM_PATH=${rocmtoolkit_joined} export ROCM_SOURCE_DIR=${rocmtoolkit_joined} - export PYTORCH_ROCM_ARCH="${lib.strings.concatStringsSep ";" (if gpuTargets == [ ] then hip.gpuTargets else gpuTargets)}" + export PYTORCH_ROCM_ARCH="${gpuTargetString}" export CMAKE_CXX_FLAGS="-I${rocmtoolkit_joined}/include -I${rocmtoolkit_joined}/include/rocblas" python tools/amd_build/build_amd.py ''; @@ -320,7 +360,8 @@ in buildPythonPackage rec { requiredSystemFeatures = [ "big-parallel" ]; passthru = { - inherit cudaSupport cudaPackages; + inherit cudaSupport cudaPackages gpuTargetString; + cudaCapabilities = supportedCudaCapabilities; # At least for 1.10.2 `torch.fft` is unavailable unless BLAS provider is MKL. This attribute allows for easy detection of its availability. 
blasProvider = blas.provider; }; diff --git a/pkgs/development/python-modules/torchvision/default.nix b/pkgs/development/python-modules/torchvision/default.nix index de8852035c909..d36beb6575e09 100644 --- a/pkgs/development/python-modules/torchvision/default.nix +++ b/pkgs/development/python-modules/torchvision/default.nix @@ -15,13 +15,13 @@ }: let - inherit (torch.cudaPackages) cudatoolkit cudaFlags cudnn; + inherit (torch) gpuTargetString; + inherit (torch.cudaPackages) cudatoolkit cudnn; cudatoolkit_joined = symlinkJoin { name = "${cudatoolkit.name}-unsplit"; paths = [ cudatoolkit.out cudatoolkit.lib ]; }; - cudaArchStr = lib.optionalString cudaSupport lib.strings.concatStringsSep ";" torch.cudaArchList; in buildPythonPackage rec { pname = "torchvision"; version = "0.14.1"; @@ -45,7 +45,7 @@ in buildPythonPackage rec { propagatedBuildInputs = [ numpy pillow torch scipy ]; preBuild = lib.optionalString cudaSupport '' - export TORCH_CUDA_ARCH_LIST="${cudaFlags.cudaCapabilitiesSemiColonString}" + export TORCH_CUDA_ARCH_LIST="${gpuTargetString}" export FORCE_CUDA=1 '';