use multiple cores in scan1_pg.R routine scan1_pg_clean (#234)

* modify scan1_pg_clean to branch by chr, pheno and pos * modified scan1_pg_clean to better divide positions relative to phenos when using cores
rqtl · Jun 20, 2024 · f5d2fae · f5d2fae
1 parent 2d39f6f
commit f5d2fae
Showing 1 changed file with 47 additions and 19 deletions.
diff --git a/R/scan1_pg.R b/R/scan1_pg.R
@@ -314,14 +314,53 @@ scan1_pg_clean <-
         else no_x <- FALSE
     } else loco <- TRUE
 
-    batches <- list(chr=rep(seq_len(length(genoprobs)), ncol(pheno)),
-                    phecol=rep(seq_len(ncol(pheno)), each=length(genoprobs)))
-
+    # This creates a batch for every chr, pheno and set of positions.
+    # If cores == 1, use all positions on each chr (original code).
+    # if cores != 1, then
+    #   for each chr, divide positions into contiguous intervals
+    #   number of intervals across chr and phenos =~ cores
+    npos_by_chr <- dim(genoprobs)[3,]
+    # Genoprob names not retained for qtl2fst
+    names(npos_by_chr) <- names(genoprobs)
+    if(n_cores(cores) == 1) {
+      # Add beginning and end pos of each chr to original batches list.
+      batches <- list(chr=rep(seq_len(length(genoprobs)), ncol(pheno)),
+                      pos1 = rep(1, length(genoprobs) * ncol(pheno)),
+                      pos2 = rep(npos_by_chr, ncol(pheno)),
+                      phecol=rep(seq_len(ncol(pheno)), each=length(genoprobs)))
+    } else {
+      # Divide each chr into intervals pos1:pos2 contiguous from 1 to length of chr.
+      tmpfn <- function(x) {
+        npos <- ceiling(x * ncol(pheno) * length(npos_by_chr) / n_cores(cores))
+        pos1 <- seq(1, x, by = npos)
+        pos2 <- c(pos1[-1] - 1, x)
+        list(pos1 = pos1, pos2 = pos2)
+      }
+      npos <- lapply(npos_by_chr, tmpfn)
+
+      # Collapse pos1 and pos2 into vectors.
+      pos1 <- pos2 <- NULL
+      for(i in names(npos)) {
+        pos1 <- c(pos1, npos[[i]]$pos1)
+        pos2 <- c(pos2, npos[[i]]$pos2)
+      }
+      # npos = number of intervals by chr
+      npos <- sapply(npos, function(x) length(x$pos1))
+
+      # Batches list
+      batches <- list(chr=rep(rep(seq_len(length(genoprobs)),npos), ncol(pheno)),
+                      pos1=rep(pos1, ncol(pheno)),
+                      pos2=rep(pos2, ncol(pheno)),
+                      phecol=rep(seq_len(ncol(pheno)), each=sum(npos)))
+    }
+
     # function that does the work
     by_batch_func <-
         function(batch)
         {
             chr <- batches$chr[batch]
+            pos1 <- batches$pos1[batch]
+            pos2 <- batches$pos2[batch]
             phecol <- batches$phecol[batch]
 
             if(loco) {
@@ -342,11 +381,12 @@ scan1_pg_clean <-
             # subset the genotype probabilities: drop cols with all 0s, plus the first column
             Xcol2drop <- genoprob_Xcol2drop[[chr]]
             if(length(Xcol2drop) > 0) {
-                pr <- genoprobs[[chr]][ind2keep,-Xcol2drop,,drop=FALSE]
+                pr <- genoprobs[[chr]][ind2keep,-Xcol2drop,pos1:pos2,drop=FALSE]
                 pr <- pr[,-1,,drop=FALSE]
             }
-            else
-                pr <- genoprobs[[chr]][ind2keep,-1,,drop=FALSE]
+            else {
+                pr <- genoprobs[[chr]][ind2keep,-1,pos1:pos2,drop=FALSE]
+            }
             # weight the probabilities
             pr <- weight_array(pr, weights)
 
@@ -384,19 +424,7 @@ scan1_pg_clean <-
     if(any(result_is_null))
         stop("cluster problem: returned ", sum(result_is_null), " NULLs.")
 
-    npos_by_chr <- dim(genoprobs)[3,]
-    totpos <- sum(npos_by_chr)
-    pos_index <- split(seq_len(totpos), rep(seq_len(length(genoprobs)), npos_by_chr))
-
-    # to contain the results
-    result <- matrix(nrow=totpos, ncol=ncol(pheno))
-    for(batch in seq_along(batches$chr)) {
-        chr <- batches$chr[batch]
-        phecol <- batches$phecol[batch]
-        result[pos_index[[chr]], phecol] <- lod_list[[batch]]
-    }
-
-    result
+    matrix(nrow=sum(npos_by_chr), ncol=ncol(pheno), unlist(lod_list))
 }