rqtl · kbroman · Nov 10, 2023 · Nov 10, 2023 · Nov 10, 2023 · Nov 10, 2023
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: qtl2
-Version: 0.33-3
-Date: 2023-11-04
+Version: 0.33-4
+Date: 2023-11-10
 Title: Quantitative Trait Locus Mapping in Experimental Crosses
 Description: Provides a set of tools to perform quantitative
     trait locus (QTL) analysis in experimental crosses. It is a

diff --git a/NAMESPACE b/NAMESPACE
@@ -80,6 +80,7 @@ export(drop_markers)
 export(drop_nullmarkers)
 export(est_herit)
 export(est_map)
+export(find_dup_markers)
 export(find_ibd_segments)
 export(find_index_snp)
 export(find_map_gaps)

diff --git a/NEWS.md b/NEWS.md
@@ -1,4 +1,4 @@
-## qtl2 0.33-3 (2023-11-04)
+## qtl2 0.33-4 (2023-11-10)
 
 ### Major changes
 
@@ -7,6 +7,10 @@
 
 - Similarly, changed `read_csv_numer()` to `fread_csv_numer()`.
 
+- Added function `fund_dup_markers()`, for identifying subsets of
+  markers with identical genotype data. This is a port of
+  `qtl::findDupMarkers()`.
+
 ### Minor changes
 
 - Have `calc_het()` stop with an error if the genotypes have labels

diff --git a/R/RcppExports.R b/R/RcppExports.R
@@ -149,6 +149,10 @@ invert_founder_index <- function(cross_info) {
     .Call(`_qtl2_is_phase_known`, crosstype)
 }
 
+.find_dup_markers_notexact <- function(Geno, order, markerloc, adjacent_only) {
+    .Call(`_qtl2_find_dup_markers_notexact`, Geno, order, markerloc, adjacent_only)
+}
+
 .find_ibd_segments <- function(g1, g2, p, error_prob) {
     .Call(`_qtl2_find_ibd_segments`, g1, g2, p, error_prob)
 }

diff --git a/R/drop_markers.R b/R/drop_markers.R
@@ -10,7 +10,7 @@
 #' @return The input `cross` with the specified markers removed.
 #'
 #' @export
-#' @seealso [pull_markers()], [drop_nullmarkers()]
+#' @seealso [pull_markers()], [drop_nullmarkers()], [reduce_markers()], [find_dup_markers()]
 #'
 #' @examples
 #' grav2 <- read_cross2(system.file("extdata", "grav2.zip", package="qtl2"))

diff --git a/R/find_dup_markers.R b/R/find_dup_markers.R
@@ -0,0 +1,138 @@
+# find_dup_markers
+#
+#' Find markers with identical genotype data
+#'
+#' Identify sets of markers with identical genotype data.
+#'
+#' @param cross Object of class `"cross2"`. For details, see the
+#' [R/qtl2 developer guide](https://kbroman.org/qtl2/assets/vignettes/developer_guide.html).
+#'
+#' @param chr Optional vector specifying which chromosomes to consider.
+#' This may be a logical, numeric, or character string vector.
+#'
+#' @param exact_only If TRUE, look only for markers that have matching
+#' genotypes and the same pattern of missing data; if FALSE, also look for
+#' cases where the observed genotypes at one marker match those at
+#' another, and where the first marker has missing genotype whenever the
+#' genotype for the second marker is missing.
+#'
+#' @param adjacent_only If TRUE, look only for sets of markers that are
+#' adjacent to each other.
+#'
+#' @return A list of marker names; each component is a set of markers whose
+#' genotypes match one other marker, and the name of the component is the
+#'  name of the marker that they match.
+#'
+#' @details
+#'  If `exact.only=TRUE`, we look only for groups of markers whose
+#'  pattern of missing data and observed genotypes match exactly.  One
+#'  marker (chosen at random) is selected as the name of the group (in the
+#'  output of the function).
+#'
+#'  If `exact.only=FALSE`, we look also for markers whose observed genotypes
+#'  are contained in the observed genotypes of another marker.  We use a
+#'  pair of nested loops, working from the markers with the most observed
+#'  genotypes to the markers with the fewest observed genotypes.
+#'
+#' @export
+#' @keywords utilities
+#'
+#' @seealso [drop_markers()], [drop_nullmarkers()], [reduce_markers()]
+#'
+#' @examples
+#' grav2 <- read_cross2(system.file("extdata", "grav2.zip", package="qtl2"))
+#' dup <- find_dup_markers(grav2)
+#' grav2_nodup <- drop_markers(grav2, unlist(dup))
+
+find_dup_markers <-
+    function(cross, chr, exact_only=TRUE, adjacent_only=FALSE)
+{
+    if(!missing(chr)) cross <- subset(cross, chr=chr)
+
+    if(!is.cross2(cross))
+        stop('Input cross should be a "cross2" object.')
+
+    g <- do.call("cbind", cross$geno)
+    markers <- colnames(g)
+    markerloc <- lapply(n_mar(cross), function(a) 1:a)
+    if(length(markerloc) > 1) {
+        for(j in 2:length(markerloc))
+            markerloc[[j]] <- markerloc[[j]] + max(markerloc[[j-1]]) + 10
+    }
+    markerloc <- unlist(markerloc)
+
+    if(exact_only) {
+        g[is.na(g)] <- 0
+
+        # genotype data patterns
+        pat <- apply(g, 2, paste, collapse="")
+
+        # table of unique values
+        tab <- table(pat)
+
+        # no duplicates; return
+        if(all(tab == 1)) return(NULL)
+
+        wh <- which(tab > 1)
+        theloc <- themar <- vector("list", length(wh))
+        for(i in seq(along=wh)) {
+            themar[[i]] <- names(pat)[pat==names(tab)[wh[i]]]
+            theloc[[i]] <- markerloc[pat==names(tab)[wh[i]]]
+        }
+
+        if(adjacent_only) {
+            extraloc <- list()
+            extramar <- list()
+            for(i in seq(along=theloc)) {
+                d <- diff(theloc[[i]]) # look for adjacency
+                if(any(d>1)) { # split into adjacent groups
+                    temp <- which(d>1)
+                    first <- c(1, temp+1)
+                    last <- c(temp, length(theloc[[i]]))
+                    for(j in 2:length(first)) {
+                        extraloc[[length(extraloc)+1]] <- theloc[[i]][first[j]:last[j]]
+                        extramar[[length(extramar)+1]] <- themar[[i]][first[j]:last[j]]
+                    }
+                    themar[[i]] <- themar[[i]][first[1]:last[1]]
+                    theloc[[i]] <- theloc[[i]][first[1]:last[1]]
+                }
+            }
+            themar <- c(themar, extramar)
+            theloc <- c(theloc, extraloc)
+
+            nm <- sapply(themar, length)
+            if(all(nm==1)) return(NULL) # nothing left
+            themar <- themar[nm>1]
+            theloc <- theloc[nm>1]
+        }
+
+        # order by first locus
+        o <- order(sapply(theloc, min))
+        themar <- themar[o]
+
+        randompics <- sapply(themar, function(a) sample(length(a), 1))
+        for(i in seq(along=themar)) {
+            names(themar)[i] <- themar[[i]][randompics[i]]
+            themar[[i]] <- themar[[i]][-randompics[i]]
+        }
+
+    }
+    else {
+        themar <- NULL
+
+        ntyp <- n_typed(cross, "marker")
+        o <- order(ntyp, decreasing=TRUE)
+
+        g[is.na(g)] <- 0
+        result <- .find_dup_markers_notexact(g, o, markerloc, adjacent_only)
+
+        if(all(result==0)) return(NULL)
+        u <- unique(result[result != 0])
+        themar <- vector("list", length(u))
+        names(themar) <- colnames(g)[u]
+        for(i in seq(along=themar))
+            themar[[i]] <- colnames(g)[result==u[i]]
+    }
+
+    themar
+}
diff --git a/R/reduce_markers.R b/R/reduce_markers.R
@@ -31,6 +31,8 @@
 #' the algorithm applied to each batch with min_distance smaller by a
 #' factor `min_distance_mult`, and then merged together for one last pass.
 #'
+#' @seealso [find_dup_markers()], [drop_markers()]
+#'
 #' @references Broman KW, Weber JL (1999) Method for constructing
 #' confidently ordered linkage maps. Genet Epidemiol 16:337--343
 #'

diff --git a/man/drop_markers.Rd b/man/drop_markers.Rd
diff --git a/man/find_dup_markers.Rd b/man/find_dup_markers.Rd
diff --git a/man/reduce_markers.Rd b/man/reduce_markers.Rd
diff --git a/src/RcppExports.cpp b/src/RcppExports.cpp
@@ -545,6 +545,20 @@ BEGIN_RCPP
     return rcpp_result_gen;
 END_RCPP
 }
+// find_dup_markers_notexact
+IntegerVector find_dup_markers_notexact(const IntegerMatrix& Geno, const IntegerVector& order, const IntegerVector& markerloc, const bool adjacent_only);
+RcppExport SEXP _qtl2_find_dup_markers_notexact(SEXP GenoSEXP, SEXP orderSEXP, SEXP markerlocSEXP, SEXP adjacent_onlySEXP) {
+BEGIN_RCPP
+    Rcpp::RObject rcpp_result_gen;
+    Rcpp::RNGScope rcpp_rngScope_gen;
+    Rcpp::traits::input_parameter< const IntegerMatrix& >::type Geno(GenoSEXP);
+    Rcpp::traits::input_parameter< const IntegerVector& >::type order(orderSEXP);
+    Rcpp::traits::input_parameter< const IntegerVector& >::type markerloc(markerlocSEXP);
+    Rcpp::traits::input_parameter< const bool >::type adjacent_only(adjacent_onlySEXP);
+    rcpp_result_gen = Rcpp::wrap(find_dup_markers_notexact(Geno, order, markerloc, adjacent_only));
+    return rcpp_result_gen;
+END_RCPP
+}
 // find_ibd_segments
 NumericMatrix find_ibd_segments(const IntegerVector& g1, const IntegerVector& g2, const NumericVector& p, const double error_prob);
 RcppExport SEXP _qtl2_find_ibd_segments(SEXP g1SEXP, SEXP g2SEXP, SEXP pSEXP, SEXP error_probSEXP) {
@@ -2455,6 +2469,7 @@ static const R_CallMethodDef CallEntries[] = {
     {"_qtl2_mpp_geno_names", (DL_FUNC) &_qtl2_mpp_geno_names, 2},
     {"_qtl2_invert_founder_index", (DL_FUNC) &_qtl2_invert_founder_index, 1},
     {"_qtl2_is_phase_known", (DL_FUNC) &_qtl2_is_phase_known, 1},
+    {"_qtl2_find_dup_markers_notexact", (DL_FUNC) &_qtl2_find_dup_markers_notexact, 4},
     {"_qtl2_find_ibd_segments", (DL_FUNC) &_qtl2_find_ibd_segments, 4},
     {"_qtl2_R_find_peaks", (DL_FUNC) &_qtl2_R_find_peaks, 3},
     {"_qtl2_R_find_peaks_and_lodint", (DL_FUNC) &_qtl2_R_find_peaks_and_lodint, 4},

diff --git a/src/find_dup_markers.cpp b/src/find_dup_markers.cpp
@@ -0,0 +1,51 @@
+// find subsets of markers with identical genotypes
+
+#include "find_dup_markers.h"
+#include <Rcpp.h>
+
+using namespace Rcpp;
+
+// [[Rcpp::export(".find_dup_markers_notexact")]]
+IntegerVector find_dup_markers_notexact(const IntegerMatrix& Geno, // matrix of genotypes, individuals x markers
+                                        const IntegerVector& order, // vector indicating order to be considered, most data to least
+                                        const IntegerVector& markerloc, //
+                                        const bool adjacent_only)   // if true, consider only adjacent markers
+{
+    const int n_ind = Geno.rows();
+    const int n_mar = Geno.cols();
+    if(order.size() != n_mar)
+        throw std::invalid_argument("length(order) != ncol(Geno)");
+    if(markerloc.size() != n_mar)
+        throw std::invalid_argument("length(markerloc) != ncol(Geno)");
+
+    IntegerVector result(n_mar);
+    for(int i=0; i<n_mar; i++) result[i] = 0;
+
+    for(int i=0; i<n_mar-1; i++) {
+        int oi = order[i]-1;
+        for(int j=(i+1); j<n_mar; j++) {
+            int oj = order[j]-1;
+
+            if(result[oj] != 0 ||
+               (adjacent_only && abs(markerloc[oi] - markerloc[oj]) > 1)) {
+                /* skip */
+            }
+            else {
+                int flag = 0;
+                for(int k=0; k<n_ind; k++) {
+                    if((Geno(k,oi)==0 && Geno(k,oj)!=0) ||
+                       (Geno(k,oi)!=0 && Geno(k,oj)!=0 && Geno(k,oi) != Geno(k,oj))) {
+                        flag = 1;
+                        break;
+                    }
+                }
+                if(!flag) { /* it worked */
+                    if(result[oi] != 0) result[oj] = result[oi];
+                    else result[oj] = oi+1;
+                }
+            }
+        }
+    }
+
+    return(result);
+}
diff --git a/src/find_dup_markers.h b/src/find_dup_markers.h
@@ -0,0 +1,12 @@
+// find subsets of markers with identical genotypes
+#ifndef FIND_DUP_MARKERS_H
+#define FIND_DUP_MARKERS_H
+
+#include <Rcpp.h>
+
+Rcpp::IntegerVector find_dup_markers_notexact(const Rcpp::IntegerMatrix& Geno, // matrix of genotypes, individuals x markers
+                                              const Rcpp::IntegerVector& order, // vector indicating order to be considered, most data to least
+                                              const Rcpp::IntegerVector markerloc, // integer vector indicating "position"
+                                              const bool adjacent_only);   // if true, consider only adjacent markers
+
+#endif // FIND_DUP_MARKERS_H