-
Notifications
You must be signed in to change notification settings - Fork 0
/
geneseek2qtl2.R
157 lines (132 loc) · 5.57 KB
/
geneseek2qtl2.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
# convert GeneSeek FinalReport files to format for R/qtl2
#
# - creates one genotype CSV file for each chromosome
#
# - also creates 2 files containing the two channels of SNP intensities for markers on the X and Y chr
# (these are useful for verifying the sex of the mice)
# file containing allele codes for GigaMUGA data
# - from GM_processed_files.zip, https://doi.org/10.6084/m9.figshare.5404759
codefile <- "C:/Users/edmondsonef/Desktop/AT_HS/AT_HS.NPT_Snps/GM_processed_files/GM/GM_allelecodes.csv"
# input files with GigaMUGA genotypes
# - can be a single file or a vector of multiple files
# - if samples appear in multiple files, the genotypes in later files
# will be used in place of genotypes in earlier files
# - files can be gzipped (".gz" extension)
ifiles <- c("C:/Users/edmondsonef/Desktop/AT_HS/AT_HS.NPT_Snps/AT_SNPs/CSU_batch1/batch1_FinalReport/batch1_FinalReport_noheader.txt",
"C:/Users/edmondsonef/Desktop/AT_HS/AT_HS.NPT_Snps/AT_SNPs/CSU_batch2/batch2_FinalReport/batch2_FinalReport_noheader.txt",
"C:/Users/edmondsonef/Desktop/AT_HS/AT_HS.NPT_Snps/AT_SNPs/CSU_batch3/batch3_FinalReport/batch3_FinalReport_noheader.txt",
"C:/Users/edmondsonef/Desktop/AT_HS/AT_HS.NPT_Snps/AT_SNPs/CSU_batch4/batch4_FinalReport/batch4_FinalReport_noheader.txt",
"C:/Users/edmondsonef/Desktop/AT_HS/AT_HS.NPT_Snps/AT_SNPs/CSU_batch5/batch5_FinalReport/batch5_FinalReport_noheader.txt")
# file "stem" for output files
# output files will be like "gm4qtl2_geno19.csv"
ostem <- "C:/Users/edmondsonef/Desktop/AT_HS/Data/gm4qtl2"
##############################
# define a couple of functions
##############################
# version of data.table::fread() where data.table=FALSE is the default
myfread <- function(filename, data.table=FALSE, ...) data.table::fread(filename, data.table=data.table, ...)
# cbind, replacing matching columns with second set and adding unique ones
cbind_smother <-
function(mat1, mat2)
{
cn1 <- colnames(mat1)
cn2 <- colnames(mat2)
m <- (cn2 %in% cn1)
if(any(m)) {
mat1[,cn2[m]] <- mat2[,cn2[m],drop=FALSE]
if(any(!m)) {
mat1 <- cbind(mat1, mat2[,cn2[!m],drop=FALSE])
}
}
else {
mat1 <- cbind(mat1, mat2)
}
mat1
}
##############################
# read genotype codes
codes <- myfread(codefile)
full_geno <- NULL
cXint <- cYint <- NULL
for(ifile in ifiles) {
cat(" -File:", ifile, "\n")
rezip <- FALSE
if(!file.exists(ifile)) {
cat(" -Unzipping file\n")
system(paste("gunzip", ifile))
rezip <- TRUE
}
cat(" -Reading data\n")
g <- myfread(ifile)
# subset to the markers in the codes object
g <- g[g[,"SNP Name"] %in% codes[,"marker"],]
# NOTE: may need to revise the IDs in the 2nd column
samples <- unique(g[,"Sample ID"])
# matrix to contain the genotypes
geno <- matrix(nrow=nrow(codes), ncol=length(samples))
dimnames(geno) <- list(codes[,"marker"], samples)
# fill in matrix
cat(" -Reorganizing data\n")
for(i in seq(along=samples)) {
if(i==round(i,-1)) cat(" --Sample", i, "of", length(samples), "\n")
wh <- (g[,"Sample ID"]==samples[i])
geno[g[wh,"SNP Name"],i] <- paste0(g[wh,"Allele1 - Forward"], g[wh,"Allele2 - Forward"])
}
cat(" -Encode genotypes\n")
geno <- qtl2convert::encode_geno(geno, as.matrix(codes[,c("A","B")]))
if(is.null(full_geno)) {
full_geno <- geno
} else {
# if any columns in both, use those from second set
full_geno <- cbind_smother(full_geno, geno)
}
# grab X and Y intensities
cat(" -Grab X and Y intensities\n")
gX <- g[g[,"SNP Name"] %in% codes[codes$chr=="X","marker"],]
gY <- g[g[,"SNP Name"] %in% codes[codes$chr=="Y","marker"],]
cX <- matrix(nrow=sum(codes$chr=="X"),
ncol=length(samples))
dimnames(cX) <- list(codes[codes$chr=="X","marker"], samples)
cY <- matrix(nrow=sum(codes$chr=="Y"),
ncol=length(samples))
dimnames(cY) <- list(codes[codes$chr=="Y","marker"], samples)
for(i in seq(along=samples)) {
if(i==round(i,-1)) cat(" --Sample", i, "of", length(samples), "\n")
wh <- (gX[,"Sample ID"]==samples[i])
cX[gX[wh,"SNP Name"],i] <- (gX$X[wh] + gX$Y[wh])/2
wh <- (gY[,"Sample ID"]==samples[i])
cY[gY[wh,"SNP Name"],i] <- (gY$X[wh] + gY$Y[wh])/2
}
if(is.null(cXint)) {
cXint <- cX
cYint <- cY
} else {
# if any columns in both, use those from second set
cXint <- cbind_smother(cXint, cX)
cYint <- cbind_smother(cYint, cY)
}
if(rezip) {
cat(" -Rezipping file\n")
system(paste("gzip", ifile))
}
}
# write X and Y intensities
cat(" -Writing X and Y intensities\n")
qtl2convert::write2csv(cbind(marker=rownames(cXint), cXint),
paste0(ostem, "_chrXint.csv"),
paste(ostem, "X chr intensities"),
overwrite=TRUE)
qtl2convert::write2csv(cbind(marker=rownames(cYint), cYint),
paste0(ostem, "_chrYint.csv"),
paste(ostem, "Y chr intensities"),
overwrite=TRUE)
# write data to chromosome-specific files
cat(" -Writing genotypes\n")
for(chr in c(1:19,"X","Y","M")) {
mar <- codes[codes$chr==chr,"marker"]
g <- full_geno[mar,]
qtl2convert::write2csv(cbind(marker=rownames(g), g),
paste0(ostem, "_geno", chr, ".csv"),
paste0(ostem, " genotypes for chr ", chr),
overwrite=TRUE)
}