-
Notifications
You must be signed in to change notification settings - Fork 0
/
Bedtools_Gene_Annotation.bash
167 lines (127 loc) · 8.8 KB
/
Bedtools_Gene_Annotation.bash
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
#!/bin/sh
#$ -cwd # Set the working directory for the job to the current directory
#$ -pe smp 1 # Request 1 core
#$ -l h_rt=120:0:0 # Request 24 hour runtime
#$ -l h_vmem=15G # Request 1GB RAM
module load bedtools
# ANNOTATION USING BED TOOLS ALL SNPS AND PROXIES #########################################################
# 1. First annotate the SNPs within the genes boundries (start and end of transciption)
# 2. Annotate remaining SNPs to within Feature
# 3. Annotate remaining SNPs to closest gene/feature
# 4. Merge lists
# 5. Annotate using ANNOVAR
#ALLGWAS_variants.txt columns:
#1. Chr
#2. Start
#3. End
#4. Ref
#5. Alt
#6. BETAsbp
#7. BETAdbp
#8. BETApp
#9. minP
#10. minTRAIT
#11. BETAmean
#12. CP
#ALLGWAS_variants.txt is just GWASresults750k_minP_meanBETA.csv with CP column created
head ALLGWAS_variants.txt
#Chr Start End Ref Alt BETAsbp BETAdbp BETApp minP minTRAIT BETAmean CP
#1 752566 752566 a g 0.0687 0.0605 0.0177 0.01714 DBP 0.0646 1:752566
#1 885689 885689 a g 0.235 0.1369 0.0807 0.0009269 DBP 0.18595 1:885689
#1 885699 885699 a g -0.2403 -0.1341 -0.0863 0.0009015 SBP -0.1872 1:885699
#1 886006 886006 t c -0.2408 -0.1346 -0.0871 0.0008683 SBP -0.1877 1:886006
#1 887801 887801 a g -0.2382 -0.1306 -0.0874 0.0009413 SBP -0.1844 1:887801
#1 888639 888639 t c -0.2472 -0.133 -0.0933 0.0006506 SBP -0.1901 1:888639
#1 888659 888659 t c -0.2323 -0.1277 -0.0845 0.001183 SBP -0.18 1:888659
#1 889238 889238 a g -0.2282 -0.1275 -0.0812 0.00167 SBP -0.17785 1:889238
#1 892745 892745 a g 0.2401 0.1317 0.0868 0.0008727 SBP 0.1859 1:892745
#1) Filter hg19Rel92_AllgeneTypes_EnsemblHavana_0kb.txt for antisense, processed_transcript, protein_coding, and pseudogenes
#For some reason filtering in R and writing a tab delim file later gives an error from bedtools that file is not in tab format - so not filtering before bedtools
#hg19Rel92_AllgeneTypes_0kb.txt and AllgeneTypes_filtered.txt are both listed as same file type in unix and running cat -t does not work either
#Ensure chromosomes are in 1-22 order
sort -n -k1,1 -k2,2 hg19Rel92_AllgeneTypes_0kb.txt > AllgeneTypes_sorted.txt
# 1) annotate within genes (bedtools v2.17.0)
sed 's/ /\t/g' ALLGWAS_variants.txt | tail -n +2 | intersectBed -a - -b AllgeneTypes_sorted.txt -wa -wb -loj > genes0kb.bed
# get those not in genes
awk 'BEGIN{OFS="\t"}{if($13 ==".")print $1,$2,$3,$4,$5,$6,$7,$8,$9,$10,$11,$12}' genes0kb.bed > toFeatures.avinput
wc -l genes0kb.bed #7083535
wc -l toFeatures.avinput #7083535
# 2) Annotate remaining SNPs to closest gene/feature (within 5KB)
sort -n -k1,1 -k2,2 toFeatures.avinput > toFeaturesSorted.avinput
closestBed -d -t all -a toFeaturesSorted.avinput -b AllgeneTypes_sorted.txt > toFeaturesSorted.bed
###########################################################################
head toFeaturesSorted.bed #-d
#1 918384 918384 t g 0.0533 0.0365 0.0155 0.05253 DBP 0.0449 1:918384 1 910579 917497 C1orf170 protein_coding ensembl_havana #887
#1 918573 918573 a g -0.0518 -0.0381 -0.0132 0.04347 DBP -0.04495 1:918573 1 910579 917497 C1orf170 protein_coding ensembl_havana 1076
#1 924898 924898 a c 0.3172 0.1081 0.2183 0.0003453 PP 0.21265 1:924898 1 931346 933431 RP11-54O7.17 lincRNA havana 6448
#1 926351 926351 t c 0.3092 0.1043 0.2149 0.0004249 PP 0.20675 1:926351 1 931346 933431 RP11-54O7.17 lincRNA havana 4995
#1 926431 926431 a t -0.3046 -0.1033 -0.2119 0.0005051 PP -0.20395 1:926431 1 931346 933431 RP11-54O7.17 lincRNA havana #4915
#1 926621 926621 a c -0.3099 -0.1046 -0.2153 0.0004156 PP -0.20725 1:926621 1 931346 933431 RP11-54O7.17 lincRNA havana 4725
#1 928578 928578 a g 0.3071 0.1027 0.2149 0.0004225 PP 0.2049 1:928578 1 931346 933431 RP11-54O7.17 lincRNA havana 2768
#1 928836 928836 t c 0.305 0.1049 0.2106 0.0005334 PP 0.20495 1:928836 1 931346 933431 RP11-54O7.17 lincRNA havana 2510
#1 929327 929327 a g -0.3015 -0.1 -0.2127 0.0004839 PP -0.20075 1:929327 1 931346 933431 RP11-54O7.17 lincRNA havana 2019
#1 940005 940005 a g 0.0344 0.0351 -0.0012 0.06487 DBP 0.03475 1:940005 1 943456 943609 RP11-54O7.10 pseudogene havana 3451
#-D *****ERROR: -D option must be followed with "ref", "a", or "b" - not sure how to apply yet
##use ALL DISTANCE ..FILTER FOR DISTANCE WHEN PERFORMING ANALYSIS
# LIST WITHIN 50KB
awk '{if($19 <= 5000) print $0}' toFeaturesSorted.bed > ClosestGeneFeature5kb.bed
#not actually used:
# # # # LIST TO ANNOTATE TO NAME as CHR-POS
# # # awk '{if($31 > 50000) print $0}' ClosestGeneFeature.bed |cut -f 1-26 - | uniq > toReNameLocusCHRPOS.txt
# # # wc -l ClosestGeneFeature50kb.bed
# # # 169791 ClosestGeneFeature50kb.bed
# # # wc -l toReNameLocusCHRPOS.txt
# # # 30393 toReNameLocusCHRPOS.txt
# 4) CONCATENATE LISTS
awk 'BEGIN{OFS="\t"}{if($13 != ".")print $0,0}' genes0kb.bed > T1
# ClosestGeneFeature.bed (as in its current format)
cat T1 ClosestGeneFeature5kb.bed | sort -k1,1 -k2,2 -n - > Total_Genes.bed
head Total_Genes.bed
#1 752566 752566 a g 0.0687 0.0605 0.0177 0.01714 DBP 0.0646 1:752566 1 745489 753092 RP11-206L10.10 processed_transcripthavana 0
#1 885689 885689 a g 0.235 0.1369 0.0807 0.0009269 DBP 0.18595 1:885689 1 879584 894689 NOC2L protein_coding ensembl_havana 0
#1 885699 885699 a g -0.2403 -0.1341 -0.0863 0.0009015 SBP -0.1872 1:885699 1 879584 894689 NOC2L protein_coding ensembl_havana 0
#1 886006 886006 t c -0.2408 -0.1346 -0.0871 0.0008683 SBP -0.1877 1:886006 1 879584 894689 NOC2L protein_coding ensembl_havana 0
#1 887801 887801 a g -0.2382 -0.1306 -0.0874 0.0009413 SBP -0.1844 1:887801 1 879584 894689 NOC2L protein_coding ensembl_havana 0
#1 888639 888639 t c -0.2472 -0.133 -0.0933 0.0006506 SBP -0.1901 1:888639 1 879584 894689 NOC2L protein_coding ensembl_havana 0
#1 888659 888659 t c -0.2323 -0.1277 -0.0845 0.001183 SBP -0.18 1:888659 1 879584 894689 NOC2L protein_coding ensembl_havana 0
#1 889238 889238 a g -0.2282 -0.1275 -0.0812 0.00167 SBP -0.17785 1:889238 1 879584 894689 NOC2L protein_coding ensembl_havana 0
#1 892745 892745 a g 0.2401 0.1317 0.0868 0.0008727 SBP 0.1859 1:892745 1 879584 894689 NOC2L protein_coding ensembl_havana 0
#1 893280 893280 a g 0.2378 0.1332 0.086 0.001053 SBP 0.1855 1:893280 1 879584 894689 NOC2L protein_coding ensembl_havana 0
module load R
R
library('data.table')
library('dplyr')
df <- fread('Total_Genes.bed')
colnames(df)[1] <-'Chr'
colnames(df)[2] <-'Start'
colnames(df)[3] <-'End'
colnames(df)[4] <-'Ref'
colnames(df)[5] <-'Alt'
colnames(df)[6] <-'BETAsbp'
colnames(df)[7] <-'BETAdbp'
colnames(df)[8] <-'BETApp'
colnames(df)[9] <-'minP'
colnames(df)[10] <-'minTRAIT'
colnames(df)[11] <-'BETAmean'
colnames(df)[12] <-'CP'
colnames(df)[13] <-'Feature.Chr'
colnames(df)[14] <-'Feature.Start'
colnames(df)[15] <-'Feature.End'
colnames(df)[16] <-'Gene'
colnames(df)[17] <-'Feature.Type'
colnames(df)[18] <-'Feature.Source'
colnames(df)[19] <-'Feature.Dist'
fwrite(df, 'Total_genes.txt', sep='\t', row.names=FALSE, col.names=TRUE)
head(df)
df1 <- filter(df, Feature.Type=='protein_coding')
df2 <- filter(df, Feature.Type=='processed_transcript')
df3 <- filter(df, Feature.Type=='pseudogene')
df4 <- filter(df, Feature.Type=='antisense')
final <- rbind(df1, df2, df3, df4)
nrow(final)
#[1] 3980785
tmp <- final[-which(duplicated(final$Gene)), ]
nrow(tmp)
#nrow: 33847
fwrite(final, 'Total_genes_final_selectTypes.txt', sep='\t', row.names=FALSE, col.names=TRUE)
fwrite(df, 'Total_genes_final_allTypes.txt', sep='\t', row.names=FALSE, col.names=TRUE)