-
Notifications
You must be signed in to change notification settings - Fork 0
/
STRINGdb PPI.R
141 lines (114 loc) · 6.5 KB
/
STRINGdb PPI.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
####Protein-Protein Interactions for GWAS associated BP genes####
#1. Match genes to their STRINGdb identifiers
#2. Match gene and string identifiers to interactors per gene
#3. Collapsing interactors to be listed per gene in each row
#setwd("~/PhD Year 2/")
library(data.table)
library(STRINGdb)
library(dplyr)
library(plyr)
library(tidyverse)
library(sqldf)
#############################################
#1. Match genes to their STRINGdb identifiers
Genelist <- fread("Genelist_forPPI.txt")
#Using STRINGdb's R package set to newest string version (11) and human species by NCBI taxonomy number (9606)
#Set score_threshold to 0 to view all possible interactions (score is string's combined score from all their sources for PPIs)
string_db <- STRINGdb$new(version="11", species=9606,
score_threshold=0, input_directory="")
#Identify gene's STRINGdb identifiers:
#string_db$map function adds an additional column with STRING identifiers to the dataframe that is passed as first parameter (gene list)
genes_mapped <- string_db$map(Genelist, "Gene", removeUnmappedRows = TRUE)
#genes_mapped
#Gene STRING_id
#A1CF 9606.ENSP00000378868
#A4GALT 9606.ENSP00000384794
#AACS 9606.ENSP00000324842
#############################################################
#2. Match gene and string identifiers to interactors per gene
#Total data downloaded from STRING including all score types per interaction loaded
#Data in proteindf and was downloaded from: https://string-db.org/cgi/download?sessionId=bLbfFhAKfphK
proteindf <- fread("9606.protein.links.full.v11.0.txt")
dim(proteindf)
#[1] 11759454 16
names(proteindf)
#[1] "protein1" "protein2" "neighborhood"
#[4] "neighborhood_transferred" "fusion" "cooccurence"
#[7] "homology" "coexpression" "coexpression_transferred"
#[10] "experiments" "experiments_transferred" "database"
#[13] "database_transferred" "textmining" "textmining_transferred"
#[16] "combined_score"
colnames(proteindf)[1] <- "STRING_id1"
colnames(proteindf)[2] <- "STRING_id2"
#Interaction scores only from coexpression, experiments and database columns used:
protdf <- select(proteindf, STRING_id1, STRING_id2, coexpression, experiments, database)
#Duplicating STRING_id in genes_mapped object to have renamed matching column STRING_id1
#- Aim is to match with named STRING_id1 interactors in STRINGdb dataset, finding the gene names for those interactors
genes_mapped$STRING_id1 <- genes_mapped$STRING_id
genes_mapped$STRING_id2 <- genes_mapped$STRING_id
#Identifying gene names for first STRING protein ID column:
mapped1 <- select(genes_mapped, STRING_id1, Gene)
colnames(mapped1)[2]<- "Gene1"
string1df <- merge(protdf, mapped1, by='STRING_id1', all.y=TRUE)
colnames(string1df)[6]<- "Gene1"
#head(string1df)
#STRING_id1 STRING_id2 coexpression experiments database Gene1
#9606.ENSP00000002596 9606.ENSP00000306425 0 0 0 HS3ST1
#9606.ENSP00000002596 9606.ENSP00000261523 0 0 0 HS3ST1
#9606.ENSP00000002596 9606.ENSP00000347088 0 0 0 HS3ST1
#9606.ENSP00000002596 9606.ENSP00000389176 0 0 0 HS3ST1
#9606.ENSP00000002596 9606.ENSP00000216124 0 0 0 HS3ST1
#9606.ENSP00000002596 9606.ENSP00000199447 0 0 0 HS3ST1
#Identifying gene names for second STRING protein ID column:
mapped2 <- select(genes_mapped, STRING_id2, Gene)
string2df <- merge(protdf, mapped2, by='STRING_id2', all.y=TRUE)
colnames(string2df)[6]<- "Gene2"
mergedIDs <- inner_join(string1df, string2df)
mergedIDs <- select(mergedIDs, STRING_id1, Gene1, STRING_id2, Gene2, coexpression, experiments, database)
#mergedIDs has matched gene symbols for string IDs in both interactor columns of the orignal protein data downloaded from string:
#dim(mergedIDs)
#2331566 7
#head(mergedIDs)
#STRING_id1 Gene1 STRING_id2 Gene2 coexpression experiments database
#9606.ENSP00000002596 HS3ST1 9606.ENSP00000306425 CHCHD7 0 0 0
#9606.ENSP00000002596 HS3ST1 9606.ENSP00000261523 RORA 0 0 0
#9606.ENSP00000002596 HS3ST1 9606.ENSP00000347088 LARGE 0 0 0
#9606.ENSP00000002596 HS3ST1 9606.ENSP00000389176 C1GALT1 0 0 0
#9606.ENSP00000002596 HS3ST1 9606.ENSP00000199447 NME8 0 0 0
#9606.ENSP00000002596 HS3ST1 9606.ENSP00000264893 SEPT11 0 0 0
#Filtering interactions by STRING's lowest confidence threshold and 3 measurements:
df2filt <- filter(mergedIDs , experiments!= 0 | coexpression!=0 | database!=0 )
df2filt$Max <- apply(df2filt[,5:7], 1, FUN=max)
d2filt <- filter(df2filt, Max >= 150)
############################################################
#3. Collapsing interactors to be listed per gene in each row
# Duplicated genes in above output examples needs to be condensed to 1 row with all counterpart interactors listed for it
#Compress gene's interactors for first Gene1 column:
genes1 <- df2filt[, c(2,4)]
genes1<-genes1[!(genes1$Gene1=="NA"),]
genes1<- data.table(genes1)
genes1 <- genes1[,lapply(.SD, function(col) paste(col, collapse=", ")),
by=.(Gene1)]
genes1$Gene2 <- gsub("NA,", "", genes1$Gene2)
#First row of genes1:
#Gene1 #Gene2
#HS3ST1 CACNG2, EHF, GPC6, SDC2, ...
#Compress gene's interactors for second Gene2 column:
genes2 <- df2filt[, c(2,4)]
genes2<-genes2[!(genes2$Gene2=="NA"),]
genes2<- data.table(genes2)
genes2 <- genes2[,lapply(.SD, function(col) paste(col, collapse=", ")),
by=.(Gene2)]
genes2$Gene1 <- gsub("NA,", "", genes2$Gene1)
#First row of genes2:
#Gene2 Gene1
#CACNG2 HS3ST1, CACNG3, LRRC7, ...
#Combine into one dataset:
colnames(genes1)[1] <- "Gene"
colnames(genes2)[1] <- "Gene"
interactdf <- merge(genes1, genes2, by='Gene', all=T)
#Combine interactors from both gene1 and gene2 and remove duplicates:
interactdf <- transform(interactdf, interactors = paste(Gene1, Gene2, sep = ","))
interactdf$interactors <- sapply(interactdf$interactors , function(x) paste(unique(unlist(str_split(x,", "))), collapse = ", "))
interactors <- select(interactdf, Gene, interactors)
write.table(interactors,"Total_genes_PPI_STRINGdb.txt",sep="\t",row.names=F, quote=F)