-
Notifications
You must be signed in to change notification settings - Fork 3
/
similarity-metrics.R
177 lines (146 loc) · 6.9 KB
/
similarity-metrics.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
#' @include search-degrees.R map-neighbors.R
NULL
#' Compute PathSim similarity.
#'
#' Use the PathSim metric to compute a meta-path based similarity score. Please note that PathSim can ONLY
#' be used for symmetric meta-paths.
#'
#' @template similarity-metrics
#' @return A list with two elements:
#' \describe{
#' \item{Metric}{The name of the similarity metric (i.e., \code{"PathSim"}).}
#' \item{Similarity}{The PathSim similarity score.}}
#' @references \cite{Sun, Y., Han, J., Yan, X., Yu, P. S. & Wu, T. PathSim: meta path-based top-K similarity search in heterogeneous information networks. Proc. VLDB Endow. 4, 992–1003 (2011).}
#' @export
get_pathsim = function(x, y, paths_x, paths_y, reference_list = NULL,
                       list_type = NULL, verbose = TRUE) {
  # The last column of each path table holds the node on which a path ends.
  # `reference_list` and `list_type` are accepted for a uniform metric
  # signature but are not used by PathSim.
  ends_x = paths_x[[ncol(paths_x)]]
  ends_y = paths_y[[ncol(paths_y)]]

  # Count paths by end node. Rows with a missing end node match nothing and
  # are skipped (na.rm), so a single NA cannot turn the counts -- and the
  # scalar condition below -- into NA.
  x_y = sum(ends_x == y, na.rm = TRUE)
  x_x = sum(ends_x == x, na.rm = TRUE)
  y_y = sum(ends_y == y, na.rm = TRUE)

  # PathSim = 2 * |x -> y| / (|x -> x| + |y -> y|).
  # The score is 0 when no x -> y path exists or the denominator is zero
  # (which also avoids 0/0 = NaN). `||` is the scalar, short-circuit operator
  # intended for `if` conditions.
  if (x_y == 0 || (x_x + y_y) == 0) {
    pathsim = 0
  } else {
    pathsim = (2 * x_y) / (x_x + y_y)
  }

  if (verbose) {
    message("Similarity Metric: PathSim")
    message("X -> Y Paths: ", x_y)
    message("X -> X Paths: ", x_x)
    message("Y -> Y Paths: ", y_y)
    message("Similarity: ", pathsim)
  }
  return(list(Metric = "PathSim", Similarity = pathsim))
}
#' Compute path count similarity.
#'
#' Use the path count metric to compute a meta-path based similarity score.
#'
#' @template similarity-metrics
#' @return A list with two elements:
#' \describe{
#' \item{Metric}{The name of the similarity metric (i.e., \code{"Path Count"}).}
#' \item{Similarity}{The path count similarity score.}}
#' @references \cite{Himmelstein, D. S. & Baranzini, S. E. Heterogeneous Network Edge Prediction: A Data Integration Approach to Prioritize Disease-Associated Genes. PLOS Computational Biology 11, e1004259 (2015).}
#' @export
get_pc = function(x, y, paths_x, paths_y = NULL, reference_list = NULL,
                  list_type = NULL, verbose = TRUE) {
  # Path count = number of rows of `paths_x` whose last column equals `y`.
  # `x`, `paths_y`, `reference_list` and `list_type` are accepted for a
  # uniform metric signature but are not used here.
  # Rows with a missing end node match nothing and are skipped (na.rm), so
  # one NA does not turn the whole count into NA.
  x_y = sum(paths_x[[ncol(paths_x)]] == y, na.rm = TRUE)
  pc = x_y

  if (verbose) {
    message("Similarity Metric: Path Count")
    message("X -> Y Paths: ", x_y)
    message("Similarity: ", pc)
  }
  return(list(Metric = "Path Count", Similarity = pc))
}
#' Compute normalized path count similarity.
#'
#' Use the normalized path count metric to compute a meta-path based similarity score.
#'
#' @template similarity-metrics
#' @return A list with two elements:
#' \describe{
#' \item{Metric}{The name of the similarity metric (i.e., \code{"Normalized Path Count"}).}
#' \item{Similarity}{The normalized path count similarity score.}}
#' @references \cite{Himmelstein, D. S. & Baranzini, S. E. Heterogeneous Network Edge Prediction: A Data Integration Approach to Prioritize Disease-Associated Genes. PLOS Computational Biology 11, e1004259 (2015).}
#' @export
get_npc = function(x, y, paths_x, paths_y, reference_list = NULL,
                   list_type = NULL, verbose = TRUE) {
  # Numerator: rows of `paths_x` whose last column equals `y`. Rows with a
  # missing end node match nothing and are skipped (na.rm), so a single NA
  # cannot turn the count -- and the scalar condition below -- into NA.
  # `x`, `reference_list` and `list_type` are not used by this metric.
  x_y = sum(paths_x[[ncol(paths_x)]] == y, na.rm = TRUE)

  # Denominator terms: total number of rows in each path table.
  x_typey = nrow(paths_x)
  y_typex = nrow(paths_y)

  # The score is 0 when no x -> y path exists or both tables are empty (which
  # also avoids 0/0 = NaN). `||` is the scalar, short-circuit operator
  # intended for `if` conditions.
  if (x_y == 0 || (x_typey + y_typex) == 0) {
    npc = 0
  } else {
    npc = x_y / (x_typey + y_typex)
  }

  if (verbose) {
    message("Similarity Metric: Normalized Path Count")
    message("X -> Y Paths: ", x_y)
    message("X -> Type Y Paths: ", x_typey)
    message("Y -> Type X Paths: ", y_typex)
    message("Similarity: ", npc)
  }
  return(list(Metric = "Normalized Path Count", Similarity = npc))
}
#' Compute degree-weighted path count similarity.
#'
#' Use the degree-weighted path count metric to compute a meta-path based similarity score. Note that, in this implementation,
#' type-specific degrees are used (except for the last step of the meta-path).
#'
#' @template similarity-metrics
#' @return A list with two elements:
#' \describe{
#' \item{Metric}{The name of the similarity metric (i.e., \code{"Degree-Weighted Path Count"}).}
#' \item{Similarity}{The degree-weighted path count similarity score.}}
#' @references \cite{Himmelstein, D. S. & Baranzini, S. E. Heterogeneous Network Edge Prediction: A Data Integration Approach to Prioritize Disease-Associated Genes. PLOS Computational Biology 11, e1004259 (2015).}
#' @export
# Degree-weighted path count (DWPC): for every row of `paths_x` whose last
# column equals `y`, look up a degree for each node in the row, raise the
# degrees to the power -w, multiply them into a path degree product (PDP),
# and sum the PDPs over all such rows.
#
# `x` and `paths_y` are not referenced in the body. `reference_list` and
# `list_type` are forwarded unchanged to search_degrees(); note that the
# default `list_type` is the two-element vector c("edge", "neighbor") and is
# not reduced to a single value here -- presumably search_degrees() resolves
# it (TODO confirm in search-degrees.R).
get_dwpc = function(x, y, paths_x, paths_y = NULL, reference_list,
list_type = c("edge", "neighbor"), verbose = TRUE, w = 0.4) {
# keep only the paths (rows) whose last column equals the destination y
x_y_paths = paths_x[paths_x[[ncol(paths_x)]] == y, ]
x_y = nrow(x_y_paths)
# build the lookup of "next" node types: clean_mp() of the column names with
# NA appended, so that column i reads next_type[i + 1] and the last column
# reads NA. This assumes clean_mp() returns one entry per column (TODO
# confirm). Per the original note, NA means the total degree is computed for
# the last node -- that behavior lives in search_degrees(), not here.
next_type = c(clean_mp(colnames(x_y_paths)), NA)
# compute type-specific degrees
# map_search(): one search_degrees() call per node in a column, returning a
# numeric vector of degrees for that column
map_search = function(node_list, type_next) { map_dbl(node_list, ~search_degrees(.x, type_next, reference_list, list_type)) }
# iterate over the columns of x_y_paths together with their index (.y) and
# bind the per-column degree vectors into one table (converted via setDT())
x_y_degrees = map2_dfr(x_y_paths, seq_along(x_y_paths), ~map_search(.x, next_type[.y + 1])) %>% setDT()
# for each path, compute path degree product (PDP)
# i.e. row-wise: prod(degree ^ -w) over all degrees in the row
pdp = pmap_dbl(x_y_degrees, ~prod(c(...)^-w))
# no matching path -> score 0; otherwise the DWPC is the sum of all PDPs
if(x_y == 0) dwpc = 0 else dwpc = sum(pdp)
if(verbose) {
# the SD is only reported when there is more than one PDP value
if(length(pdp) > 1) sd_msg = paste0(" [", sd(pdp), "]") else sd_msg = ""
message("Similarity Metric: Degree-Weighted Path Count")
message("X -> Y Paths: ", x_y)
message("Damping Exponent: ", w)
# NOTE(review): if pdp is empty (no matching paths), mean(pdp) prints NaN
message("PDP (Mean/SD): ", mean(pdp), sd_msg)
message("Similarity: ", dwpc)
}
return(list(Metric = "Degree-Weighted Path Count", Similarity = dwpc))
}
#' Retrieve similarity score metric.
#'
#' For a desired meta-path based similarity metric, retrieve the function which computes the appropriate similarity score, then compute similarity.
#'
#' @param metric_name Short name of the desired similarity metric:
#' \describe{
#' \item{Path Count}{Use \code{"pc"}.}
#' \item{Normalized Path Count}{Use \code{"npc"}.}
#' \item{Degree-Weighted Path Count}{Use \code{"dwpc"}.}
#' \item{PathSim}{Use \code{"pathsim"}.}}
#' @param get_verbose Should output be printed to console?
#' @param ... Other arguments to pass to appropriate meta-path similarity function.
#' @return A list with two elements:
#' \describe{
#' \item{Metric}{The human readable name of the similarity metric.}
#' \item{Similarity}{The appropriate similarity score.}}
#' @seealso See the following functions to compute various similarity scores:
#' \describe{
#' \item{Path Count}{\code{get_pc()}}
#' \item{Normalized Path Count}{\code{get_npc()}}
#' \item{Degree-Weighted Path Count}{\code{get_dwpc()}}
#' \item{PathSim}{\code{get_pathsim()}}}
#' @export
get_similarity_function = function(metric_name, get_verbose, ...) {
  # metrics for which a matching `get_<metric_name>()` function is defined
  supported = c("pc", "npc", "dwpc", "pathsim")

  # guard clause: unknown metric -> placeholder result, nothing is computed
  if (!(metric_name %in% supported)) {
    if (get_verbose) message("\nSimilarity Metric: N/A")
    return(list(Metric = "Not Available", Similarity = NA))
  }

  # emit an empty message line before the metric's own verbose output
  if (get_verbose) message()

  # look the metric function up by name and forward all remaining arguments
  metric_fn = get(paste0("get_", metric_name))
  return(metric_fn(...))
}