-
Notifications
You must be signed in to change notification settings - Fork 0
/
pathworks.h
356 lines (310 loc) · 12.9 KB
/
pathworks.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
// Single-inclusion protection.
// The #ifndef/#define/#endif trio below closes immediately, so on its own it
// guards nothing: including this header twice would redefine every struct in
// it. #pragma once (supported by GCC, Clang and MSVC) makes the whole file
// single-inclusion without having to move the #endif to the end of the file.
// PATHWORKS_H is still defined for any code that tests for it.
#pragma once
#ifndef PATHWORKS_H
#define PATHWORKS_H 1
#endif
#include "stddef.h"
// Build-time switch: use radix sort instead of qsort because it is faster.
#define RADIX_SORT 1
// CALC_OPTION_*: selectors for the calculation to perform.
// NOTE(review): presumably the values passed as the calc_option argument of
// l2pfunc() -- confirm against the implementation.
#define CALC_OPTION_FE 0
#define CALC_OPTION_GPCC 1
#define CALC_OPTION_PERMUTE 2
#define CALC_OPTION_PERMUTE3 3
#define CALC_OPTION_GPCC2 4
#define CALC_OPTION_GPCC3 5
#define CALC_OPTION_GPCC4 6
// NOTE(review): looks like a default for the number of permutations -- confirm.
#define PERMUTE_DEFAULT_LOW 5000
#define MAXBSIDPOSSIBLE 851568
// MAXBSIDPOSSIBLE (above) is for pwharvest.
#define MAXGENE 70000
// MAXGENE (above): last count was 61141 (pathworksgenes.txt).
#define MAX_INGENES 40000
// Pathway "type" codes.
#define type_functional_set 1
#define type_pathway 2
#define type_structural_complex 3
#define type_custom 4
#define type_unknown 9
// Pathway "scope" codes.
#define conserved_biosystem_scope 10
#define organism_specific_biosystem_scope 11
// Category bits: one bit per pathway source, so several categories can be
// OR-ed together into a single bit pattern.
// The "count" comment after a define is the count recorded for that category
// (presumably the number of pathways -- confirm).
#define CAT_NCBI_BIOCYC (1<<0)
// count = 294
#define CAT_NCBI_GO (1<<1)
// count = 13515
#define CAT_NCBI_KEGG (1<<2)
// count = 485
#define CAT_NCBI_PANTH (1<<3)
// count = 129
#define CAT_NCBI_PID (1<<4)
// count = 183
#define CAT_NCBI_REACTOME (1<<5)
// count = 1548
#define CAT_NCBI_WikiPathways (1<<6)
// count = 345
#define CAT_MSIG_C1 (1<<7)
// count = 325
#define CAT_MSIG_C2 (1<<8)
// count = 3777
#define CAT_MSIG_C3 (1<<9)
// count = 836
#define CAT_MSIG_C4 (1<<10)
// count = 858
#define CAT_MSIG_C5 (1<<11)
// count = 5871
#define CAT_MSIG_C6 (1<<12)
// count = 144
#define CAT_MSIG_C7 (1<<13)
// count = 1888
#define CAT_MSIG_C8 (1<<14)
// (no count recorded for C8)
#define CAT_MSIG_H (1<<15)
// count = 50
#define CAT_CUSTOM (1<<16)
#if 0
// Disabled. NOTE: bit 15 is now taken by CAT_MSIG_H, so this needs a new bit
// if it is ever re-enabled.
#define CAT_MSIG_ARCHIVED (1<<15)
// count = 858
#endif
// Union of all NCBI category bits / all MSIG category bits.
// These were previously written as "#define NAME = (...);", which expands to
// "= (...);" and cannot be used inside an expression; they are now plain
// parenthesized constant expressions.
#define NCBI_PAT (CAT_NCBI_BIOCYC|CAT_NCBI_GO|CAT_NCBI_KEGG|CAT_NCBI_PANTH|CAT_NCBI_PID|CAT_NCBI_REACTOME|CAT_NCBI_WikiPathways)
#define MSIG_PAT (CAT_MSIG_C1|CAT_MSIG_C2|CAT_MSIG_C3|CAT_MSIG_C4|CAT_MSIG_C5|CAT_MSIG_C6|CAT_MSIG_C7|CAT_MSIG_C8|CAT_MSIG_H)
// Record layout of the "binary" pathway information file.
//
// NOTE: different C compilers produce different sized records for this
// structure, so the output file only contains the fields worth saving: the
// ten 32-bit fields below, i.e. 10*4 = 40 bytes per output record.
//
// String-valued attributes are not stored inline; the int holds an offset
// from the start of the "spill space" where the text lives.
struct binpathouttype {
    int bsid;               // 32-bit pathway id ("bs" originally stood for "biosystems")
    int category;           // bit pattern of CAT_* categories, e.g. CAT_NCBI_GO
    int accession;          // spill-space offset of the accession string
    int name;               // spill-space offset of the name string
    int type;               // converted from a string (original note: "1 byte")
    int scope;              // 32-bit int converted from a string
    int taxid;              // 32-bit taxonomy id
    int desc;               // spill-space offset of the description string
    unsigned int numgenes;  // number of gene ids belonging to this pathway
    int offset2geneids;     // "pointer" (offset) to the numgenes gene ids
#if 0
    // Disabled "hits" field. It was not set at record creation (it was set to
    // null); later, when running l2p, it was filled in while processing the
    // binpath[] data that had been read in, to track the genes that hit this
    // pathway.
    // hits points to an array whose first element [0] is the number of hits;
    // elements [1..n] are pointers to gene records (bingenetype? right?).
    void *hits;             // used in l2p for user "hits" to this pathway
#endif
};
// Size of the fixed gene-name buffers. The longest name seen,
// ARHGAP27P1-BPTFP1-KPNA2P3, is 25 characters.
#define MAXGENENAME 26

// Fixed-size gene record (names are stored inline, not as pointers).
struct bingenetype {
    int geneid;                 // entrez gene id
    char hugo[MAXGENENAME];     // hugo gene name
    char ensembl[MAXGENENAME];  // ensembl identifier
    int pathcount;              // number of paths; their ids are in the int array located at "pathplace"
    int pathplace;              // index to path (to a struct binpathouttype record)
    int categories;             // bit patterns
};
// One gene-name update entry: a new name paired with the old one, plus flags.
// This is the element type returned by updategenesR().
// NOTE(review): the exact meaning of the three int flags is not visible in
// this header -- confirm against the implementation.
struct updated_genes_type {
    char *newname;
    char *oldname;
    int change_flag;
    int status;
    int is_legit_name;
};
// Linked-list node holding one gene id; used by the harvest programs.
struct genelisttype {
    int geneid;
    struct genelisttype *n;  // link to another node of the same list
};
// Linked-list node holding one raw gene name; used by the harvest programs.
struct raw_genelisttype {
    char *raw;                   // raw gene name
    struct raw_genelisttype *n;  // link to another node of the same list
};
// Biosystems id and info. Records are read into an array of these; used by
// the harvest programs.
struct bstype {
    int bsid;
    int category;       // uses the CAT_* bit pattern defines
    char *accession;
    char *name;
    char *type;
    char *scope;
    int taxid;
    char *desc;
    int redundant;      // flag: is this pathway duplicated by another pathway?

    // The remaining fields get their values from another file.
    int numgenes;                                   // number of genes in the list(s) below
    struct genelisttype *geneslinkedlist;           // linked list of FINAL genes
    struct raw_genelisttype *raw_genes_linkedlist;  // linked list of raw genes
};
// Associates a hugo gene name with its gene record.
struct hugo_type {
    char *hugo;
    struct bingenetype *generec_ptr;
    int status;  // general-purpose; the initial use was "universe" masking
};
// Gene record as obtained from ncbi.
// This is (maybe) only used in pwharvest; l2p uses bingenetype instead.
struct genetype {
    int geneid;
    char *hugo;
    char *ensembl;
    int categories;
};
// Hit bookkeeping: two counters and a pointer to an array of unsigned indexes.
// NOTE(review): presumably hitcnt is the number of entries in use and maxhits
// the capacity of hitsindexes -- confirm against the code that fills it.
struct hit_type {
    unsigned int hitcnt;
    unsigned int maxhits;
    unsigned int *hitsindexes;
};
// Pathway commons relation bits: one bit per relation, bits 0..15.
// NOTE(review): presumably stored in pctype.interaction_type, which is an
// unsigned short -- confirm that the target always has at least 16 bits.
// NOTE(review): these are lowercase object-like macros with very common names
// (e.g. transport, reference, multiple, other). Any identifier spelled the
// same way in code that includes this header will be macro-replaced.
#define chemical_affects (1<<0)
#define in_complex_with (1<<1)
#define catalysis_precedes (1<<2)
#define controls_expression_of (1<<3)
#define controls_state_change_of (1<<4)
#define controls_production_of (1<<5)
#define consumption_controlled_by (1<<6)
#define controls_phosphorylation_of (1<<7)
#define used_to_produce (1<<8)
#define transport (1<<9)
#define reacts_with (1<<10)
#define interacts_with (1<<11)
#define reference (1<<12)
#define multiple (1<<13)
#define other (1<<14)
#define ABdirection (1<<15)
#define MAXPC 2000000
// MAXPC (above): the latest count was 1915769 (PathwayCommons12.All.hgnc.txt).
// Pathway commons record: one interaction between interactor A and interactor B.
struct pctype {
    int ID_Interactor_A;
    int ID_Interactor_B;
    char *hugo1;
    char *hugo2;
    unsigned short int interaction_type;  // NOTE(review): presumably the pathway commons relation bits -- confirm
    int is_dupe;
};
#define MAXBIOGRID 303568
// Bits for the "interaction_type" field (presumably biogridtype.interaction_type -- confirm).
// Each trailing comment gives: count, then the psi-mi name.
// NOTE(review): association and colocalization are lowercase object-like
// macros; identifiers spelled the same way in including code get replaced.
#define association 1 // 8931 psi-mi:"MI:0914(association)"
#define colocalization 2 // 44101 psi-mi:"MI:0403(colocalization)"
#define synthetic_genetic_interaction_defined_by_inequality 4 // 50045 psi-mi:"MI:0794(synthetic genetic interaction defined by inequality)"
#define suppressive_genetic_interaction_defined_by_inequality 8 // 197811 psi-mi:"MI:0796(suppressive genetic interaction defined by inequality)"
#define direct_interaction 16 // 206875 psi-mi:"MI:0407(direct interaction)"
#define physical_association 32 // 329721 psi-mi:"MI:0915(physical association)"
#define additive_genetic_interaction_defined_by_inequality 64 // 535593 psi-mi:"MI:0799(additive genetic interaction defined by inequality)"
// One biogrid interaction between interactor A and interactor B.
struct biogridtype {
    int ID_Interactor_A;
    int ID_Interactor_B;
    int interaction_type;  // bit pattern describing the interaction
};
// Minimal gene record: official name plus entrez id.
struct smallgenetype {
    char *hugo;         // hugo = human gene name nomenclature authority ("official gene name")
    unsigned int egid;  // entrez gene id
};
// Per-pathway working record. Arrays of these are passed to l2pfunc(),
// GPCC(), do_pvals_and_bh() and do_just_bh(), and returned by
// setup_used_paths().
struct used_path_type {
    unsigned int category;
    char *custom_category_name;
    char *acc;
    char *name;
    unsigned int numgenes;       // original number of genes in pathway
    unsigned int numfixedgenes;  // number of genes after fixing
    unsigned int *egids;
    unsigned int hitcnt;
    unsigned int *genehits;      // hits are stored here because they need to be printed out
    unsigned int aughitcnt;      // not used (original note: "fix")
    double pathhits_gpsum;       // # of pathways by each hit gene in pathway
    unsigned int pathcountsum;   // # of pathways for each gene in pathway
    double OR;
    double gpcc_OR;
    double pval;
    double pval2;                // alt
    double permute_pval;         // permute
    double gpcc_p;
    double fdr;
    double gpcc_fdr;
    double enrichment_score;     // ratio
    unsigned int pwgenesindex;

    // Table counts (these were originally plain int):
    //   a = universe-userinput-pwgenes-list, b = pw-hits,
    //   c = degs-hits, d = number of hits
    unsigned int a;
    unsigned int b;
    unsigned int c;
    unsigned int d;
    unsigned int A_scaled;
    unsigned int B_scaled;
    unsigned int C_scaled;
    unsigned int D_scaled;

    // Permutation bookkeeping; this section used to be "#if NELSON_C".
#if 1
    unsigned int randhits;
    unsigned int countover;      // data hits value > permutation p hits
    unsigned int countequal;
    unsigned int countunder;     // redundant
    double p_permute_over;
    double p_permute_under;      // redundant
#endif

    double pval4;
    double fdr4;
};
// Binary-tree node for one gene, with per-gene pathway counts.
struct tree_with_count {
    unsigned int val;    // entrez gene id; sometimes called "egid"
    unsigned int count;  // number of pathways this gene hits
    unsigned int deg;    // 1 = on deglist, 0 = not on it
                         // (deglist = "differentially expressed gene list", aka user inlist)
    struct tree_with_count *left;
    struct tree_with_count *right;
    struct used_path_type **all_gene_paths;  // array of pointers, of "count" size
    unsigned int pathindex;                  // which array member gets the pointer to pathway?
};
// One custom pathway: a name, an optional string and a gene array.
struct custom_type {
    char *name;
    char *optional;         // should in practice be the accession?
    unsigned int numgenes;
    unsigned int *genes;    // NOTE(review): presumably numgenes entries -- confirm
};
// Pairs an "ens" string with a gene symbol.
struct ens2gene_type {
    char *ens;
    char *symbol;
};
// Pairs two (taxid, ensidx) entries.
// NOTE(review): the meaning of ensidx1/ensidx2 is not visible in this
// header -- presumably indexes into an ensembl table; confirm.
struct a2a_type
{
    int taxid1;
    int taxid2;
    int ensidx1;
    int ensidx2;
};
// One synonym entry: a synonym string together with a GeneID and a Symbol.
struct synonym_type
{
    char *Synonym;
    int GeneID;
    char *Symbol;
    int status;
};
// Gene id together with its hugo and ens strings.
// This is the element type returned by egids2hugos().
struct entrez_hugo_ensemble_type {
    unsigned int gene_id;  // original note: "note case of value is zero" -- presumably 0 needs special handling; confirm
    char *hugo;
    char *ens;
};
// ---- Function prototypes ----
// The definitions live outside this header. The notes below describe only what
// the signatures show; anything marked "presumably" needs confirming there.

// Category helpers: conversions between category bit patterns and strings.
// The puthere[] parameters are caller-supplied char buffers; no size is passed,
// so the required capacity must be checked against the implementation.
void category_set_all(unsigned int *pat);
void category_code_to_string(unsigned int cat,char puthere[]);
int string_to_category_code(char cats[]);
void categories_pattern_to_strings(unsigned int cat,char puthere[]);
// Tests on a 2x2 table given as four cell counts.
double exact22(int n11_,int n12_,int n21_,int n22_); // fishers exact
double exact22_oneside(int n11_,int n12_,int n21_,int n22_, int dbg);
// Conversions between strings and type codes, and between entrez gene ids
// ("egid") and hugo names.
unsigned int string2type(char *s);
int bitCount(int n);
int setup_by_egids(void);
char *egid2hugo(int egid);
unsigned int hugo2egid(char *h);
char *type2string(int type);
// Comparison callbacks with the (const void *, const void *) -> int signature
// that qsort() expects.
int cmp_ui(const void *a, const void *b);
unsigned int *get_used_universe(struct used_path_type *u, unsigned int num_used, unsigned int *real_universe_cnt);
int cmp_ordertype_by_val_REV(const void *a, const void *b);
int cmp_usi(const void *a, const void *b);
// Calculations over an array of used_path_type records.
int do_pvals_and_bh(unsigned int ingenecnt, struct used_path_type usedpaths[], unsigned int numusedpaths,unsigned int real_universe_cnt, int oneside);
unsigned int GPCC(struct used_path_type usedpaths[], unsigned int num_used_paths, unsigned int real_universe_cnt, unsigned int *real_universe);
int do_just_bh(unsigned int ingenecnt, struct used_path_type usedpaths[], unsigned int num_used_paths,unsigned int real_universe_cnt);
// void malloc_pathpointers(struct tree_with_count *node); // counts aligned with universe (real_universe)
// Operates on an array of "size" unsigned ints (presumably the radix sort selected by RADIX_SORT).
void radix_ui(register unsigned int vector[], register const unsigned int size) ;
// calc_option presumably takes one of the CALC_OPTION_* values -- confirm.
int l2pfunc(struct used_path_type *usedpaths,unsigned int num_used_paths,unsigned int real_universe_cnt,
unsigned int *real_universe, int calc_option, int *user_incnt_ptr, int oneside, unsigned int numpermutes);
// Both take an array and its length "len" and return a pointer to result records.
// NOTE(review): ownership of the returned memory is not visible here -- confirm who frees it.
struct updated_genes_type *updategenesR(char *genes[], const int len);
struct entrez_hugo_ensemble_type *egids2hugos(unsigned int egids[], const int len);
// Returns a pointer to used_path_type records; the pointer parameters
// (num_used_paths, real_universe_cnt_ptr, real_universe) are presumably outputs -- confirm.
struct used_path_type *setup_used_paths(unsigned int *num_used_paths, unsigned int catspat, char universe_file[], unsigned int in_universe_cnt,unsigned int *in_universe, char custom_file[], unsigned int gmtfld2, unsigned int *real_universe_cnt_ptr,unsigned int **real_universe,unsigned int lencust,struct custom_type *mycustompw);
// p is a read-only input array and pa a writable array; size is presumably the
// length of both ("bh" presumably = Benjamini-Hochberg -- confirm).
void bh_adjusted(const double *p, double *pa, int size) ;
// Takes the four cells of a 2x2 table and three double * parameters
// (_left, _right, two), presumably outputs -- confirm.
double kt_fisher_exact(int n11, int n12, int n21, int n22, double *_left, double *_right, double *two);
// NOTE(review): presumably a strlcpy-style bounded copy into dst, limited by maxx -- confirm semantics.
size_t my_strlcpy(char *dst, const char *s, size_t maxx);