diff --git a/src/Cmd.cpp b/src/Cmd.cpp index dfd3524..4bc2af0 100644 --- a/src/Cmd.cpp +++ b/src/Cmd.cpp @@ -25,10 +25,10 @@ Param::Param(int argc, char **argv) { "\033[0m\n"}; OptionParser opts(copyr + "Main options"); auto help_opt = opts.add("h", "help", "print all options including hidden advanced options"); - auto svd_opt = opts.add>("d", "svd", "svd method to be applied. default 2 is recommended for big data.\n" + auto svd_opt = opts.add>("d", "svd", "SVD method to be applied. default 2 is recommended for big data.\n" "0: the Implicitly Restarted Arnoldi Method (IRAM)\n" "1: the Yu's single-pass Randomized SVD with power iterations\n" - "2: the proposed window-based Randomized SVD method\n" + "2: the accurate window-based Randomized SVD method (PCAone)\n" "3: the full Singular Value Decomposition.", 2); auto plinkfile = opts.add>("b", "bfile", "prefix to PLINK .bed/.bim/.fam files", "", &filein); auto binfile = opts.add>("B", "binary", "path of binary file", "", &filein); @@ -37,25 +37,25 @@ Param::Param(int argc, char **argv) { auto beaglefile = opts.add>("G", "beagle", "path of BEAGLE file compressed by gzip", "", &filein); opts.add>("k", "pc", "top k principal components (PCs) to be calculated", k, &k); opts.add>("m", "memory", "desired RAM usage in GB unit for out-of-core mode. default is in-core mode", memory, &memory); - opts.add>("n", "threads", "number of threads to be used", threads, &threads); + opts.add>("n", "threads", "the number of threads to be used", threads, &threads); opts.add>("o", "out", "prefix to output files. default [pcaone]", fileout, &fileout); opts.add>("p", "maxp", "maximum number of power iterations for RSVD algorithm", maxp, &maxp); - opts.add("S", "no-shuffle", "do not shuffle the data for --svd 2 if it is already permuted", &noshuffle); + opts.add("S", "no-shuffle", "do not shuffle columns of data for --svd 2 (if not locally correlated)", &noshuffle); opts.add("v", "verbose", "verbose message output", &verbose); - opts.add, Attribute::advanced>("w", "batches", "number of mini-batches to be used by PCAone --svd 2", bands, &bands); + opts.add, Attribute::advanced>("w", "batches", "the number of mini-batches used by --svd 2", bands, &bands); opts.add>("C", "scale", "do scaling for input file.\n" "0: do just centering\n" "1: do log transformation eg. log(x+0.01) for RNA-seq data\n" "2: do count per median log transformation (CPMED) for scRNAs", scale, &scale); - opts.add("", "emu", "uses EMU algorithm for genotype input with missingness", &emu); - opts.add("", "pcangsd", "uses PCAngsd algorithm for genotype likelihood input", &pcangsd); + opts.add("", "emu", "use EMU algorithm for genotype input with missingness", &emu); + opts.add("", "pcangsd", "use PCAngsd algorithm for genotype likelihood input", &pcangsd); opts.add>("", "maf", "exclude variants with MAF lower than this value", maf, &maf); opts.add("V", "printv", "output the right eigenvectors with suffix .loadings", &printv); opts.add("", "ld", "output a binary matrix for downstream LD related analysis", &ld); auto bimfile = opts.add>("", "ld-bim", "variants information in plink bim file related to LD matrix", "", &filebim); - opts.add>("", "ld-r2", "r2 cutoff for LD-based pruning.", ld_r2, &ld_r2); - opts.add>("", "ld-bp", "physical distance threshold in bases for LD (usually. 1000000)", ld_bp, &ld_bp); + opts.add>("", "ld-r2", "r2 cutoff for LD-based pruning. (usually 0.2)", ld_r2, &ld_r2); + opts.add>("", "ld-bp", "physical distance threshold in bases for LD. (usually 1000000)", ld_bp, &ld_bp); opts.add>("", "ld-stats", "statistics to calculate LD r2 for pairwise SNPs.\n" "0: the ancestry adjusted, i.e. correlation between residuals\n" "1: the standard, i.e. correlation between two alleles\n", @@ -67,16 +67,16 @@ Param::Param(int argc, char **argv) { opts.add>("", "clump-p2", "secondary significance threshold for clumped SNPs", clump_p2, &clump_p2); opts.add>("", "clump-r2", "r2 cutoff for LD-based clumping", clump_r2, &clump_r2); opts.add>("", "clump-bp", "physical distance threshold in bases for clumping", clump_bp, &clump_bp); - opts.add("U", "printu", "output eigen vector of each epoch (for tests)", &printu); - opts.add, Attribute::advanced>("", "M", "number of features (eg. SNPs) if already known", 0, &nsnps); - opts.add, Attribute::advanced>("", "N", "number of samples if already known", 0, &nsamples); + opts.add("", "printu", "output eigen vector of each epoch (for tests)", &printu); + opts.add, Attribute::advanced>("", "M", "the number of features (eg. SNPs) if already known", 0, &nsnps); + opts.add, Attribute::advanced>("", "N", "the number of samples if already known", 0, &nsamples); // opts.add("", "debug", "turn on debugging mode", &debug); opts.add("", "haploid", "the plink format represents haploid data", &haploid); opts.add, Attribute::advanced>("", "buffer", "memory buffer in GB unit for permuting the data", buffer, &buffer); - opts.add, Attribute::advanced>("", "imaxiter", "maximum number of IRAM interations", imaxiter, &imaxiter); - opts.add, Attribute::advanced>("", "itol", "tolerance for IRAM algorithm", itol, &itol); - opts.add, Attribute::advanced>("", "ncv", "number of Lanzcos basis vectors for IRAM", ncv, &ncv); - opts.add, Attribute::advanced>("", "oversamples", "number of oversampling columns for RSVD", oversamples, &oversamples); + opts.add, Attribute::advanced>("", "imaxiter", "maximum number of IRAM iterations", imaxiter, &imaxiter); + opts.add, Attribute::advanced>("", "itol", "stopping tolerance for IRAM algorithm", itol, &itol); + opts.add, Attribute::advanced>("", "ncv", "the number of Lanzcos basis vectors for IRAM", ncv, &ncv); + opts.add, Attribute::advanced>("", "oversamples", "the number of oversampling columns for RSVD", oversamples, &oversamples); opts.add, Attribute::advanced>("", "rand", "the random matrix type. 0: uniform, 1: guassian", rand, &rand); opts.add, Attribute::advanced>("", "tol-rsvd", "tolerance for RSVD algorithm", tol, &tol); opts.add, Attribute::advanced>("", "tol-em", "tolerance for EMU/PCAngsd algorithm", tolem, &tolem); @@ -114,7 +114,7 @@ Param::Param(int argc, char **argv) { exit(EXIT_SUCCESS); } else if (argc == 1) { std::cout << opts << "\n"; - exit(EXIT_FAILURE); + exit(EXIT_SUCCESS); } if (svd_opt->value() == 0) svd_t = SvdType::IRAM; diff --git a/src/Cmd.hpp b/src/Cmd.hpp index 01f25e9..da6373f 100644 --- a/src/Cmd.hpp +++ b/src/Cmd.hpp @@ -26,7 +26,7 @@ class Param { uint nsnps = 0; uint k = 10; uint maxp = 40; // maximum number of power iterations - uint threads = 10; + uint threads = 8; uint blocksize = 0; uint bands = 64; bool pca = true; diff --git a/src/Data.cpp b/src/Data.cpp index becb10e..5db4ea0 100644 --- a/src/Data.cpp +++ b/src/Data.cpp @@ -33,8 +33,8 @@ void Data::prepare() { if (params.memory > 1.1 * m) m = 0; else - cao.warning("minimum RAM required is " + to_string(m) + - " GB. trying to allocate more RAM."); + cao.warn("minimum RAM required is " + to_string(m) + + " GB. trying to allocate more RAM."); params.blocksize = (unsigned int)ceil( (double)((m + params.memory) * 134217728 - 3 * nsamples * l - 2 * nsnps * l - 5 * nsnps) / @@ -47,7 +47,7 @@ void Data::prepare() { if (nblocks == 1) { params.out_of_core = false; read_all(); - cao.warning("only one block exists. will run with in-core mode"); + cao.warn("only one block exists. will run with in-core mode"); } else { if (params.svd_t == SvdType::PCAoneAlg2 && params.pca) { // decrease blocksize to fit the fancy halko diff --git a/src/FileBinary.cpp b/src/FileBinary.cpp index d71f192..b718fca 100644 --- a/src/FileBinary.cpp +++ b/src/FileBinary.cpp @@ -16,7 +16,7 @@ void FileBin::check_file_offset_first_var() { } else { ifs_bin.seekg(magic, std::ios_base::beg); if (params.verbose) - cao.warning("make sure you are runing PCAone algorithm2"); + cao.warn("confirm you are running the window-based RSVD (algorithm2)"); } } diff --git a/src/FileCsv.cpp b/src/FileCsv.cpp index 0c8df7c..9c009b1 100644 --- a/src/FileCsv.cpp +++ b/src/FileCsv.cpp @@ -68,7 +68,7 @@ void FileCsv::check_file_offset_first_var() { } else { rewind(zbuf.fin); if (params.verbose) - cao.warning("make sure you are runing PCAone algorithm2"); + cao.warn("confirm you are running the window-based RSVD (algorithm2)"); } zbuf.lastRet = 1; zbuf.buffCur = ""; diff --git a/src/FilePlink.cpp b/src/FilePlink.cpp index b9f56b6..3a9dce4 100644 --- a/src/FilePlink.cpp +++ b/src/FilePlink.cpp @@ -20,7 +20,7 @@ void FileBed::check_file_offset_first_var() { } else { bed_ifstream.seekg(3, std::ios_base::beg); if (params.verbose) - cao.warning("make sure you are running PCAone (algorithm2)"); + cao.warn("confirm you are running the window-based RSVD (algorithm2)"); } } @@ -55,7 +55,7 @@ void FileBed::read_all() { // should remove sites with F=0, 0.5, 1.0 // especially,F=0.5 means sample standard deviation is 0 if (F(i) == 0.0 || F(i) == 0.5 || F(i) == 1.0) - cao.warning("please do remove SNPs with AF=0, 0.5 and 1.0"); + cao.warn("recommend to remove SNPs with AF=0, 0.5 and 1.0 first"); } filter_snps_resize_F(); // filter and resize nsnps @@ -156,7 +156,7 @@ void FileBed::read_block_initial(uint64 start_idx, uint64 stop_idx, F(snp_idx) /= c; } if (F(snp_idx) == 0.0 || F(snp_idx) == 0.5 || F(snp_idx) == 1.0) - cao.warning("please do remove SNPs with AF=0, 0.5 and 1.0"); + cao.warn("recommend to remove SNPs with AF=0, 0.5 and 1.0 first"); // do centering and initialing centered_geno_lookup(1, snp_idx) = 0.0; // missing centered_geno_lookup(0, snp_idx) = BED2GENO[0] - F(snp_idx); // minor hom diff --git a/src/Halko.cpp b/src/Halko.cpp index 1b4dbda..d4bad8c 100644 --- a/src/Halko.cpp +++ b/src/Halko.cpp @@ -173,9 +173,7 @@ void FancyRsvdOpData::computeGandH(MyMatrix& G, MyMatrix& H, int pi) { band = 1; blocksize = (unsigned int)ceil((double)data->nsnps / data->params.bands); if (blocksize < data->params.bands) - cao.warning( - "blocksize is smaller than window size. please consider IRAM " - "method."); + cao.warn("block size < window size. please consider the IRAM method"); // permute snps of G, see // https://stackoverflow.com/questions/15858569/randomly-permute-rows-columns-of-a-matrix-with-eigen if (data->params.perm) PCAone::permute_matrix(data->G, data->perm); @@ -228,7 +226,7 @@ void FancyRsvdOpData::computeGandH(MyMatrix& G, MyMatrix& H, int pi) { flip_Omg(Omg2, Omg); H2.setZero(); } else if ((b + 1) == data->nblocks) { - cao.warning("shouldn't go here if the bands is proper, ie. 2^x"); + cao.warn("shouldn't see this if mini-batches is 2^x"); H = H1 + H2; Eigen::HouseholderQR qr(H); Omg.noalias() = diff --git a/src/Logger.hpp b/src/Logger.hpp index a13de7a..e62b868 100644 --- a/src/Logger.hpp +++ b/src/Logger.hpp @@ -46,7 +46,7 @@ class Logger { } template - void warning(const T &s) { + void warn(const T &s) { if (is_screen) std::cout << std::endl << "\x1B[33m"