diff --git a/CHANGES.md b/CHANGES.md index 815c46f..f3dd1a5 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -1,9 +1,10 @@ -v0.2.7 (dev) +v0.2.7 ====== + new subcommand `ancestry` to predict ancestry using a simple neural network on the somalier sketches. creates an interactive html output and a text file + fix for "Argument list too long" on huge cohorts (#37) + sub-sample .pairs.tsv output for huge cohorts -- only for unrelated samples. ++ better sub-sampling of html output v0.2.6 ====== diff --git a/README.md b/README.md index 7e50380..912780f 100644 --- a/README.md +++ b/README.md @@ -30,6 +30,9 @@ to add/remove samples or adjust a pedigree file and re-run iteratively. For example to add the **n + 1th** samples, just run `somalier extract` on the new sample and then re-use the already extracted data from the `n` original samples. +For *huge* sample-sets, if you run into a bash error for *argument list too long*, you can pass the somalier files as quoted +glob strings like: `"/path/to/set-a/*.somalier" "/path/to/set-b/*.somalier"`. + ## Usage The usage is also described above. Briefly, run: diff --git a/src/somalier.nim b/src/somalier.nim index 07922c1..edc7f5c 100644 --- a/src/somalier.nim +++ b/src/somalier.nim @@ -232,7 +232,7 @@ proc main() = "extract": pair(f:extract_main, description: "extract genotype-like information for a single sample from VCF/BAM/CRAM."), "relate": pair(f:rel_main, description: "aggregate `extract`ed information and calculate relatedness among samples."), "ancestry": pair(f:ancestry_main, description: "perform ancestry prediction on a set of samples, given a set of labeled samples"), - "depthview": pair(f:depth_main, description: "plot per-chromosome depth for each sample for quick quality-control"), + #"depthview": pair(f:depth_main, description: "plot per-chromosome depth for each sample for quick quality-control"), "find-sites": pair(f:findsites_main, description: "create a new sites.vcf.gz file from a population VCF (this is rarely needed)."), }.toOrderedTable diff --git a/src/somalierpkg/ancestry.nim b/src/somalierpkg/ancestry.nim index 94fdcac..c381a17 100644 --- a/src/somalierpkg/ancestry.nim +++ b/src/somalierpkg/ancestry.nim @@ -52,6 +52,11 @@ type ForHtml = ref object probs: seq[float32] # probability of maximum prediction ancestry_label: string +proc subset(T: var Tensor[float32], Q: var Tensor[float32], labels: var Tensor[int]) = + echo T.shape + echo Q.shape + echo labels.shape + proc ancestry_main*() = var argv = commandLineParams() @@ -117,13 +122,17 @@ proc ancestry_main*() = vec[j] = ac.ab(5).alts.float32 query_mat[i] = vec + var nPCs = parseInt(opts.n_pcs) T = train_mat.toTensor() + Q = query_mat.toTensor() Y = int_labels.toTensor() #.astype(float32)#.unsqueeze(0).transpose t0 = cpuTime() res = T.pca(nPCs) #, center=true) #, n_power_iters=4) + #subset(T, Q, Y) + stderr.write_line &"[somalier] time for dimensionality reduction to shape {res.projected.shape}: {cpuTime() - t0:.2f} seconds" let @@ -187,7 +196,6 @@ proc ancestry_main*() = let t_probs = model.forward(X).value.softmax #.argmax(axis=1).squeeze let - Q = query_mat.toTensor() q_proj = Q * res.components q_probs = model.forward(ctx.variable q_proj).value.softmax q_pred = q_probs.argmax(axis=1).squeeze diff --git a/src/somalierpkg/results.html b/src/somalierpkg/results.html index 2cb47b4..4d942b6 100644 --- a/src/somalierpkg/results.html +++ b/src/somalierpkg/results.html @@ -157,18 +157,18 @@
Sample Depth Metrics
var sample_data = var input = -var colors = ['rgba(55,126,184,0.7)', 'rgba(228,26,28,0.7)', 'rgba(77,175,74,0.7)', 'rgba(152,78,163,0.7)', 'rgba(255,127,0,0.7)', 'rgba(166,86,40,0.7)', 'rgba(247,129,191,0.7)'] +var colors = ['rgba(55,126,184,0.7)', 'rgba(228,26,28,0.7)', 'rgba(152,78,163,0.7)', 'rgba(255,127,0,0.7)', 'rgba(166,86,40,0.7)', 'rgba(247,129,191,0.7)', 'rgba(77,175,74,0.7)',] var size if (sample_data.length > 700) { - size = 6 + size = 8 } else if (sample_data.length > 200) { - size = 7 + size = 9 } else if (sample_data.length > 50) { - size = 8 -} else if (sample_data.length > 20) { size = 10 -} else { +} else if (sample_data.length > 20) { size = 12 +} else { + size = 15 } function set_xy_data_by_group(input, metric, is_x) {