-
Notifications
You must be signed in to change notification settings - Fork 1
/
02_split.sh
61 lines (49 loc) · 2.2 KB
/
02_split.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
#!/bin/bash
#
# Split prefiltered.fasta.xz into random and time-based subsets, compress
# them, run nextclade on every time-based subset, and filter each time set.
#
# Usage: 02_split.sh <cores> <metadata> <time>
#   cores     value passed to `xz -T` and `parallel -j`
#   metadata  path handed to src/filter_times.py as its second argument
#   time      value passed to src/split_by_time.py --last, in the format YYYY-MM
#
# All paths are relative, so the script must be started from the directory
# that contains prefiltered.fasta.xz, data/, src/ and covariants/.
set -euo pipefail

# Print a usage message on stderr and exit with status 2.
usage() {
  printf 'Usage: %s <cores> <metadata> <time (YYYY-MM)>\n' "${0##*/}" >&2
  exit 2
}

#######################################
# Compress every *.fasta file in a directory with xz.
# Globals:   cores (read)
# Arguments: $1 - directory holding the .fasta files
# Returns:   non-zero (aborting the script) if xz fails, which includes the
#            case where the directory holds no .fasta file at all.
#######################################
compress_fastas() {
  local dir=$1
  xz -T "$cores" -- "$dir"/*.fasta
}

#######################################
# Run src/split_by_time.py into data/<name>/ and compress the result.
# Globals:   time (read), cores (read)
# Arguments: $1  - dataset name (sub-directory of data/)
#            $2… - extra arguments appended to the split_by_time.py call
#######################################
split_by_time() {
  local name=$1
  shift
  mkdir -p "data/${name}"
  # The trailing slash on the output directory is kept exactly as the Python
  # script has always received it.
  xzcat prefiltered.fasta.xz \
    | python3 src/split_by_time.py data/tree_subset.txt "data/${name}/" \
        --last "$time" "$@"
  compress_fastas "data/${name}"
}

#######################################
# Run nextclade on every data/<name>/*.fasta.xz (one single-threaded job per
# file, scheduled by GNU parallel), then run src/filter_times.py on the set.
# Globals:   cores (read), metadata (read)
# Arguments: $1 - dataset name (sub-directory of data/)
# Outputs:   out/<name>-lineages/<file>.tsv and data/<name>-filtered/
#######################################
assign_and_filter() {
  local name=$1
  local seqs stem

  mkdir -p "out/${name}-lineages"
  # Glob instead of parsing `ls | grep`: only names that really end in
  # .fasta.xz are picked up. The job list is piped straight into parallel, so
  # no scratch file is left behind when a job fails.
  for seqs in "data/${name}"/*.fasta.xz; do
    [[ -e "$seqs" ]] || continue # unmatched glob: directory has no .fasta.xz
    stem=${seqs##*/}
    stem=${stem%.fasta.xz}
    # %q shell-quotes the paths because parallel hands each line to a shell.
    printf 'nextclade run %q --output-tsv %q --input-dataset data/nextclade/ -j 1\n' \
      "$seqs" "out/${name}-lineages/${stem}.tsv"
  done | parallel -j "$cores" --progress

  mkdir -p "data/${name}-filtered/"
  # NOTE(review): presumably filter_times.py imports bad_sequences from src/;
  # the copy is repeated per dataset exactly as before (cheap and idempotent).
  cp covariants/scripts/bad_sequences.py src/
  python3 src/filter_times.py "data/${name}" "$metadata" "data/${name}-filtered/"
}

main() {
  (( $# >= 3 )) || usage
  cores=$1
  metadata=$2
  # in the format YYYY-MM
  time=$3

  # Random subsets of the listed sizes.
  mkdir -p data/subsets
  xzcat prefiltered.fasta.xz \
    | python3 src/split_randomly.py data/tree_subset.txt data/subsets \
        1000 10000 100000 1000000
  compress_fastas data/subsets

  # Time-based subsets: unrestricted, then with -n 10000 and -n 100000.
  split_by_time time
  split_by_time time-10000 -n 10000
  split_by_time time-100k -n 100000

  # The nextclade dataset is fetched once and shared by all runs below.
  nextclade dataset get -n sars-cov-2 -o data/nextclade

  assign_and_filter time
  assign_and_filter time-10000
  assign_and_filter time-100k
}

main "$@"