Skip to content

Commit

Permalink
LAST alignment statistics for MultiQC (#5902)
Browse files Browse the repository at this point in the history
* last train modified to output multiqc

* Report alignment length and percent similarity, for MultiQC.

* Update snapshot file.

* Document the new channels

* Fix indentation

* Update test results.

* Give a longer suffix for better MultiQC search patterns.

---------

Co-authored-by: Mohammed Mahdi <[email protected]>
  • Loading branch information
charles-plessy and U13bs1125 authored Jul 11, 2024
1 parent d8f64bc commit 882e20c
Show file tree
Hide file tree
Showing 9 changed files with 209 additions and 18 deletions.
28 changes: 24 additions & 4 deletions modules/nf-core/last/lastal/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ process LAST_LASTAL {

output:
tuple val(meta), path("*.maf.gz"), emit: maf
tuple val(meta), path("*.tsv") , emit: multiqc
path "versions.yml" , emit: versions

when:
Expand All @@ -25,15 +26,33 @@ process LAST_LASTAL {
"""
INDEX_NAME=\$(basename \$(ls $index/*.des) .des)
set -o pipefail
# Condense PSL records read from standard input into a small TSV table for
# MultiQC: one header line followed by one row holding the sample id, the
# summed alignment length and the percentage of aligned positions that match.
function calculate_psl_metrics() {
    awk '
        BEGIN {
            FS = "\t"
            identical = 0
            aligned = 0
            # Column names are printed up front, before any record is read.
            print "Sample\tTotalAlignmentLength\tPercentSimilarity"
        }
        {
            # Fields used: 1 matches, 2 misMatches, 3 repMatches,
            # 6 qBaseInsert, 8 tBaseInsert.
            identical += \$1 + \$3
            aligned += \$1 + \$2 + \$3 + \$6 + \$8
        }
        END {
            # With nothing aligned, report 0 instead of dividing by zero.
            if (aligned > 0) {
                similarity = identical / aligned * 100
            } else {
                similarity = 0
            }
            print "$meta.id" "\t" aligned "\t" similarity
        }
    '
}
lastal \\
-P $task.cpus \\
$trained_params \\
$args \\
${index}/\$INDEX_NAME \\
$fastx \\
| gzip --no-name > ${prefix}.\$INDEX_NAME.maf.gz
# gzip needs --no-name otherwise it puts a timestamp in the file,
# which makes its checksum non-reproducible.
$fastx |
tee >(gzip --no-name > ${prefix}.maf.gz) |
maf-convert psl |
calculate_psl_metrics > ${prefix}.tsv
cat <<-END_VERSIONS > versions.yml
"${task.process}":
Expand All @@ -48,6 +67,7 @@ process LAST_LASTAL {
"""
INDEX_NAME=STUB
echo stub | gzip --no-name > ${prefix}.\$INDEX_NAME.maf.gz
touch ${prefix}.tsv
cat <<-END_VERSIONS > versions.yml
"${task.process}":
Expand Down
4 changes: 4 additions & 0 deletions modules/nf-core/last/lastal/meta.yml
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,10 @@ output:
type: file
description: Gzipped MAF (Multiple Alignment Format) file
pattern: "*.{maf.gz}"
- multiqc:
type: file
description: Alignment summary for MultiQC
pattern: "*.tsv"
authors:
- "@charles-plessy"
maintainers:
Expand Down
68 changes: 61 additions & 7 deletions modules/nf-core/last/lastal/tests/main.nf.test.snap
Original file line number Diff line number Diff line change
Expand Up @@ -8,10 +8,19 @@
"id": "contigs",
"single_end": false
},
"contigs.genome.maf.gz:md5,902274b72657f62d270d284dc211aa7f"
"contigs.maf.gz:md5,902274b72657f62d270d284dc211aa7f"
]
],
"1": [
[
{
"id": "contigs",
"single_end": false
},
"contigs.tsv:md5,f028e69bd64e54080b9a03fd809cba74"
]
],
"2": [
"versions.yml:md5,e0a425d7cbca674252a1e4328b247ca2"
],
"maf": [
Expand All @@ -20,7 +29,16 @@
"id": "contigs",
"single_end": false
},
"contigs.genome.maf.gz:md5,902274b72657f62d270d284dc211aa7f"
"contigs.maf.gz:md5,902274b72657f62d270d284dc211aa7f"
]
],
"multiqc": [
[
{
"id": "contigs",
"single_end": false
},
"contigs.tsv:md5,f028e69bd64e54080b9a03fd809cba74"
]
],
"versions": [
Expand All @@ -32,7 +50,7 @@
"nf-test": "0.8.4",
"nextflow": "24.04.2"
},
"timestamp": "2024-06-06T23:11:59.764152"
"timestamp": "2024-07-02T17:57:48.589408"
},
"sarscov2 - contigs - genome - stub": {
"content": [
Expand All @@ -47,6 +65,15 @@
]
],
"1": [
[
{
"id": "contigs",
"single_end": false
},
"contigs.tsv:md5,d41d8cd98f00b204e9800998ecf8427e"
]
],
"2": [
"versions.yml:md5,e0a425d7cbca674252a1e4328b247ca2"
],
"maf": [
Expand All @@ -58,6 +85,15 @@
"contigs.STUB.maf.gz:md5,f50b84b1db4b83ba62ec1deacc69c260"
]
],
"multiqc": [
[
{
"id": "contigs",
"single_end": false
},
"contigs.tsv:md5,d41d8cd98f00b204e9800998ecf8427e"
]
],
"versions": [
"versions.yml:md5,e0a425d7cbca674252a1e4328b247ca2"
]
Expand All @@ -67,7 +103,7 @@
"nf-test": "0.8.4",
"nextflow": "24.04.2"
},
"timestamp": "2024-06-06T23:12:43.028075"
"timestamp": "2024-07-02T17:58:30.521811"
},
"sarscov2 - contigs - genome - withparams": {
"content": [
Expand All @@ -78,10 +114,19 @@
"id": "contigs",
"single_end": false
},
"contigs.genome.maf.gz:md5,8cb97b6daa34dbf9c723a2c4a984992d"
"contigs.maf.gz:md5,8cb97b6daa34dbf9c723a2c4a984992d"
]
],
"1": [
[
{
"id": "contigs",
"single_end": false
},
"contigs.tsv:md5,f315664aa18f1f6bad79486f9750f200"
]
],
"2": [
"versions.yml:md5,e0a425d7cbca674252a1e4328b247ca2"
],
"maf": [
Expand All @@ -90,7 +135,16 @@
"id": "contigs",
"single_end": false
},
"contigs.genome.maf.gz:md5,8cb97b6daa34dbf9c723a2c4a984992d"
"contigs.maf.gz:md5,8cb97b6daa34dbf9c723a2c4a984992d"
]
],
"multiqc": [
[
{
"id": "contigs",
"single_end": false
},
"contigs.tsv:md5,f315664aa18f1f6bad79486f9750f200"
]
],
"versions": [
Expand All @@ -102,6 +156,6 @@
"nf-test": "0.8.4",
"nextflow": "24.04.2"
},
"timestamp": "2024-06-06T23:12:21.536568"
"timestamp": "2024-07-02T17:58:09.677672"
}
}
26 changes: 25 additions & 1 deletion modules/nf-core/last/split/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ process LAST_SPLIT {

output:
tuple val(meta), path("*.maf.gz"), emit: maf
tuple val(meta), path("*.tsv") , emit: multiqc
path "versions.yml" , emit: versions

when:
Expand All @@ -23,7 +24,29 @@ process LAST_SPLIT {
if( "$maf" == "${prefix}.maf.gz" ) error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!"
"""
set -o pipefail
zcat < $maf | last-split $args | gzip --no-name > ${prefix}.maf.gz
# Condense PSL records read from standard input into a small TSV table for
# MultiQC: one header line followed by one row holding the sample id, the
# summed alignment length and the percentage of aligned positions that match.
function calculate_psl_metrics() {
    awk '
        BEGIN {
            FS = "\t"
            identical = 0
            aligned = 0
            # Column names are printed up front, before any record is read.
            print "Sample\tTotalAlignmentLength\tPercentSimilarity"
        }
        {
            # Fields used: 1 matches, 2 misMatches, 3 repMatches,
            # 6 qBaseInsert, 8 tBaseInsert.
            identical += \$1 + \$3
            aligned += \$1 + \$2 + \$3 + \$6 + \$8
        }
        END {
            # With nothing aligned, report 0 instead of dividing by zero.
            if (aligned > 0) {
                similarity = identical / aligned * 100
            } else {
                similarity = 0
            }
            print "$meta.id" "\t" aligned "\t" similarity
        }
    '
}
zcat < $maf |
last-split $args |
tee >(gzip --no-name > ${prefix}.maf.gz) |
maf-convert psl |
calculate_psl_metrics > ${prefix}.tsv
cat <<-END_VERSIONS > versions.yml
"${task.process}":
Expand All @@ -37,6 +60,7 @@ process LAST_SPLIT {
if( "$maf" == "${prefix}.maf.gz" ) error "Input and output names are the same, use \"task.ext.prefix\" to disambiguate!"
"""
echo stub | gzip --no-name > ${prefix}.maf.gz
touch ${prefix}.tsv
cat <<-END_VERSIONS > versions.yml
"${task.process}":
Expand Down
4 changes: 4 additions & 0 deletions modules/nf-core/last/split/meta.yml
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,10 @@ output:
type: file
description: Multiple Alignment Format (MAF) file, compressed with gzip
pattern: "*.{maf.gz}"
- multiqc:
type: file
description: Alignment summary for MultiQC
pattern: "*.tsv"
authors:
- "@aleksandrabliznina"
- "@charles-plessy"
Expand Down
36 changes: 34 additions & 2 deletions modules/nf-core/last/split/tests/main.nf.test.snap
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,14 @@
]
],
"1": [
[
{
"id": "sarscov.contigs.genome"
},
"sarscov.contigs.genome.tsv:md5,b625a3b37343e9e6a279b8625d4c2da8"
]
],
"2": [
"versions.yml:md5,9e429d0800988ae0bbe5000827d34ad1"
],
"maf": [
Expand All @@ -21,6 +29,14 @@
"sarscov.contigs.genome.maf.gz:md5,689cb18ff7098ff90eaf87017f590208"
]
],
"multiqc": [
[
{
"id": "sarscov.contigs.genome"
},
"sarscov.contigs.genome.tsv:md5,b625a3b37343e9e6a279b8625d4c2da8"
]
],
"versions": [
"versions.yml:md5,9e429d0800988ae0bbe5000827d34ad1"
]
Expand All @@ -30,7 +46,7 @@
"nf-test": "0.8.4",
"nextflow": "24.04.2"
},
"timestamp": "2024-06-06T17:49:24.045661"
"timestamp": "2024-07-02T11:45:00.535348"
},
"sarscov2 - contigs_genome - stub": {
"content": [
Expand All @@ -44,6 +60,14 @@
]
],
"1": [
[
{
"id": "sarscov.contigs.genome"
},
"sarscov.contigs.genome.tsv:md5,d41d8cd98f00b204e9800998ecf8427e"
]
],
"2": [
"versions.yml:md5,9e429d0800988ae0bbe5000827d34ad1"
],
"maf": [
Expand All @@ -54,6 +78,14 @@
"sarscov.contigs.genome.maf.gz:md5,f50b84b1db4b83ba62ec1deacc69c260"
]
],
"multiqc": [
[
{
"id": "sarscov.contigs.genome"
},
"sarscov.contigs.genome.tsv:md5,d41d8cd98f00b204e9800998ecf8427e"
]
],
"versions": [
"versions.yml:md5,9e429d0800988ae0bbe5000827d34ad1"
]
Expand All @@ -63,6 +95,6 @@
"nf-test": "0.8.4",
"nextflow": "24.04.2"
},
"timestamp": "2024-06-06T17:50:20.139442"
"timestamp": "2024-07-02T11:45:21.243325"
}
}
12 changes: 12 additions & 0 deletions modules/nf-core/last/train/main.nf
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@ process LAST_TRAIN {

output:
tuple val(meta), path("*.train"), emit: param_file
tuple val(meta), path("*.tsv") , emit: multiqc
path "versions.yml" , emit: versions

when:
Expand All @@ -31,6 +32,16 @@ process LAST_TRAIN {
$fastx \\
> ${prefix}.\$INDEX_NAME.train
echo "id\tsubstitution_percent_identity\tlast -t\tlast -a\tlast -A\tlast -b\tlast -B\tlast -S" > ${prefix}.train.tsv
printf "\$(basename ${prefix}.\$INDEX_NAME.train .target.train)\t" >> ${prefix}.train.tsv
grep 'substitution percent identity' ${prefix}.\$INDEX_NAME.train | tail -n 1 | awk '{print \$5}' | tr '\n' '\t' >> ${prefix}.train.tsv
grep 'last -t' ${prefix}.\$INDEX_NAME.train | tail -n 1 | awk '{print \$2}' | sed -e 's/-t//' | tr '\n' '\t' >> ${prefix}.train.tsv
grep 'last -a' ${prefix}.\$INDEX_NAME.train | tail -n 1 | awk '{print \$3}' | tr '\n' '\t' >> ${prefix}.train.tsv
grep 'last -A' ${prefix}.\$INDEX_NAME.train | tail -n 1 | awk '{print \$3}' | tr '\n' '\t' >> ${prefix}.train.tsv
grep 'last -b' ${prefix}.\$INDEX_NAME.train | tail -n 1 | awk '{print \$3}' | tr '\n' '\t' >> ${prefix}.train.tsv
grep 'last -B' ${prefix}.\$INDEX_NAME.train | tail -n 1 | awk '{print \$3}' | tr '\n' '\t' >> ${prefix}.train.tsv
grep 'last -S' ${prefix}.\$INDEX_NAME.train | tail -n 1 | awk '{print \$3}' >> ${prefix}.train.tsv
cat <<-END_VERSIONS > versions.yml
"${task.process}":
last: \$(lastdb --version | sed 's/lastdb //')
Expand All @@ -43,6 +54,7 @@ process LAST_TRAIN {
"""
INDEX_NAME=STUB
touch ${prefix}.\$INDEX_NAME.train
touch ${prefix}.train.tsv
cat <<-END_VERSIONS > versions.yml
"${task.process}":
Expand Down
5 changes: 5 additions & 0 deletions modules/nf-core/last/train/meta.yml
Original file line number Diff line number Diff line change
Expand Up @@ -42,8 +42,13 @@ output:
type: file
description: Trained parameter file
pattern: "*.train"
- multiqc:
type: file
description: Alignment parameter summary for MultiQC
pattern: "*.tsv"
authors:
- "@aleksandrabliznina"
- "@charles-plessy"
- "@U13bs1125"
maintainers:
- "@charles-plessy"
Loading

0 comments on commit 882e20c

Please sign in to comment.