From f53a9feaa9114077d9f8bb2084ccf12df414c174 Mon Sep 17 00:00:00 2001 From: John Marshall Date: Sat, 12 Mar 2022 14:07:40 +0000 Subject: [PATCH] Use SAM headers from index directory's .hdr if present Introduce a .hdr file alongside the other index files, useful for setting up detailed `@SQ` headers etc to be used by default when mapping. Output SAM headers from: -H options; .hdr file stored with the index files; the basic hard-coded `@HD`/`@SQ` default headers. An `@HD` header is always printed, with -H overriding one from .hdr (if any), which overrides the default one. A set of `@SQ` headers is always printed, with -H options overriding .hdr overriding the default, similarly. Other headers specified in either -H or .hdr are all output. If -H includes `@SQ` headers, we consider that the user has carefully set up all headers to be output and ignore .hdr entirely. Add descriptions of .alt (brief) and .hdr (complete) to the manual page. This .hdr file is implemented for mem, bwase, and bwape. It is not implemented for bwasw, which is deprecated. --- bwa.1 | 34 ++++++++++++++++- bwa.c | 107 +++++++++++++++++++++++++++++++++++++++++++++--------- bwa.h | 1 + bwape.c | 2 +- bwase.c | 2 +- fastmap.c | 2 +- 6 files changed, 127 insertions(+), 21 deletions(-) diff --git a/bwa.1 b/bwa.1 index eefb235a..37a91e32 100644 --- a/bwa.1 +++ b/bwa.1 @@ -71,6 +71,33 @@ RAM and does not work for databases with total length longer than 2GB. The second algorithm is adapted from the BWT-SW source code. It in theory works with database with trillions of bases. When this option is not specified, the appropriate algorithm will be chosen automatically. + +.PP +.B ADDITIONAL FILES: + +In addition to the files generated by the +.B index +command, several other files may be used by the mapping sub-commands if they +are placed alongside the generated index files: +.TP 4 +.IR prefix .alt +Identifies reference sequences that are alternative alleles, and provides +alignment information that can be used to lift ALT hits over to corresponding +positions within the primary assembly. +.TP +.IR prefix .hdr +Provides SAM headers to be used as the mapping output's headers, instead of +the default simple `@HD' and `@SQ' headers. This can be used to add descriptive +fields to the `@SQ' headers output: species, assembly, alternative names (which +enables querying for e.g. `chr1' or `1' interchangeably), alternate loci (ALT +contigs listed in +.IR prefix .alt +should have `AH:*' or similar), etc. If the file contains an `@HD' header, +it should include `SO:unsorted GO:query'. + +The output from +.B samtools dict +is a suitable starting point for creating this file. .RE .TP @@ -293,7 +320,12 @@ attached to every read in the output. An example is '@RG\\tID:foo\\tSM:bar'. .BI -H \ ARG If ARG starts with @, it is interpreted as a string and gets inserted into the output SAM header; otherwise, ARG is interpreted as a file with all lines -starting with @ in the file inserted into the SAM header. [null] +starting with @ in the file inserted into the SAM header. If the header lines +added via +.B -H +include any `@SQ' headers, the +.IR db.prefix .hdr +file (if it exists) will be ignored. [null] .TP .BI -o \ FILE Write the output SAM file to diff --git a/bwa.c b/bwa.c index 104c95c5..1c538ec6 100644 --- a/bwa.c +++ b/bwa.c @@ -404,39 +404,112 @@ int bwa_idx2mem(bwaidx_t *idx) * SAM header routines * ***********************/ -void bwa_print_sam_hdr(const bntseq_t *bns, const char *hdr_line) +// Remove the first line from lines, possibly printing it +// (lines is newline-separated but NOT newline-terminated) +static const char *remove_line(const char *lines, int print) +{ + const char *nl = strchr(lines, '\n'); + if (nl) { + if (print) err_printf("%.*s\n", (int)(nl - lines), lines); + return nl+1; + } + else { + if (print) err_printf("%s\n", lines); + return NULL; + } +} + +// Return 1 iff there are any @SQ header lines +static int has_SQ(const char *lines) +{ + if (lines == NULL) return 0; + return strncmp(lines, "@SQ\t", 4) == 0 || strstr(lines, "\n@SQ\t"); +} + +// Count the number of @SQ header lines +static int count_SQ(const char *lines) { - int i, n_HD = 0, n_SQ = 0; - extern char *bwa_pg; - - if (hdr_line) { - // check for HD line - const char *p = hdr_line; - if ((p = strstr(p, "@HD")) != 0) { - ++n_HD; - } + int n_SQ = 0; + if (lines) { + const char *p = lines; // check for SQ lines - p = hdr_line; while ((p = strstr(p, "@SQ\t")) != 0) { - if (p == hdr_line || *(p-1) == '\n') ++n_SQ; + if (p == lines || *(p-1) == '\n') ++n_SQ; p += 4; } } - if (n_SQ == 0) { + return n_SQ; +} + +static void print_sam_hdr(const bntseq_t *bns, const char *bns_hdr, const char *hdr_line) +{ + int i, n_HD = 0, n_SQ = count_SQ(hdr_line); + + // Print the user's @HD line (if present), or the index's, or a default + // (As a byproduct, remove any @HD lines from both user and index headers) + if (hdr_line && strncmp(hdr_line, "@HD\t", 4) == 0) { + hdr_line = remove_line(hdr_line, n_HD == 0); + n_HD++; + } + if (bns_hdr && strncmp(bns_hdr, "@HD\t", 4) == 0) { + bns_hdr = remove_line(bns_hdr, n_HD == 0); + n_HD++; + } + if (n_HD == 0) err_printf("@HD\tVN:1.5\tSO:unsorted\tGO:query\n"); + + // If neither the user's nor the index's headers provide @SQ lines, generate them + if (n_SQ == 0 && !has_SQ(bns_hdr)) { for (i = 0; i < bns->n_seqs; ++i) { err_printf("@SQ\tSN:%s\tLN:%d", bns->anns[i].name, bns->anns[i].len); if (bns->anns[i].is_alt) err_printf("\tAH:*\n"); else err_fputc('\n', stdout); } - } else if (n_SQ != bns->n_seqs && bwa_verbose >= 2) - fprintf(stderr, "[W::%s] %d @SQ lines provided with -H; %d sequences in the index. Continue anyway.\n", __func__, n_SQ, bns->n_seqs); - if (n_HD == 0) { - err_printf("@HD\tVN:1.5\tSO:unsorted\tGO:query\n"); } + + if (n_SQ != 0 && n_SQ != bns->n_seqs && bwa_verbose >= 2) + fprintf(stderr, "[W::%s] %d @SQ lines provided with -H; %d sequences in the index. Continue anyway.\n", __func__, n_SQ, bns->n_seqs); + + if (bns_hdr) err_printf("%s\n", bns_hdr); if (hdr_line) err_printf("%s\n", hdr_line); if (bwa_pg) err_printf("%s\n", bwa_pg); } +void bwa_print_sam_hdr(const bntseq_t *bns, const char *hdr_line) +{ + print_sam_hdr(bns, NULL, hdr_line); +} + +void bwa_print_sam_hdr2(const bntseq_t *bns, const char *prefix, const char *hdr_line) +{ + kstring_t str = { 0, 0, NULL }; + char *bns_hdr = NULL; + + // Ignore index .hdr file entirely if the user's headers provide @SQ lines + if (!has_SQ(hdr_line)) { + FILE *fp; + // Otherwise read the .hdr file if present + ksprintf(&str, "%s.hdr", prefix); + if ((fp = fopen(str.s, "r")) != NULL) { + int c, n_SQ; + str.l = 0; + while ((c = getc(fp)) != EOF) + if (c != '\r') kputc(c, &str); + fclose(fp); + + while (str.l > 0 && str.s[str.l-1] == '\n') str.l--; + str.s[str.l] = '\0'; + if (str.l > 0) bns_hdr = str.s; + + n_SQ = count_SQ(str.s); + if (n_SQ != 0 && n_SQ != bns->n_seqs && bwa_verbose >= 2) + fprintf(stderr, "[W::%s] %d @SQ lines provided in \"%s.hdr\"; %d sequences in the index. Continue anyway.\n", __func__, n_SQ, prefix, bns->n_seqs); + } + } + + print_sam_hdr(bns, bns_hdr, hdr_line); + free(str.s); +} + static char *bwa_escape(char *s) { char *p, *q; diff --git a/bwa.h b/bwa.h index 95c324b7..ac6d0008 100644 --- a/bwa.h +++ b/bwa.h @@ -87,6 +87,7 @@ extern "C" { int bwa_mem2idx(int64_t l_mem, uint8_t *mem, bwaidx_t *idx); void bwa_print_sam_hdr(const bntseq_t *bns, const char *hdr_line); + void bwa_print_sam_hdr2(const bntseq_t *bns, const char *prefix, const char *hdr_line); char *bwa_set_rg(const char *s); char *bwa_insert_header(const char *s, char *hdr); diff --git a/bwape.c b/bwape.c index fa4f7f7c..af688723 100644 --- a/bwape.c +++ b/bwape.c @@ -671,7 +671,7 @@ void bwa_sai2sam_pe_core(const char *prefix, char *const fn_sa[2], char *const f } // core loop - bwa_print_sam_hdr(bns, rg_line); + bwa_print_sam_hdr2(bns, prefix, rg_line); while ((seqs[0] = bwa_read_seq(ks[0], 0x40000, &n_seqs, opt0.mode, opt0.trim_qual)) != 0) { int cnt_chg; isize_info_t ii; diff --git a/bwase.c b/bwase.c index 18e86718..8d91951b 100644 --- a/bwase.c +++ b/bwase.c @@ -531,7 +531,7 @@ void bwa_sai2sam_se_core(const char *prefix, const char *fn_sa, const char *fn_f exit(1); } err_fread_noeof(&opt, sizeof(gap_opt_t), 1, fp_sa); - bwa_print_sam_hdr(bns, rg_line); + bwa_print_sam_hdr2(bns, prefix, rg_line); // set ks ks = bwa_open_reads(opt.mode, fn_fa); // core loop diff --git a/fastmap.c b/fastmap.c index 15f0aa42..a3a0c75c 100644 --- a/fastmap.c +++ b/fastmap.c @@ -390,7 +390,7 @@ int main_mem(int argc, char *argv[]) opt->flag |= MEM_F_PE; } } - bwa_print_sam_hdr(aux.idx->bns, hdr_line); + bwa_print_sam_hdr2(aux.idx->bns, argv[optind], hdr_line); aux.actual_chunk_size = fixed_chunk_size > 0? fixed_chunk_size : opt->chunk_size * opt->n_threads; kt_pipeline(no_mt_io? 1 : 2, process, &aux, 3); free(hdr_line);