From af41ed9d52c222b76193ddc9541b4539ea952147 Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Mon, 28 Jul 2014 16:03:48 +0200 Subject: [PATCH 01/34] Added VCFv4.3 draft, changes marked in red color --- Makefile | 4 +- VCFv4.3.tex | 1629 +++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 1632 insertions(+), 1 deletion(-) create mode 100644 VCFv4.3.tex diff --git a/Makefile b/Makefile index 7b980c5cc..a0ddef5de 100644 --- a/Makefile +++ b/Makefile @@ -7,7 +7,8 @@ PDFS = BCFv1_qref.pdf \ SAMv1.pdf \ tabix.pdf \ VCFv4.1.pdf \ - VCFv4.2.pdf + VCFv4.2.pdf \ + VCFv4.3.pdf pdf: $(PDFS) @@ -15,6 +16,7 @@ CRAMv2.1.pdf: CRAMv2.1.tex CRAMv2.1.ver SAMv1.pdf: SAMv1.tex SAMv1.ver VCFv4.1.pdf: VCFv4.1.tex VCFv4.1.ver VCFv4.2.pdf: VCFv4.2.tex VCFv4.2.ver +VCFv4.3.pdf: VCFv4.3.tex VCFv4.3.ver .SUFFIXES: .tex .pdf .ver diff --git a/VCFv4.3.tex b/VCFv4.3.tex new file mode 100644 index 000000000..fe9a8a85b --- /dev/null +++ b/VCFv4.3.tex @@ -0,0 +1,1629 @@ +\documentclass[8pt]{article} +\usepackage{enumerate} +\usepackage{graphicx} +\usepackage{lscape} +\usepackage[margin=0.75in]{geometry} +\usepackage[pdfborder={0 0 0}]{hyperref} + +\usepackage{color} +\renewcommand{\thefootnote}{\color{red}\fnsymbol{footnote}} + +\begin{document} +\input{VCFv4.3.ver} +\title{The Variant Call Format Specification \\ \vspace{0.5em} \large VCFv4.3 and BCFv2.2} +\date{\headdate} +\maketitle +\begin{quote}\small +The master version of this document can be found at +\url{https://github.com/samtools/hts-specs}.\\ +This printing is version~\commitdesc\ from that repository, +last modified on the date shown above. +\end{quote} +\vspace*{1em} + +\newpage +\tableofcontents +\newpage + +\section{The VCF specification} +VCF is a text file format (most likely stored in a compressed manner). +It contains meta-information lines {\color{red}(prefixed with "\#\#")}, a header +line {\color{red}(prefixed with "\#")}, and data lines +each containing information about a position in the genome and genotype +information on samples for each position +{\color{red}(text fields separated by tabs). Zero length fields are not allowed, a dot (".") can +be used instead. +In order to ensure interoperability across platforms, VCF compliant implementations must support +both LF (\texttt{\textbackslash n}) and CR+LF (\texttt{\textbackslash r\textbackslash n}) newline conventions. +} + +\subsection{An example} +\scriptsize +\begin{verbatim} +##fileformat=VCFv4.3 +##fileDate=20090805 +##source=myImputationProgramV3.1 +##reference=file:///seq/references/1000GenomesPilot-NCBI36.fasta +##contig= +##phasing=partial +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##FILTER= +##FILTER= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 NA00003 +20 14370 rs6054257 G A 29 PASS NS=3;DP=14;AF=0.5;DB;H2 GT:GQ:DP:HQ 0|0:48:1:51,51 1|0:48:8:51,51 1/1:43:5:.,. +20 17330 . T A 3 q10 NS=3;DP=11;AF=0.017 GT:GQ:DP:HQ 0|0:49:3:58,50 0|1:3:5:65,3 0/0:41:3 +20 1110696 rs6040355 A G,T 67 PASS NS=2;DP=10;AF=0.333,0.667;AA=T;DB GT:GQ:DP:HQ 1|2:21:6:23,27 2|1:2:0:18,2 2/2:35:4 +20 1230237 . T . 47 PASS NS=3;DP=13;AA=T GT:GQ:DP:HQ 0|0:54:7:56,60 0|0:48:4:51,51 0/0:61:2 +20 1234567 microsat1 GTC G,GTCT 50 PASS NS=3;DP=9;AA=G GT:GQ:DP 0/1:35:4 0/2:17:2 1/1:40:3 +\end{verbatim} +\normalsize +This example shows (in order): a good simple SNP, a possible SNP that has been filtered out because its quality is below 10, a site at which two alternate alleles are called, with one of them (T) being ancestral (possibly a reference sequencing error), a site that is called monomorphic reference (i.e. with no alternate alleles), and a microsatellite with two alternative alleles, one a deletion of 2 bases (TC), and the other an insertion of one base (T). Genotype data are given for three samples, two of which are phased and the third unphased, with per sample genotype quality, depth and haplotype qualities (the latter only for the phased samples) given as well as the genotypes. The microsatellite calls are unphased. +\subsection{Meta-information lines} + +File meta-information is included after the \#\# string and must be key=value +pairs. It is strongly encouraged that information lines describing the +\texttt{INFO}, \texttt{FILTER}, \texttt{FORMAT} and +{\color{red}\texttt{contig}} entries used in the body of the VCF file be +included in the meta-information section. Although they are optional, if these +lines are present then they must be completely well-formed {\color{red} and +their ID must be unique within their type. +Meta-information lines can be in any order with the exception of `fileformat` +which must come first. Note that BCF, the binary counterpart of VCF, requires +that all entries are present.} + +\subsubsection{File format} +A single `fileformat' field is always required, must be the first line in the file, and details the VCF format version number. For example, for VCF version 4.2, this line should read: + +\begin{verbatim} +##fileformat=VCFv4.2 +\end{verbatim} + +{\color{red} +\subsubsection{Character encoding} +Character encoding is declared by the `fileencoding' header line. Only +encodings that are a superset of US-ASCII are allowed, the default encoding is +UTF-8. + +\begin{verbatim} +##fileencoding=UTF-8 +\end{verbatim} +} + +\subsubsection{Information field format} +INFO fields should be described as follows (first four keys are required, source and version are recommended): + +\begin{verbatim} +##INFO= +\end{verbatim} + +Possible Types for INFO fields are: Integer {\color{red} (32-bit, signed)}, Float {\color{red} (32-bit)}, Flag, Character, and +String. The Number entry is an Integer that describes the number of values that +can be included with the INFO field. For example, if the INFO field contains a +single number, then this value should be $1$; if the INFO field describes a +pair of numbers, then this value should be $2$ and so on. There are also +certain special characters used to define special cases: + +\begin{itemize} + \item If the field has one value per alternate allele then this value should be `A'. + \item If the field has one value for each possible allele (including the reference), then this value should be `R'. + \item If the field has one value for each possible genotype (more relevant to the FORMAT tags) then this value should be `G'. + \item If the number of possible values varies, is unknown, or is unbounded, then this value should be `.'. +\end{itemize} + +The `Flag' type indicates that the INFO field does not contain a Value entry, and hence the Number should be $0$ in this case. The Description value must be surrounded by double-quotes. Double-quote character can be escaped with backslash $\backslash$ and backslash as $\backslash\backslash$. Source and Version values likewise should be surrounded by double-quotes and specify the annotation source (case-insensitive, e.g. ``dbsnp'') and exact version (e.g. ``138''), respectively for computational use. + +\subsubsection{Filter field format} +FILTERs that have been applied to the data should be described as follows: + +\begin{verbatim} +##FILTER= +\end{verbatim} + +\subsubsection{Individual format field format} +Likewise, Genotype fields specified in the FORMAT field should be described as follows: + +\begin{verbatim} +##FORMAT= +\end{verbatim} + +Possible Types for FORMAT fields are: Integer, Float, Character, and String (this field is otherwise defined precisely as the INFO field). + +\subsubsection{Alternative allele field format} +Symbolic alternate alleles for imprecise structural variants: + +\begin{verbatim} +##ALT= +\end{verbatim} +The ID field indicates the type of structural variant, and can be a colon-separated list of types and subtypes. ID values are case sensitive strings and may not contain whitespace or angle brackets. The first level type must be one of the following: +\begin{itemize} + \item DEL Deletion relative to the reference + \item INS Insertion of novel sequence relative to the reference + \item DUP Region of elevated copy number relative to the reference + \item INV Inversion of reference sequence + \item CNV Copy number variable region (may be both deletion and duplication) +\end{itemize} + +The CNV category should not be used when a more specific category can be applied. Reserved subtypes include: +\begin{itemize} + \item DUP:TANDEM Tandem duplication + \item DEL:ME Deletion of mobile element relative to the reference + \item INS:ME Insertion of a mobile element relative to the reference +\end{itemize} + +In addition, it is highly recommended (but not required) that the header include tags describing the reference and contigs backing the data contained in the file. These tags are based on the SQ field from the SAM spec; all tags are optional (see the VCF example above). + +For all of the \#\#INFO, \#\#FORMAT, \#\#FILTER, and \#\#ALT metainformation, extra fields can be included after the default fields. For example: + +\begin{verbatim} +##INFO= +\end{verbatim} +In the above example, the extra fields of ``Source'' and ``Version'' are provided. Optional fields should be stored as strings even for numeric values. + +\subsubsection{Assembly field format} +Breakpoint assemblies for structural variations may use an external file: +\begin{verbatim} +##assembly=url +\end{verbatim} + +The URL field specifies the location of a fasta file containing breakpoint assemblies referenced in the VCF records for structural variants via the BKPTID INFO key. +\subsubsection{Contig field format} +As with chromosomal sequences it is highly recommended (but not required) that the header include tags describing the contigs referred to in the VCF file. This furthermore allows these contigs to come from different files. The format is identical to that of a reference sequence, but with an additional URL tag to indicate where that sequence can be found. For example:. +\begin{verbatim} +##contig= +\end{verbatim} +\subsubsection{Sample field format} +It is possible to define sample to genome mappings as shown below:\footnote{\color{red}Some description of SAMPLE would be nice. What +is Genomes, Mixture, $k$?} +\small +{\color{red} +\begin{verbatim} +##SAMPLE= +\end{verbatim} +} +\normalsize +\subsubsection{Pedigree field format} +It is possible to record relationships between genomes using the following syntax: +\begin{verbatim} +##PEDIGREE= +\end{verbatim} +or a link to a database: +\begin{verbatim} +##pedigreeDB= +\end{verbatim} +\subsection{Header line syntax} +The header line names the 8 fixed, mandatory columns. These columns are as follows: + +\begin{enumerate} + \item \#CHROM + \item POS + \item ID + \item REF + \item ALT + \item QUAL + \item FILTER + \item INFO +\end{enumerate} + +If genotype data is present in the file, these are followed by a FORMAT column header, then an arbitrary number of sample IDs. The header line is tab-delimited. + +\subsection{Data lines} +\subsubsection{Fixed fields} +There are 8 fixed fields per record. All data lines are tab-delimited. In all cases, missing values are specified with a dot (`.'). Fixed fields are: + +\begin{enumerate} + \item CHROM - chromosome: An identifier from the reference genome or an angle-bracketed ID String (``$<$ID$>$'') pointing to a contig in the assembly file (cf. the \#\#assembly line in the header). All entries for a specific CHROM should form a contiguous block within the VCF file. The colon symbol (:) must be absent from all chromosome names to avoid parsing errors when dealing with breakends. (String, no white-space permitted, Required). + \item POS - position: The reference position, with the 1st base having position 1. Positions are sorted numerically, in increasing order, within each reference sequence CHROM. It is permitted to have multiple records with the same POS. Telomeres are indicated by using positions 0 or N+1, where N is the length of the corresponding chromosome or contig. (Integer, Required) + \item ID - identifier: Semi-colon separated list of unique identifiers where available. If this is a dbSNP variant it is encouraged to use the rs number(s). No identifier should be present in more than one data record. If there is no identifier available, then the missing value should be used. (String, no white-space or semi-colons permitted) + \item REF - reference base(s): Each base must be one of A,C,G,T,N (case insensitive). Multiple bases are permitted. The value in the POS field refers to the position of the first base in the String. For simple insertions and deletions in which either the REF or one of the ALT alleles would otherwise be null/empty, the REF and ALT Strings must include the base before the event (which must be reflected in the POS field), unless the event occurs at position 1 on the contig in which case it must include the base after the event; this padding base is not required (although it is permitted) for e.g. complex substitutions or other events where all alleles have at least one base represented in their Strings. If any of the ALT alleles is a symbolic allele (an angle-bracketed ID String ``$<$ID$>$'') then the padding base is required and POS denotes the coordinate of the base preceding the polymorphism. Tools processing VCF files are not required to preserve case in the allele Strings. (String, Required). + \item ALT - alternate base(s): Comma separated list of alternate non-reference alleles called on at least one of the samples. Options are base Strings made up of the bases A,C,G,T,N,*, (case insensitive) or an angle-bracketed ID String (``$<$ID$>$'') or a breakend replacement string as described in the section on breakends. The `*' allele is reserved to indicate that the allele is missing due to a upstream deletion. If there are no alternative alleles, then the missing value should be used. Tools processing VCF files are not required to preserve case in the allele String, except for IDs, which are case sensitive. (String; no whitespace, commas, or angle-brackets are permitted in the ID String itself) + \item QUAL - quality: Phred-scaled quality score for the assertion made in ALT. i.e. $-10log_{10}$ prob(call in ALT is wrong). If ALT is `.' (no variant) then this is $-10log_{10}$ prob(variant), and if ALT is not `.' this is $-10log_{10}$ prob(no variant). If unknown, the missing value should be specified. (Float) + \item FILTER - filter status: PASS if this position has passed all filters, i.e. a call is made at this position. Otherwise, if the site has not passed all filters, a semicolon-separated list of codes for filters that fail. e.g. ``q10;s50'' might indicate that at this site the quality is below 10 and the number of samples with data is below 50\% of the total number of samples. `0' is reserved and should not be used as a filter String. If filters have not been applied, then this field should be set to the missing value. (String, no white-space or semi-colons permitted) + \item INFO - additional information: (String, no semi-colons or + equals-signs permitted; commas are permitted only as delimiters for lists of + values; characters with special meaning can be encoded using URL encoding; space characters are allowed) + INFO fields are encoded as a semicolon-separated series of short keys + with optional values in the format: $<$key$>$=$<$data$>$[,data]. + {\color{red} INFO keys must match the regular expression \texttt{\^{}[A-Za-z\_][0-9A-Za-z\_.]*\$}.} + Arbitrary keys are permitted, although the following sub-fields are reserved (albeit optional): +\begin{itemize} + \item AA : ancestral allele + \item AC : allele count in genotypes, for each ALT allele, in the same order as listed + \item AF : allele frequency for each ALT allele in the same order as listed: use this when estimated from primary data, not called genotypes + \item AN : total number of alleles in called genotypes + \item BQ : RMS base quality at this position + \item CIGAR : cigar string describing how to align an alternate allele to the reference allele + \item DB : dbSNP membership + \item DP : combined depth across samples, e.g. DP=154 + \item END : end position of the variant described in this record (for use with symbolic alleles) + \item H2 : membership in hapmap2 + \item H3 : membership in hapmap3 + \item MQ : RMS mapping quality, e.g. MQ=52 + \item MQ0 : Number of MAPQ == 0 reads covering this record + \item NS : Number of samples with data + \item SB : strand bias at this position + \item SOMATIC : indicates that the record is a somatic mutation, for cancer genomics + \item VALIDATED : validated by follow-up experiment + \item 1000G : membership in 1000 Genomes +\end{itemize} +\end{enumerate} +The exact format of each INFO sub-field should be specified in the meta-information (as described above). +Example for an INFO field: DP=154;MQ=52;H2. Keys without corresponding values are allowed in order to indicate group membership (e.g. H2 indicates the SNP is found in HapMap 2). It is not necessary to list all the properties that a site does NOT have, by e.g. H2=0. See below for additional reserved INFO sub-fields used to encode structural variants. +\subsubsection{Genotype fields} +If genotype information is present, then the same types of data must be present +for all samples. First a FORMAT field is given specifying the data types and +order ({\color{red} colon-separated FORMAT ids matching the regular expression \texttt{\^{}[A-Za-z\_][0-9A-Za-z\_.]*\$}}). This is followed by one data block per +sample, with the colon-separated data corresponding to the types +specified in the format. The first sub-field must always be the genotype (GT) +if it is present. There are no required sub-fields. + +As with the INFO field, there are several common, reserved keywords that are standards across the community: + +\begin{itemize} +\renewcommand{\labelitemii}{$\circ$} + \item GT : genotype, encoded as allele values separated by either of $/$ or $\mid$. The allele values are 0 for the reference allele (what is in the REF field), 1 for the first allele listed in ALT, 2 for the second allele list in ALT and so on. For diploid calls examples could be $0/1$, $1\mid0$, or $1/2$, etc. For haploid calls, e.g. on Y, male non-pseudoautosomal X, or mitochondrion, only one allele value should be given; a triploid call might look like $0/0/1$. If a call cannot be made for a sample at a given locus, `.' should be specified for each missing allele in the GT field (for example `$./.$' for a diploid genotype and `.' for haploid genotype). The meanings of the separators are as follows (see the PS field below for more details on incorporating phasing information into the genotypes): + \begin{itemize} + \item $/$ : genotype unphased + \item $\mid$ : genotype phased + \end{itemize} + \item DP : read depth at this position for this sample (Integer) + \item FT : sample genotype filter indicating if this genotype was ``called'' (similar in concept to the FILTER field). Again, use PASS to indicate that all filters have been passed, a semi-colon separated list of codes for filters that fail, or `.' to indicate that filters have not been applied. These values should be described in the meta-information in the same way as FILTERs (String, no white-space or semi-colons permitted) + \item GL : genotype likelihoods comprised of comma separated floating point $log_{10}$-scaled likelihoods for all possible genotypes given the set of alleles defined in the REF and ALT fields. In presence of the GT field the same ploidy is expected and the canonical order is used; without GT field, diploidy is assumed. If A is the allele in REF and B,C,... are the alleles as ordered in ALT, the ordering of genotypes for the likelihoods is given by: F(j/k) = (k*(k+1)/2)+j. In other words, for biallelic sites the ordering is: AA,AB,BB; for triallelic sites the ordering is: AA,AB,BB,AC,BC,CC, etc. For example: GT:GL 0/1:-323.03,-99.29,-802.53 (Floats) + \item GLE : genotype likelihoods of heterogeneous ploidy, used in presence of uncertain copy number. For example: GLE=0:-75.22,1:-223.42,0/0:-323.03,1/0:-99.29,1/1:-802.53 (String) + \item PL : the phred-scaled genotype likelihoods rounded to the closest integer (and otherwise defined precisely as the GL field) (Integers) + \item GP : {\color{red} genotype posterior probabilities in the range 0 to 1 using the same ordering as the GL field; one use can be to store imputed genotype probabilities (Float)} + \item GQ : conditional genotype quality, encoded as a phred quality $-10log_{10}$ p(genotype call is wrong, conditioned on the site's being variant) (Integer) + \item HQ : haplotype qualities, two comma separated phred qualities (Integers) + \item PS : phase set. A phase set is defined as a set of phased genotypes to which this genotype belongs. Phased genotypes for an individual that are on the same chromosome and have the same PS value are in the same phased set. A phase set specifies multi-marker haplotypes for the phased genotypes in the set. All phased genotypes that do not contain a PS subfield are assumed to belong to the same phased set. If the genotype in the GT field is unphased, the corresponding PS field is ignored. The recommended convention is to use the position of the first variant in the set as the PS identifier (although this is not required). (Non-negative 32-bit Integer) + \item PQ : phasing quality, the phred-scaled probability that alleles are ordered incorrectly in a heterozygote (against all other members in the phase set). We note that we have not yet included the specific measure for precisely defining ``phasing quality''; our intention for now is simply to reserve the PQ tag for future use as a measure of phasing quality. (Integer) + \item EC : comma separated list of expected alternate allele counts for each alternate allele in the same order as listed in the ALT field (typically used in association analyses) (Integers) + \item MQ : RMS mapping quality, similar to the version in the INFO field. (Integer) +\end{itemize} + +If any of the fields is missing, it is replaced with the missing value. For example if the FORMAT is GT:GQ:DP:HQ then $0\mid0:.:23:23,34$ indicates that GQ is missing. Trailing fields can be dropped (with the exception of the GT field, which should always be present if specified in the FORMAT field). + +See below for additional genotype fields used to encode structural variants. Additional Genotype fields can be defined in the meta-information. However, software support for such fields is not guaranteed. + +\section{Understanding the VCF format and the haplotype representation} +VCF records use a single general system for representing genetic variation data composed of: +\begin{itemize} + \item Allele: representing single genetic haplotypes (A, T, ATC). + \item Genotype: an assignment of alleles for each chromosome of a single named sample at a particular locus. + \item VCF record: a record holding all segregating alleles at a locus (as well as genotypes, if appropriate, for multiple individuals containing alleles at that locus). +\end{itemize} +VCF records use a simple haplotype representation for REF and ALT alleles to describe variant haplotypes at a locus. ALT haplotypes are constructed from the REF haplotype by taking the REF allele bases at the POS in the reference genotype and replacing them with the ALT bases. In essence, the VCF record specifies a-REF-t and the alternative haplotypes are a-ALT-t for each alternative allele. + + +\section{INFO keys used for structural variants} +When the INFO keys reserved for encoding structural variants are used for imprecise variants, the values should be best estimates. When a key reflects a property of a single alt allele (e.g. SVLEN), then when there are multiple alt alleles there will be multiple values for the key corresponding to each alelle (e.g. SVLEN=-100,-110 for a deletion with two distinct alt alleles). + +The following INFO keys are reserved for encoding structural variants. In general, when these keys are used by imprecise variants, the values should be best estimates. When a key reflects a property of a single alt allele (e.g. SVLEN), then when there are multiple alt alleles there will be multiple values for the key corresponding to each alelle (e.g. SVLEN=-100,-110 for a deletion with two distinct alt alleles). +\footnotesize +\begin{verbatim} +##INFO= +##INFO= +##INFO= +\end{verbatim} +\normalsize +For precise variants, END is POS + length of REF allele - 1, and the for imprecise variants the corresponding best estimate. +\footnotesize +\begin{verbatim} +##INFO= +\end{verbatim} +\normalsize +Value should be one of DEL, INS, DUP, INV, CNV, BND. This key can be derived from the REF/ALT fields but is useful for filtering. +\footnotesize +\begin{verbatim} +##INFO= +\end{verbatim} +\normalsize +One value for each ALT allele. Longer ALT alleles (e.g. insertions) have positive values, shorter ALT alleles (e.g. deletions) have negative values. +\footnotesize +\begin{verbatim} +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +\end{verbatim} +\normalsize +For precise variants, the consensus sequence the alternate allele assembly is derivable from the REF and ALT fields. However, the alternate allele assembly file may contain additional information about the characteristics of the alt allele contigs. +\footnotesize +\begin{verbatim} +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +\end{verbatim} +\normalsize + +\section{FORMAT keys used for structural variants} +\footnotesize +\begin{verbatim} +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +\end{verbatim} +\normalsize +These keys are analogous to GT/GQ/GL/GP and are provided for genotyping +imprecise events by copy number (either because there is an unknown number of +alternate alleles or because the haplotypes cannot be determined). CN specifies +the integer copy number of the variant in this sample. CNQ is encoded as a +phred quality $-10log_{10}$ p(copy number genotype call is wrong). CNL +specifies a list of $log_{10}$ likelihoods for each potential copy number, +starting from zero. {\color{red} CNP is 0 to 1-scaled copy number posterior +probabilities (and otherwise defined precisely as the CNL field), intended to +store imputed genotype probabilities.} When possible, GT/GQ/GL/GP should be used +instead of (or in addition to) these keys. + +\section{Representing variation in VCF records} +\subsection{Creating VCF entries for SNPs and small indels} +\subsubsection{Example 1} +For example, suppose we are looking at a locus in the genome: + +\vspace{0.3cm} +\begin{tabular}{ | l | l | l | } +\hline +Example & Sequence & Alteration \\ \hline +Ref & a t C g a & C is the reference base \\ \hline +1 & a t G g a & C base is a G in some individuals \\ \hline +2 & a t \ - \ g a & C base is deleted w.r.t. the reference sequence\\ \hline +3 & a t CAg a & A base is inserted w.r.t. the reference sequence \\ \hline +\end{tabular} +\vspace{0.3cm} + +Representing these as VCF records would be done as follows: +\begin{enumerate} + \item A SNP polymorphism of C/G $\rightarrow \{C,G\} \rightarrow$ C is the reference allele + \item A single base deletion of C $\rightarrow \{tC,t\} \rightarrow$ tC is the reference allele + \item A single base insertion of A $\rightarrow \{tC,tCA\} \rightarrow$ tC is the reference allele +\end{enumerate} + +\begin{tabular}{ l l l l l l l l} + \#CHROM & POS & ID & REF & ALT & QUAL & FILTER & INFO \\ + $20$ & $3$ & . & C & G & . & PASS & DP=100 \\ + $20$ & $2$ & . & TC & T & . & PASS & DP=100 \\ + $20$ & $2$ & . & TC & TCA & . & PASS & DP=100 \\ +\end{tabular} + +\subsubsection{Example 2} +Suppose I see a the following in a population of individuals and want to represent these three segregating alleles: + +\vspace{0.3cm} +\begin{tabular}{ | l | l | l | } +\hline +Example & Sequence & Alteration \\ \hline +Ref & a t C g a & C is the reference base \\ \hline +$1$ & a t G g a & C base is a G in some individuals \\ \hline +$2$ & a t \ - \ g a & C base is deleted w.r.t. the reference sequence \\ \hline +\end{tabular} +\vspace{0.3cm} + +In this case there are three segregating alleles: $\{tC,tG,t\}$ with a corresponding VCF record: + +\vspace{0.3cm} +\begin{tabular}{ l l l l l l l l} + \#CHROM & POS & ID & REF & ALT & QUAL & FILTER & INFO \\ + $20$ & $2$ & . & TC & TG,T & . & PASS & DP=100 \\ +\end{tabular} +\subsubsection{Example 3} +Now suppose I have this more complex example: + +\vspace{0.3cm} +\begin{tabular}{ | l | l | l | } +\hline +Example & Sequence & Alteration \\ \hline +Ref & a t C g a & C is the reference base \\ \hline +$1$ & a t \ - \ g a & C base is is deleted w.r.t. the reference sequence \\ \hline +$2$ & a t \ - - \ a & C and G bases are deleted w.r.t. the reference sequence\\ \hline +$3$ & a t CAg a & A base is inserted w.r.t. the reference sequence \\ \hline +\end{tabular} + +\vspace{0.3cm} +There are actually four segregating alleles: $\{tCg,tg,t,tCAg\}$ over bases 2-4. This complex set of allele is represented in VCF as: + +\vspace{0.3cm} +\begin{tabular}{ l l l l l l l l} + \#CHROM & POS & ID & REF & ALT & QUAL & FILTER & INFO \\ + $20$ & $2$ & . & TCG & TG,T,TCAG & . & PASS & DP=100 \\ +\end{tabular} +\vspace{0.3cm} + +Note that in VCF records, the molecular equivalence explicitly listed above in the per-base alignment is discarded, so the actual placement of equivalent g isn't retained. For completeness, VCF records are dynamically typed, so whether a VCF record is a SNP, Indel, Mixed, or Reference site depends on the properties of the alleles in the record. + +\subsection{Decoding VCF entries for SNPs and small indels} +\subsubsection{SNP VCF record} +Suppose I receive the following VCF record: + +\vspace{0.3cm} +\begin{tabular}{ l l l l l l l l} + \#CHROM & POS & ID & REF & ALT & QUAL & FILTER & INFO \\ + $20$ & $3$ & . & C & T & . & PASS & DP=100 \\ +\end{tabular} +\vspace{0.3cm} + +This is a SNP since its only single base substitution and there are only two alleles so I have the two following segregating haplotypes: + +\vspace{0.3cm} +\begin{tabular}{ | l | l | l | } +\hline +Example & Sequence & Alteration \\ \hline +Ref & \verb|a t C g a| & C is the reference base \\ \hline +$1$ & \verb|a t T g a| & C base is a T in some individuals \\ \hline +\end{tabular} + +\subsubsection{Insertion VCF record} +Suppose I receive the following VCF record: + +\vspace{0.3cm} +\begin{tabular}{ l l l l l l l l} + \#CHROM & POS & ID & REF & ALT & QUAL & FILTER & INFO \\ + $20$ & $3$ & . & C & CTAG & . & PASS & DP=100 \\ +\end{tabular} +\vspace{0.3cm} + +This is a insertion since the reference base C is being replaced by C [the reference base] plus three insertion bases TAG. Again there are only two alleles so I have the two following segregating haplotypes: + +\vspace{0.3cm} +\begin{tabular}{ | l | l | l | } +\hline +Example & Sequence & Alteration \\ \hline +Ref & \verb|a t C - - - g a| & C is the reference base \\ \hline +$1$ & \verb|a t C T A G g a| & following the C base is an insertion of 3 bases \\ \hline +\end{tabular} + +\subsubsection{Deletion VCF record} +Suppose I receive the following VCF record: + +\vspace{0.3cm} +\begin{tabular}{ l l l l l l l l} + \#CHROM & POS & ID & REF & ALT & QUAL & FILTER & INFO \\ + $20$ & $2$ & . & TCG & T & . & PASS & DP=100 \\ +\end{tabular} +\vspace{0.3cm} + +This is a deletion of two reference bases since the reference allele TCG is being replaced by just the T [the reference base]. Again there are only two alleles so I have the two following segregating haplotypes: + +\vspace{0.3cm} +\begin{tabular}{ | l | l | l | } +\hline +Example & Sequence & Alteration \\ \hline +Ref & \verb|a T C G a| & T is the (first) reference base \\ \hline +$1$ & \verb|a T - - a| & following the T base is a deletion of 2 bases \\ \hline +\end{tabular} + +\subsubsection{Mixed VCF record for a microsatellite} +Suppose I receive the following VCF record: + +\vspace{0.3cm} +\begin{tabular}{ l l l l l l l l} + \#CHROM & POS & ID & REF & ALT & QUAL & FILTER & INFO \\ + $20$ & $4$ & . & GCG & G,GCGCG & . & PASS & DP=100 \\ +\end{tabular} +\vspace{0.3cm} + +This is a mixed type record containing a 2 base insertion and a 2 base deletion. There are are three segregating alleles so I have the three following haplotypes: + +\vspace{0.3cm} +\begin{tabular}{ | l | l | l | } +\hline +Example & Sequence & Alteration \\ \hline +Ref & \verb|a t c G C G - - a| & G is the (first) reference base \\ \hline +$1$ & \verb|a t c G - - - - a| & following the G base is a deletion of 2 bases \\ \hline +$2$ & \verb|a t c G C G C G a| & following the G base is a insertion of 2 bases \\ \hline +\end{tabular} +\vspace{0.3cm} + +Note that in all of these examples dashes have been added to make the haplotypes clearer but of course the equivalence among bases isn't provided by the VCF. Technically the following is an equivalent alignment: + +\vspace{0.3cm} +\begin{tabular}{ | l | l | l | } +\hline +Example & Sequence & Alteration \\ \hline +Ref & \verb|a t c G - - C G a| & G is the (first) reference base \\ \hline +$1$ & \verb|a t c G - - - - a| & following the G base is a deletion of 2 bases \\ \hline +$2$ & \verb|a t c G C G C G a| & following the G base is a insertion of 2 bases \\ \hline +\end{tabular} + +\subsection{Encoding Structural Variants} +The following page contains examples of structural variants encoded in VCF: +\pagebreak +\footnotesize +\begin{landscape} +\begin{verbatim} +VCF STRUCTURAL VARIANT EXAMPLE + +##fileformat=VCFv4.1 +##fileDate=20100501 +##reference=1000GenomesPilot-NCBI36 +##assembly=ftp://ftp-trace.ncbi.nih.gov/1000genomes/ftp/release/sv/breakpoint_assemblies.fasta +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##INFO= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##ALT= +##FORMAT= +##FORMAT= +##FORMAT= +##FORMAT= +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 +1 2827694 rs2376870 CGTGGATGCGGGGAC C . PASS SVTYPE=DEL;END=2827762;HOMLEN=1;HOMSEQ=G;SVLEN=-68 GT:GQ 1/1:13.9 +2 321682 . T 6 PASS SVTYPE=DEL;END=321887;SVLEN=-205;CIPOS=-56,20;CIEND=-10,62 GT:GQ 0/1:12 +2 14477084 . C 12 PASS SVTYPE=DEL;END=14477381;SVLEN=-297;CIPOS=-22,18;CIEND=-12,32 GT:GQ 0/1:12 +3 9425916 . C 23 PASS SVTYPE=INS;END=9425916;SVLEN=6027;CIPOS=-16,22 GT:GQ 1/1:15 +3 12665100 . A 14 PASS SVTYPE=DUP;END=12686200;SVLEN=21100;CIPOS=-500,500;CIEND=-500,500 GT:GQ:CN:CNQ ./.:0:3:16.2 +4 18665128 . T 11 PASS SVTYPE=DUP;END=18665204;SVLEN=76;CIPOS=-10,10;CIEND=-10,10 GT:GQ:CN:CNQ ./.:0:5:8.3 +\end{verbatim} +\end{landscape} +\pagebreak +\normalsize + +The example shows in order: +\begin{enumerate} + \item A precise deletion with known breakpoint, a one base micro-homology, and a sample that is homozygous for the deletion. + \item An imprecise deletion of approximately 105 bp. + \item An imprecise deletion of an ALU element relative to the reference. + \item An imprecise insertion of an L1 element relative to the reference. + \item An imprecise duplication of approximately 21Kb. The sample genotype is copy number 3 (one extra copy of the duplicated sequence). + \item An imprecise tandem duplication of 76bp. The sample genotype is copy number 5 (but the two haplotypes are not known). +\end{enumerate} + +\subsection{Specifying complex rearrangements with breakends} + +An arbitrary rearrangement event can be summarized as a set of novel \textbf{adjacencies}. Each adjacency ties together $2$ \textbf{breakends}. The two breakends at either end of a novel adjacency are called \textbf{mates}. + +There is one line of VCF (i.e. one record) for each of the two breakends in a novel adjacency. A breakend record is identified with the tag ``SYTYPE=BND'' in the INFO field. The REF field of a breakend record indicates a base or sequence s of bases beginning at position POS, as in all VCF records. The ALT field of a breakend record indicates a replacement for s. This ``breakend replacement'' has three parts: +\begin{enumerate} + \item The string t that replaces places s. The string t may be an extended version of s if some novel bases are inserted during the formation of the novel adjacency. + \item The position p of the mate breakend, indicated by a string of the form ``chr:pos''. This is the location of the first mapped base in the piece being joined at this novel adjacency. + \item The direction that the joined sequence continues in, starting from p. This is indicated by the orientation of square brackets surrounding p. + +\end{enumerate} +These 3 elements are combined in 4 possible ways to create the ALT. In each of the 4 cases, the assertion is that s is replaced with t, and then some piece starting at position p is joined to t. The cases are: + +\vspace{0.3cm} +\begin{tabular}{ l l l } +REF & ALT & Meaning \\ +s & t$[$p$[$ & piece extending to the right of p is joined after t \\ +s & t$]$p$]$ & reverse comp piece extending left of p is joined after t \\ +s & $]$p$]$t & piece extending to the left of p is joined before t \\ +s & $[$p$[$t & reverse comp piece extending right of p is joined before t \\ +\end{tabular} +\vspace{0.3cm} + +The example in Figure 1 shows a 3-break operation involving 6 breakends. It exemplifies all possible orientations of breakends in adjacencies. Notice how the ALT field expresses the orientation of the breakends. + +\begin{figure}[ht] +\centering +\includegraphics[width=4in,height=2.96in]{img/all_orientations-400x296.png} +\caption{All possible orientations of breakends} +\end{figure} + +\vspace{0.3cm} +\begin{tabular}{ l l l l l l l l } +\#CHROM &POS & ID & REF & ALT & QUAL & FILTER & INFO \\ +$2$ & $321681$ & bnd\_W & G & G$]17$:$198982]$ & $6$ & PASS & SVTYPE=BND \\ +$2$ & $321682$ & bnd\_V & T & $]$13:123456$]$T & 6 & PASS & SVTYPE=BND \\ +$13$ & $123456$ & bnd\_U & C & C$[$2:321682$[$ & 6 & PASS & SVTYPE=BND \\ +$13$ & $123457$ & bnd\_X & A & $[$17:198983$[$A & 6 & PASS & SVTYPE=BND \\ +$17$ & $198982$ & bnd\_Y & A & A$]$2:321681$]$ & 6 & PASS & SVTYPE=BND \\ +$17$ & $198983$ & bnd\_Z & C & $[$13:123457$[$C & 6 & PASS & SVTYPE=BND \\ +\end{tabular} + +\subsubsection{Inserted Sequence} + +Sometimes, as shown in Figure 2, some bases are inserted between the two breakends, this information is also carried in the ALT column: + +\begin{figure}[h] +\centering +\includegraphics[width=4in,height=1.89in]{img/inserted_sequence-400x189.png} +\caption{Inserted sequence between breakends} +\end{figure} + +\vspace{0.3cm} +\footnotesize +\begin{tabular}{ l l l l l l l l } +\#CHROM & POS & ID & REF & ALT & QUAL & FILTER & INFO \\ +$2$ & $321682$ & bnd\_V & T & $]13:123456]$AGTNNNNNCAT & $6$ & PASS & SVTYPE=BND;MATEID=bnd\_U \\ +$13$ & $123456$ & bnd\_U & C & CAGTNNNNNCA$[2:321682[$ & $6$ & PASS & SVTYPE=BND;MATEID=bnd\_V \\ +\end{tabular} +\normalsize +\vspace{0.3cm} + +\subsubsection{Large Insertions} +If the insertion is too long to be conveniently stored in the ALT column, as in the 329 base insertion shown in Figure 3, it can be represented by a contig from the assembly file: + +\begin{figure}[h] +\centering +\includegraphics[width=4in,height=2.47in]{img/inserted_contig-400x247.png} +\caption{Inserted contig} +\end{figure} + +\vspace{0.3cm} +\small +\begin{tabular}{ l l l l l l l l } +\#CHROM & POS & ID & REF & ALT & QUAL & FILTER & INFO \\ +$13$ & $123456$ & bnd\_U & C & C$[<$ctg1$>:1[$ & $6$ & PASS & SVTYPE=BND \\ +$13$ & $123457$ & bnd\_V & A & $]<$ctg$1>:329]$A & $6$ & PASS & SVTYPE=BND \\ +\end{tabular} +\normalsize +\vspace{0.3cm} + +\textbf{Note}: In the special case of the complete insertion of a sequence between two base pairs, it is recommended to use the shorthand notation described above: + +\vspace{0.3cm} +\begin{tabular}{ l l l l l l l l } +\#CHROM & POS & ID & REF & ALT & QUAL & FILTER & INFO \\ +$13$ & $321682$ & INS0 & T & C$<$ctg$1>$ & $6$ & PASS & SVTYPE=INS \\ +\end{tabular} +\vspace{0.3cm} + +If only a portion of $<$ctg$1>$, say from position $7$ to position $214$, is inserted, the VCF would be: + +\vspace{0.3cm} +\small +\begin{tabular}{ l l l l l l l l } +\#CHROM & POS & ID & REF & ALT & QUAL & FILTER & INFO \\ +$13$ & $123456$ & bnd\_U & C & C$[<$ctg1$>:7[$ & $6$ & PASS & SVTYPE=BND \\ +$13$ & $123457$ & bnd\_V & A & $]<$ctg$1>:214]$A & $6$ & PASS & SVTYPE=BND \\ +\end{tabular} +\normalsize +\vspace{0.3cm} + +If $<$ctg$1>$ is circular and a segment from position 229 to position 45 is inserted, i.e. continuing from position 329 on to position 1, this is represented by adding a circular adjacency: + +\vspace{0.3cm} +\small +\begin{tabular}{ l l l l l l l l } +\#CHROM & POS & ID & REF & ALT & QUAL & FILTER & INFO \\ +$13$ & $123456$ & bnd\_U & C & C$[<$ctg$1>:229[$ & 6 & PASS & SVTYPE=BND \\ +$13$ & $123457$ & bnd\_V & A & $]<$ctg$1>:45]$A & 6 & PASS & SVTYPE=BND \\ +$<$ctg$1>$ & 1 & bnd\_X & A & $]<$ctg$1>:329]$A & 6 & PASS & SVTYPE=BND \\ +$<$ctg$1>$ & 329 & bnd\_Y & T & T$[<$ctg$1>:1[$ & 6 & PASS & SVTYPE=BND \\ +\end{tabular} +\normalsize + +\subsubsection{Multiple mates} +If a breakend has multiple mates such as in Figure 4 (either because of breakend reuse or of uncertainty in the measurement), these alternate adjacencies are treated as alternate alleles: + +\begin{figure}[h] +\centering +\includegraphics[width=4in,height=2.80in]{img/multiple_mates-400x280.png} +\caption{Breakend with multiple mates} +\end{figure} + +\footnotesize +\begin{tabular}{ l l l l l l l l } +\#CHROM & POS & ID & REF & ALT & QUAL & FILTER & INFO \\ +$2$ & $321682$ & bnd\_V & T & $]13:123456]$T & 6 & PASS & SVTYPE=BND;MATEID=bnd\_U \\ +$13$ & $123456$ & bnd\_U & C & C$[2:321682[$,C$[17:198983[$ & 6 & PASS & SVTYPE=BND;MATEID=bnd\_V,bnd\_Z \\ +$17$ & $198983$ & bnd\_Z & A & $]13:123456]$A & 6 & PASS & SVTYPE=BND;MATEID=bnd\_U \\ +\end{tabular} +\normalsize + +\subsubsection{Explicit partners} +Two breakends which are connected in the reference genome but disconnected in the variants are called partners. Each breakend only has one partner, typically one basepair left or right. However, it is not uncommon to observe loss of a few basepairs during the rearrangement. It is then possible to explicitly name a breakend's partner, such as in Figure 5.: + +\begin{figure}[ht] +\centering +\includegraphics[width=4in,height=2.11in]{img/erosion-400x211.png} +\caption{Partner breakends} +\end{figure} + +\small +\begin{tabular}{ l l l l l l l l } +\#CHROM & POS & ID & REF & ALT & QUAL & FILTER & INFO \\ +2 & 321681 & bnd\_W & G & G$[13:123460[$ & 6 & PASS & PARID=bnd\_V;MATEID=bnd\_X \\ +2 & 321682 & bnd\_V & T & $]13:123456]$T & 6 & PASS & PARID=bnd\_W;MATEID=bnd\_U \\ +13 & 123456 & bnd\_U & C & C$[2:321682[$ & 6 & PASS & PARID=bnd\_X;MATEID=bnd\_V \\ +13 & 123460 & bnd\_X & A & $]2:321681]$A & 6 & PASS & PARID=bnd\_U;MATEID=bnd\_W \\ +\end{tabular} +\normalsize + +\subsubsection{Telomeres} +For a rearrangement involving the telomere end of a reference chromosome, we define a virtual telomeric breakend that serves as a breakend partner for the breakend at the telomere. That way every breakend has a partner. If the chromosome extends from position 1 to N, then the virtual telomeric breakends are at positions 0 and N+1. For example, to describe the reciprocal translocation of the entire chromosome 1 into chromosome 13, as illustrated in Figure 6: + +\begin{figure}[h] +\centering +\includegraphics[width=4in,height=2.51in]{img/telomere-400x251.png} +\caption{Telomeres} +\end{figure} + +the records would look like: + +\small +\begin{tabular}{ l l l l l l l l } +\#CHROM & POS & ID & REF & ALT & QUAL & FILTER & INFO \\ +1 & 0 & bnd\_X & N & $.[13:123457[$ & 6 & PASS & SVTYPE=BND;MATEID=bnd\_V \\ +1 & 1 & bnd\_Y & T & $]13:123456]$T & 6 & PASS & SVTYPE=BND;MATEID=bnd\_U \\ +13 & 123456 & bnd\_U & C & C$[1:1[$ & 6 & PASS & SVTYPE=BND;MATEID=bnd\_Y \\ +13 & 123457 & bnd\_V & A & $]1:0]$A & 6 & PASS & SVTYPE=BND;MATEID=bnd\_X \\ +\end{tabular} +\normalsize + +\subsubsection{Event modifiers} +As mentioned previously, a single rearrangement event can be described as a set of novel adjacencies. For example, a reciprocal rearrangement such as in Figure 7: + +\begin{figure}[h] +\centering +\includegraphics[width=4in,height=1.92in]{img/reciprocal_rearrangement-400x192.png} +\caption{Rearrangements} +\end{figure} + +would be described as: + +\vspace{0.3cm} +\footnotesize +\begin{tabular}{ l l l l l l l l } +\#CHROM & POS & ID & REF & ALT & QUAL & FILTER & INFO \\ +2 & 321681 & bnd\_W & G & G$[13:123457[$ & 6 & PASS & SVTYPE=BND;MATEID=bnd\_X;EVENT=RR0 \\ +2 & 321682 & bnd\_V & T & $]13:123456]$T & 6 & PASS & SVTYPE=BND;MATEID=bnd\_U;EVENT=RR0 \\ +13 & 123456 & bnd\_U & C & C$[2:321682[$ & 6 & PASS & SVTYPE=BND;MATEID=bnd\_V;EVENT=RR0 \\ +13 & 123457 & bnd\_X & A & $]2:321681]$A & 6 & PASS & SVTYPE=BND;MATEID=bnd\_W;EVENT=RR0 \\ +\end{tabular} +\normalsize + +\subsubsection{Inversions} +Similarly an inversion such as in Figure 8: + +\begin{figure}[ht] +\centering +\includegraphics[width=4in,height=0.95in]{img/inversion-400x95.png} +\caption{Inversion} +\end{figure} + +can be described equivalently in two ways. Either one uses the short hand notation described previously (recommended for simple cases): + +\vspace{0.3cm} +\small +\begin{tabular}{ l l l l l l l l } +\#CHROM & POS & ID & REF & ALT & QUAL & FILTER & INFO \\ +2 & 321682 & INV0 & T & $<$INV$>$ & 6 & PASS & SVTYPE=INV;END=421681 \\ +\end{tabular} +\normalsize +\vspace{0.3cm} + +or one describes the breakends: + +\vspace{0.3cm} +\footnotesize +\begin{tabular}{ l l l l l l l l } +\#CHROM & POS & ID & REF & ALT & QUAL & FILTER & INFO \\ +2 & 321681 & bnd\_W & G & G$]2:421681]$ & 6 & PASS & SVTYPE=BND;MATEID=bnd\_U;EVENT=INV0 \\ +2 & 321682 & bnd\_V & T & $[2:421682[$T & 6 & PASS & SVTYPE=BND;MATEID=bnd\_X;EVENT=INV0 \\ +2 & 421681 & bnd\_U & A & A$]2:321681]$ & 6 & PASS & SVTYPE=BND;MATEID=bnd\_W;EVENT=INV0 \\ +2 & 421682 & bnd\_X & C & $[2:321682[$C & 6 & PASS & SVTYPE=BND;MATEID=bnd\_V;EVENT=INV0 \\ +\end{tabular} +\normalsize + +\subsubsection{Uncertainty around breakend location} +It sometimes is difficult to determine the exact position of a break, generally because of homologies between the sequences being modified, such as in Figure 9. The breakend is then placed arbitrarily at the left most position, and the uncertainty is represented with the CIPOS tag. The ALT string is then constructed assuming this arbitrary breakend choice. + +\begin{figure}[h] +\centering +\includegraphics[width=4in,height=2.48in]{img/microhomology-400x248.png} +\caption{Homology} +\end{figure} + +The figure above represents a nonreciprocal translocation with microhomology. Even if we know that breakend U is rearranged with breakend V, actually placing these breaks can be extremely difficult. The red and green dashed lines represent the most extreme possible recombination events which are allowed by the sequence evidence available. We therefore place both U and V arbitrarily within the interval of possibility: + +\vspace{0.3cm} +\footnotesize +\begin{tabular}{ l l l l l l l l } +\#CHROM & POS & ID & REF & ALT & QUAL & FILTER & INFO \\ +2 & 321681 & bnd\_V & T & T$]13:123462]$ & 6 & PASS & SVTYPE=BND;MATEID=bnd\_U;CIPOS=0,6 \\ +13 & 123456 & bnd\_U & A & A$]2:321687]$ & 6 & PASS & SVTYPE=BND;MATEID=bnd\_V;CIPOS=0,6 \\ +\end{tabular} +\normalsize +\vspace{0.3cm} + +Note that the coordinate in breakend U's ALT string does not correspond to the designated position of breakend V, but to the position that V would take if U's position were fixed (and vice-versa). The CIPOS tags describe the uncertainty around the positions of U and V. + +The fact that breakends U and V are mates is preserved thanks to the MATEID tags. If this were a reciprocal translocation, then there would be additional breakends X and Y, say with X the partner of V on Chr 2 and Y the partner of U on Chr 13, and there would be two more lines of VCF for the XY novel adjacency. Depending on which positions are chosen for the breakends X and Y, it might not be obvious that X is the partner of V and Y is the partner of U from their locations alone. This partner relation ship can be specified explicitly with the tag PARID=bnd\_X in the VCF line for breakend V and PARID=bnd\_Y in the VCF line for breakend U, and vice versa. + +\subsubsection{Single breakends} +We allow for the definition of a breakend that is not part of a novel adjacency, also identified by the tag SVTYPE=BND. We call these single breakends, because they lack a mate. Breakends that are unobserved partners of breakends in observed novel adjacencies are one kind of single breakend. For example, if the true situation is known to be either as depicted back in Figure 1, and we only observe the adjacency (U,V), and no adjacencies for W, X, Y, or Z, then we cannot be sure whether we have a simple reciprocal translocation or a more complex 3-break operation. Yet we know the partner X of U and the partner W of V exist and are breakends. In this case we can specify these as single breakends, with unknown mates. The 4 lines of VCF representing this situation would be: + +\vspace{0.3cm} +\small +\begin{tabular}{ l l l l l l l l } +\#CHROM & POS & ID & REF & ALT & QUAL & FILTER & INFO \\ +2 & 321681 & bnd\_W & G & G. & 6 & PASS & SVTYPE=BND \\ +2 & 321682 & bnd\_V & T & $]13:123456]$T & 6 & PASS & SVTYPE=BND;MATEID=bnd\_U \\ +13 & 123456 & bnd\_U & C & C$[2:321682[$ & 6 & PASS & SVTYPE=BND;MATEID=bnd\_V \\ +13 & 123457 & bnd\_X & A & .A & 6 & PASS & SVTYPE=BND \\ +\end{tabular} +\normalsize +\vspace{0.3cm} + +On the other hand, if we know a simple reciprocal translocation has occurred as in Figure 7, then even if we have no evidence for the (W,X) adjacency, for accounting purposes an adjacency between W and X may also be recorded in the VCF file. These two breakends W and X can still be crossed-referenced as mates. The 4 VCF records describing this situation would look exactly as below, but perhaps with a special quality or filter value for the breakends W and X. + +Another possible reason for calling single breakends is an observed but unexplained change in copy number along a chromosome. + +\vspace{0.3cm} +\scriptsize +\begin{tabular}{ l l l l l l l l } +\#CHROM & POS & ID & REF & ALT & QUAL & FILTER & INFO \\ +3 & 12665 & bnd\_X & A & .A & 6 & PASS & SVTYPE=BND;CIPOS=-50,50 \\ +3 & 12665 & . & A & $<$DUP$>$ & 14 & PASS & SVTYPE=DUP;END=13686;CIPOS=-50,50;CIEND=-50,50 \\ +3 & 13686 & bnd\_Y & T & T. & 6 & PASS & SVTYPE=BND;CIPOS=-50,50 \\ +\end{tabular} +\normalsize +\vspace{0.3cm} + +Finally, if an insertion is detected but only the first few base-pairs provided by overhanging reads could be assembled, then this inserted sequence can be provided on that line, in analogy to paired breakends: + +\vspace{0.3cm} +\scriptsize +\begin{tabular}{ l l l l l l l l } +\#CHROM & POS & ID & REF & ALT & QUAL & FILTER & INFO \\ +3 & 12665 & bnd\_X & A & .TGCA & 6 & PASS & SVTYPE=BND;CIPOS=-50,50 \\ +3 & 12665 & . & A & $<$DUP$>$ & 14 & PASS & SVTYPE=DUP;END=13686;CIPOS=-50,50;CIEND=-50,50 \\ +3 & 13686 & bnd\_Y & T & TCC. & 6 & PASS & SVTYPE=BND;CIPOS=-50,50 \\ +\end{tabular} +\normalsize + +\subsubsection{Sample mixtures} +It may be extremely difficult to obtain clinically perfect samples, with only one type of cell. Let's imagine that two samples are taken from a cancer patient: healthy blood, and some tumor tissue with an estimated 30\% stromal contamination. This would then be expressed in the header as: + +\footnotesize +\begin{verbatim} +##SAMPLE= +##SAMPLE= +\end{verbatim} +\normalsize + +Because of this distinction between sample and genome, it is possible to express the data along both distinctions. For example, in a first pass, a structural variant caller would simply report counts per sample. Using the example of the inversion just above, the VCF code could become: + +\vspace{0.3cm} +\tiny +\begin{flushleft} +\begin{tabular}{ l l l l l l l l l l l } +\#CHROM & POS & ID & REF & ALT & QUAL & FILTER & INFO & FORMAT & Blood & TissueSample\\ +2 & 321681 & bnd\_W & G & G$]2:421681]$ & 6 & PASS & SVTYPE=BND;MATEID=bnd\_U & GT:DPADJ & 0:32 & $0\mid1:9\mid21$ \\ +2 & 321682 & bnd\_V & T & $[2:421682[$T & 6 & PASS & SVTYPE=BND;MATEID=bnd\_X & GT:DPADJ & 0:29 & $0\mid1:11\mid25$ \\ +13 & 421681 & bnd\_U & A & A$]2:321681]$ & 6 & PASS & SVTYPE=BND;MATEID=bnd\_W & GT:DPADJ & 0:34 & $0\mid1:10\mid23$ \\ +13 & 421682 & bnd\_X & C & $[2:321682[$C & 6 & PASS & SVTYPE=BND;MATEID=bnd\_V & GT:DPADJ & 0:31 & $0\mid1:8\mid20$ \\ +\end{tabular} +\end{flushleft} +\normalsize +\vspace{0.3cm} + +However, a more evolved algorithm could attempt actually deconvolving the two genomes and generating copy number estimates based on the raw data: + +\vspace{0.3cm} +\tiny +\begin{flushleft} +\begin{tabular}{ l l l l l l l l l l l } +\#CHROM & POS & ID & REF & ALT & QUAL & FILTER & INFO & FORMAT & Blood & TumorSample \\ +2 & 321681 & bnd\_W & G & G$]2:421681]$ & 6 & PASS & SVTYPE=BND;MATEID=bnd\_U & GT:CNADJ & 0:1 & 1:1 \\ +2 & 321682 & bnd\_V & T & $[2:421682[$T & 6 & PASS & SVTYPE=BND;MATEID=bnd\_X & GT:CNADJ & 0:1 & 1:1 \\ +13 & 421681 & bnd\_U & A & A$]2:321681]$ & 6 & PASS & SVTYPE=BND;MATEID=bnd\_W & GT:CNADJ & 0:1 & 1:1 \\ +13 & 421682 & bnd\_X & C & $[2:321682[$C & 6 & PASS & SVTYPE=BND;MATEID=bnd\_V & GT:CNADJ & 0:1 & 1:1 \\ +\end{tabular} +\end{flushleft} +\normalsize + +\subsubsection{Clonal derivation relationships} +In cancer, each VCF file represents several genomes from a patient, but one genome is special in that it represents the germline genome of the patient. This genome is contrasted to a second genome, the cancer tumor genome. In the simplest case the VCF file for a single patient contains only these two genomes. This is assumed in most of the discussion of the sections below. + +In general there may be several tumor genomes from the same patient in the VCF file. Some of these may be secondary tumors derived from an original primary tumor. We suggest the derivation relationships between genomes in a cancer VCF file be represented in the header with PEDIGREE tags. + +Analogously, there might also be several normal genomes from the same patient in the VCF (typically double normal studies with blood and solid tissue samples). These normal genomes are then considered to be derived from the original germline genome, which has to be inferred by parsimony. + +The general format of a PEDIGREE line describing asexual, clonal derivation is: + +\begin{verbatim} +PEDIGREE= +\end{verbatim} + +This line asserts that the DNA in genome is asexually or clonally derived with mutations from the DNA in genome. This is the asexual analog of the VCF format that has been proposed for family relationships between genomes, i.e. there is one entry per of the form: + +\begin{verbatim} +PEDIGREE= +\end{verbatim} + +Let's consider a cancer patient VCF file with 4 genomes: germline, primary\_tumor, secondary\_tumor1, and secondary\_tumor2 as illustrated in Figure 10. The primary\_tumor is derived from the germline and the secondary tumors are each derived independently from the primary tumor, in all cases by clonal derivation with mutations. The PEDIGREE lines would look like: + +\begin{figure}[ht] +\centering +\includegraphics[width=4in,height=2.67in]{img/derivation-400x267.png} +\caption{Pegigree example} +\end{figure} + +\begin{verbatim} +##PEDIGREE= +##PEDIGREE= +##PEDIGREE= +\end{verbatim} + +Alternately, if data on the genomes is compiled in a database, a simple pointer can be provided: + +\begin{verbatim} +##pedigreeDB= +\end{verbatim} + +The most general form of a pedigree line is: + +\begin{verbatim} +##PEDIGREE= +\end{verbatim} + +This means that the genome Name\_0 is derived from the N $\ge$ 1 genomes Name\_1, ..., Name\_N. Based on these derivation relationships two new pieces of information can be specified. + +Firstly, we wish to express the knowledge that a variant is novel to a genome, with respect to its parent genome. Ideally, this could be derived by simply comparing the features on either genomes. However, insufficient data or sample mixtures might prevent us from clearly determining at which stage a given variant appeared. This would be represented by a mutation quality score. + +Secondly, we define a \textbf{haplotype} as a set of variants which are known to be on the same chromosome in the germline genome. Haplotype identifiers must be unique across the germline genome, and are conserved along clonal lineages, regardless of mutations, rearrangements, or recombination. In the case of the duplication of a region within a haplotype, one copy retains the original haplotype identifier, and the others are considered to be novel haplotypes with their own unique identifiers. All these novel haplotypes have in common their \textbf{haplotype ancestor} in the parent genome. + +\subsubsection{Phasing adjacencies in an aneuploid context} +In a cancer genome, due to duplication followed by mutation, there can in principle exist any number of haplotypes in the sampled genome for a given location in the reference genome. We assume each haplotype that the user chooses to name is named with a numerical haplotype identifier. Although it is difficult with current technologies to associate haplotypes with novel adjacencies, it might be partially possible to deconvolve these connections in the near future. We therefore propose the following notation to allow haplotype-ambiguous as well as haplotype-unambiguous connections to be described. The general term for these haplotype-specific adjacencies is \textbf{bundles}. + +The diagram in Figure 11 will be used to support examples below: + +\begin{figure}[ht] +\centering +\includegraphics[width=4in,height=2.59in]{img/phasing-400x259.png} +\caption{Phasing} +\end{figure} + +In this example, we know that in the sampled genome: + +\begin{enumerate} + \item A reference bundle connects breakend U, haplotype 5 on chr13 to its partner, breakend X, haplotype 5 on chr13, + \item A novel bundle connects breakend U, haplotype 1 on chr13 to its mate breakend V, haplotype 11 on chr2, and finally, + \item A novel bundle connects breakend U, haplotypes 2, 3 and 4 on chr13 to breakend V, haplotypes 12, 13 or 14 on chr2 without any explicit pairing. +\end{enumerate} + +These three are the bundles for breakend U. Each such bundle is referred to as a haplotype of the breakend U. Each allele of a breakend corresponds to one or more haplotypes. In the above case there are two alleles: the 0 allele, corresponding to the adjacency to the partner X, which has haplotype (1), and the 1 allele, corresponding to the two haplotypes (2) and (3) with adjacency to the mate V. + +For each haplotype of a breakend, say the haplotype (2) of breakend U above, connecting the end of haplotype 1 on a segment of Chr 13 to a mate on Chr 2 with haplotype 11, in addition to the list of haplotype-specific adjacencies that define it, we can also specify in VCF several other quantities. These include: + +\begin{enumerate} + \item The depth of reads on the segment where the breakend occurs that support the haplotype, e.g. the depth of reads supporting haplotype 1 in the segment containing breakend U + \item The estimated copy number of the haplotype on the segment where the breakend occurs + \item The depth of paired-end or split reads that support the haplotype-specific adjacencies, e.g. that support the adjacency between haplotype 1 on Chr 13 to haplotype 11 on Chr 2 + \item The estimated copy number of the haplotype-specific adjacencies + \item An overall quality score indicating how confident we are in this asserted haplotype +\end{enumerate} +These are specified using the using the DP, CN, BDP, BCN, and HQ subfields, respectively. The total information available about the three haplotypes of breakend U in the figure above may be visualized in a table as follows. + +\vspace{0.3cm} +\begin{tabular}{ l l l l } +Allele & 1 & 1 & 0 \\ +Haplotype & 1$>$11 & 2,3,4$>$12,13,14 & 5$>$5 \\ +Segment Depth & 5 & 17 & 4 \\ +Segment Copy Number & 1 & 3 & 1 \\ +Bundle Depth & 4 & 0 & 3 \\ +Bundle Copy Number & 1 & 3 & 1 \\ +Haplotype quality & 30 & 40 & 40 \\ +\end{tabular} + +\pagebreak +\section{BCF specification} + +VCF is very expressive, accommodates multiple samples, and is widely used +in the community. Its biggest drawback is that it is big and slow. +Files are text and therefore require a lot of space on disk. A normal batch +of a hundred exomes is a few GB, but large-scale VCFs with thousands of exome +samples quickly become hundreds of GBs. Because the file is text, it is +extremely slow to parse. + +Overall, the idea behind is BCF2 is simple. BCF2 is a binary, compressed +equivalent of VCF that can be indexed with tabix and can be efficiently decoded +from disk or streams. For efficiency reasons BCF2 only supports a subset +of~VCF, in that all info and genotype fields must have their full types +specified. That is, BCF2 requires that if e.g. an info field {\tt AC} is +present then it must contain an equivalent VCF header line noting that {\tt AC} +is an allele indexed array of type integer. + +\subsection{Overall file organization} + +A BCF2 file is composed of a mandatory header, followed by a series of BGZF +compressed blocks of binary BCF2 records. The BGZF blocks allow BCF2 files +to be indexed with tabix. + +BGZF blocks are composed of a VCF header with a few additional records and a +block of records. Following the last BGZF BCF2 record block is an empty +BGZF block (a block containing zero type of data), indicating that the +records are done. + +A BCF2 header follows exactly the specification as VCF, with a few +extensions/restrictions: +\begin{itemize} +\item All BCF2 files must have fully specified contigs definitions. +No record may refer to a contig not present in the header itself. + +\item All INFO and GENOTYPE fields must be fully typed in the BCF2 header to +enable type-specific encoding of the fields in records. An error should be +thrown when converting a VCF to BCF2 when an unknown or not fully specified +field is encountered in the records. +\end{itemize} + +\subsection{Header} + +The BCF2 header begins with the ``BCF2 magic'' 5 bytes that encode +{\tt BCF\em XY} where {\em X} and {\em Y} are bytes indicating the major +number (currently 2) and the minor number (currently 1). This magic can be +used to quickly examine the file to determine that it's a BCF2 file. +Immediately following the BCF2 magic is the standard VCF header lines in text +format, beginning with \verb|##fileformat=VCFvX.Y|. Because the type is +encoded directly in the header, the recommended extension for BCF2 formatted +files is {\sl .bcf}. BCF2 supports encoding values in a dictionary of strings. +The string map is provided by the keyword \verb|##dictionary=S0,S1,...,SN| as a +comma-separate ordered list of strings. See the ``Dictionary of strings'' +section for more details. + +\subsubsection{Dictionary of strings} + +Throughout the BCF file most string values are be specified by integer +reference to their dictionary values. For example, the following VCF record: +\small +\begin{verbatim} +##INFO= +##INFO= +##INFO= +##contig= +#CHROM POS ID REF ALT QUAL FILTER INFO +20 10144 rs144773400 TA T . PASS ASP;RSPOS=10145,dbSNPBuildID=134 +20 10228 rs143255646 TA T . PASS ASP;RSPOS=10229;dbSNPBuildID=134 +\end{verbatim} +\normalsize +would be encoded inline in BCF2 by reference to the relative position of the header line in the header (ASP=1, RSPOS=2, dbSNPBuildID=3, and PASS implicitly encoded in the last offset PASS=4) + +\small +\begin{verbatim} +##INFO= +##INFO= +##INFO= +##contig= +#CHROM POS ID REF ALT QUAL FILTER INFO +0 10144 rs144773400 TA T . s0 s1;s2=10145;s3=134 +0 10228 rs143255646 TA T . s0 s1;s2=10229;s3=134 +\end{verbatim} +\normalsize + +{\color{red} +Defined this way, the dictionary of strings depends on the order and the +presence of all preceding header lines. If an existing tag needs to be removed +from a BCF, also all consequent tags throughout the whole BCF would have to be +recoded. In order to avoid this costly operation, a new IDX field can be used +to explicitly define the position which is dropped on BCF-to-VCF conversion. If +not present, the implicit relative position is assumed. If the IDX field is +present in one record, it must be present also in all other dictionary-defining +records. +} + +Note that the dictionary encoding has the magic prefix `s' here to indicate that the field's value is actually in the dictionary entry giving by the subsequent offset. This representation isn't actually the one used in BCF2 records but it provides a clean visual guide for the above example. Note also how the contig has been recoded as a offset into the list of contig declarations. + +Note that ``PASS'' is always implicitly encoded as the first entry in the header dictionary. This is because VCF allows FILTER fields to be PASS without explicitly listing this in the FILTER field itself. + + +\subsubsection{Dictionary of contigs} + +The CHROM field in BCF2 is encoded as an integer offset into the list of \verb|##contig| field headers in the VCF header. The offsets begin, like the dictionary of strings, at 0. So for example if in BCF2 the contig value is 10, this indicates that the actual chromosome is the 11th element in the ordered list of \verb|##contig| elements. Here's a more concrete example: + +\small +\begin{verbatim} +##contig= +##contig= +##contig= +#CHROM POS ID REF ALT QUAL FILTER INFO +20 1 . T A . PASS . +21 2 . T A . PASS . +22 3 . T A . PASS . +\end{verbatim} +\normalsize + +the actual CHROM field values in the encoded BCF2 records would be 0, 1, and 2 corresponding to the first (offset 0) \verb|##contig| element, etc. + +\subsection{BCF2 records} + + +In BCF2, the original VCF records are converted to binary and encoded as BGZF +blocks. Each record is conceptually two parts. First is the site information +(chr, pos, INFO field). Immediately after the sites data is the genotype data +for every sample in the BCF2 file. The genotype data may be omitted entirely +from the record if there is no genotype data in the VCF file. Note that it's +acceptable to not BGZF compress a BCF2 file. + +\subsubsection{Site encoding} + +{\small +\begin{tabular}{|l | l | p{30em} | } \hline +\textbf{Field} & \textbf{Type} & \textbf{Notes} \\ \hline +l\_shared & uint32\_t & Data length from CHROM to the end of INFO \\ \hline +l\_indiv & uint32\_t & Data length of FORMAT and individual genotype fields \\ \hline +CHROM & int32\_t & Given as an offset into the mandatory contig dictionary \\ \hline +POS & int32\_t & 0-based leftmost coordinate \\ \hline +rlen & int32\_t & Length of the record as projected onto the reference sequence. May be the actual length of the REF allele but for symbolic alleles should be the declared length respecting the END attribute \\ \hline +n\_allele\_info & int32\_t & n\_info, where n\_allele is the number of REF+ALT alleles in this record, and n\_info is the number of VCF INFO fields present in this record \\ \hline +n\_fmt\_sample & uint32\_t & n\_sample, where n\_fmt is the number of format fields for genotypes in this record, and n\_samples is the number of samples present in this sample. Note that the number of samples must be equal to the number of samples in the header \\ \hline +QUAL & float & Variant quality; 0x7F800001 for a missing value \\ \hline +ID & typed string & REF+ALT list of n\_allele typed strings the first allele is REF (mandatory) followed by n\_alleles - 1 ALT alleles, all encoded as typed strings \\ \hline +FILTER & Typed vector of integers & a vector of integer offsets into dictionary, one for each FILTER field value. ``.'' is encoded as MISSING \\ \hline +INFO & field key/value pairs & n\_info pairs of typed vectors The first value must be a typed atomic integer giving the offset of the INFO field key into the dictionary. The second value is a typed vector giving the value of the field \\ \hline +Genotype values & see below & see below \\ \hline +\end{tabular}} + +\subsubsection{Genotype encoding} + +Genotype fields are encoded not by sample as in VCF but rather by field, with a vector of values for each sample following each field. In BCF2, the following VCF line: + +\vspace{0.3cm} +\begin{tabular}{l l l l} +FORMAT & NA00001 & NA00002 & NA00003 \\ +GT:GQ:DP & 0/0:48:1 & 0/1:48:8 & 1/1:43:5 \\ +\end{tabular} +\vspace{0.3cm} + +would encoded as the equivalent of: + +\vspace{0.3cm} +\begin{tabular}{l l l l} +GT=0/0,0/1,1/1 & GQ=48,9,43 & DP=1,8,5 +\end{tabular} +\vspace{0.3cm} + +Suppose there are i genotype fields in a specific record. Each i is encoded by a triplet: + +BCF2 site information encoding + +\vspace{0.3cm} +\small +\begin{tabular}{ | p{2cm} | p{2.5cm} | p{9.5cm} | } \hline +Field & Type & Notes \\ \hline +fmt\_key & typed int & Format key as an offset into the dictionary \\ \hline +fmt\_type & uint8\_t+ & Typing byte of each individual value, possibly followed by a typed int for the vector length. In effect this is the same as the typing value for a single vector, but for genotype values it appears only once before the array of genotype field values \\ \hline +fmt\_values (by fmt type) & Array of values & The information of each individual is concatenated in the vector. Every value is of the same fmt type. Variable-length vectors are padded with missing values; a string is stored as a vector of char \\ \hline +\end{tabular} +\normalsize +\vspace{0.3cm} + +The value is always implicitly a vector of N values, where N is the number of samples. The type byte of the value field indicates the type of each value of the N length vector. For atomic values this is straightforward (size = 1). But if the type field indicates that the values are themselves vectors (as often occurs, such as with the PL field) then each of the N values in the outer vector is itself a vector of values. This encoding is efficient when every value in the genotype field vector has the same length and type. + +Note that the specific order of fields isn't defined, but it's probably a good idea to respect the ordering as specified in the input VCF/BCF2 file. + +If there are no sample records (genotype data) in this VCF/BCF2 file, the size of the genotypes block will be 0. + + +\subsubsection{Type encoding} + +In BCF2 values are all strongly typed in the file. The type information is encoded in a prefix byte before the value, which contains information about the low-level type of the value(s) such as int32 or float, as well as the number of elements in the value. The encoding is as follows: + +\vspace{0.3cm} +\textbf{BCF2 type descriptor byte} + +\vspace{0.3cm} +\begin{tabular}{|p{2cm} | p{10cm}|} \hline +Bit & Meaning \\ \hline +5,6,7,8 bits & The number of elements of the upcoming type. For atomic values, the size must be 1. If the size is set to 15, this indicates that the vector has 15 or more elements, and that the subsequent BCF2 byte stream contains a typed Integer indicating the true size of the vector. If the size is between 2-14, then this Integer is omitted from the stream and the upcoming stream begins immediately with the first value of the vector. A size of 0 indicates that the value is MISSING. \\ \hline +1,2,3,4 bits & Type \\ \hline +\end{tabular} +\vspace{0.3cm} + +The final four bits encodes an unsigned integer that indicates the type of the upcoming value in the data stream. + +\textbf{BCF2 types} + +\vspace{0.3cm} +\begin{tabular}{|l | l | l|} \hline +Lowest 4 bits & Hexadecimal encoding & Corresponding atomic type \\ \hline +1 & 0x?1 & Integer [8 bit] \\ \hline +2 & 0x?2 & Integer [16 bit] \\ \hline +3 & 0x?3 & Integer [32 bit] \\ \hline +5 & 0x?5 & Float [32 bit] \\ \hline +7 & 0x?7 & Character, ASCII encoded in 8 bits \\ \hline +\end{tabular} +\vspace{0.3cm} + +Note this is not used in BCF2, but its type is reserved in case this becomes necessary. In BCF2 characters are simply represented by strings with a single element 0,4,6,8-15 reserved for future use. + +\vspace{0.3cm} + +\textbf{Integers} may be encoded as 8, 16, or 32 bit values, in little-endian +order. It is up to the encoder to determine the appropriate ranged value to +use when writing the BCF2 file. +{\color{red} +For integer types, the values 0x80, 0x8000, 0x80000000 are interpreted as +missing values and 0x81, 0x8001, 0x80000001 as end-of-vector indicators +(for 8, 16, and 32 bit values, respectively). Note that the end-of-vector byte +is not part of the vector itself and only end-of-vector bytes can follow. +In total, eight values are reserved for future use: 0x80-0x87, 0x8000-0x8007, 0x80000000-0x80000007. } + + +\vspace{0.3cm} +\textbf{Floats} are encoded as single-precision (32 bit) in the basic format +defined by the IEEE-754-1985 standard. This is the standard representation for +floating point numbers on modern computers, with direct support in programming +languages like C and Java (see Java's Double class for example). BCF2 supports +the full range of values from -Infinity to +Infinity, including NaN. BCF2 +needs to represent missing values for single precision floating point numbers. +This is accomplished by writing the NaN value as the quiet NaN (qNaN), while +the MISSING value is encoded as a signaling NaN. From the NaN wikipedia entry, +we have: + +\begin{quote} +For example, a bit-wise example of a IEEE floating-point standard single +precision (32-bit) NaN would be: s111 1111 1axx xxxx xxxx xxxx xxxx xxxx where +s is the sign (most often ignored in applications), a determines the type of +NaN, and x is an extra payload (most often ignored in applications). If a = 1, +it is a quiet NaN; if a is zero and the payload is nonzero, then it is a +signaling NaN. +\end{quote} + +\noindent A good way to understand these values is to play around with the IEEE encoder webiste. + +\vspace{0.3cm} +{\color{red} +\noindent Similarly to integers, the float value of 0x7F800001 is interpreted as a missing value +and 0x7F800002 as the end-of-vector indicator. Note that the end-of-vector byte +is not part of the vector itself and only end-of-vector bytes can follow. In total, +eight values are reservd for future use: +} + +\vspace{0.1cm} +\begin{tabular}{| l | c | l |} \hline +\textbf{Value} & \textbf{32-bit precision} & \textbf{Hexadecimal representation} \\ \hline +NaN & 0b0111 1111 1100 0000 0000 0000 0000 0000 & 0x7FC00000 \\ \hline +missing & 0b0111 1111 1000 0000 0000 0000 0000 0001 & 0x7F800001 \\ \hline +end-of-vector & 0b0111 1111 1000 0000 0000 0000 0000 0010 & 0x7F800002 \\ \hline +reserved & 0b0111 1111 1000 0000 0000 0000 0000 0011 & 0x7F800003 \\ \hline +$\ldots$ & $\ldots$ & $\ldots$ \\ \hline +reserved & 0b0111 1111 1000 0000 0000 0000 0000 0111 & 0x7F800007 \\ \hline +\end{tabular} + +\vspace{0.3cm} +\textbf{Character} values are not explicitly typed in BCF2. Instead, VCF Character values should be encoded by a single character string. As with Strings, UNICODE characters are not supported. + +\vspace{0.3cm} +\textbf{Flags} values -- which can only appear in INFO fields -- in BCF2 should be encoded by any non-MISSING value. The recommended best practice is to encode the value as an 1-element INT8 (type 0x11) with value of 1 to indicate present. Because FLAG values can only be encoded in INFO fields, BCF2 provides no mechanism to encode FLAG values in genotypes, but could be easily extended to do so if allowed in a future VCF version. + +\vspace{0.3cm} +\textbf{String} values have two basic encodings. For INFO, FORMAT, and FILTER keys these are encoded by integer offsets into the header dictionary. For string values, such as found in the ID, REF, ALT, INFO, and FORMAT fields, strings are encoded as typed array of ASCII encoded bytes. The array isn't terminated by a null byte. The length of the string is given by the length of the type descriptor. + +Suppose you want to encode the string ACAC. First, we need the type descriptor byte, which is the string type 0x07 or'd with inline size (4) yielding the type byte of 0x40 | 0x07 = 0x47. Immediately following the type byte is the four byte ASCII encoding of ``ACAC'' 0x41 0x43 0x41 0x43. So the final encoding is: + +\vspace{0.1cm} +\begin{tabular}{| l | l |} \hline +0x47 0x41 0x43 0x41 0x43 & String type with inline size of 4 followed by ACAC in ASCII \\ \hline +\end{tabular} +\vspace{0.3cm} + +Suppose you want to encode the string MarkDePristoWorksAtTheBroad, a string of size 27. First, we need the type descriptor byte, which is the string type 0x07. Because the size exceeds the inline size ($27 > 15$) we set the size to overflow, yielding the type byte of 0xF0 | 0x07 = 0xF7. Immediately following the type byte is the typed size of 27, which we encode by the atomic INT8 value: 0x11 followed by the actual size 0x1B. Finally comes the actual bytes of the string: 0x4D 0x61 0x72 0x6B 0x44 0x65 0x50 0x72 0x69 0x73 0x74 0x6F 0x57 0x6F 0x72 0x6B 0x73 0x41 0x74 0x54 0x68 0x65 0x42 0x72 0x6F 0x61 0x64. So the final encoding is: + +\vspace{0.3cm} +\begin{tabular}{ | p{9cm} | p{6cm} | } \hline +0xF7 & string with overflow size \\ \hline +0x11 0x1B & overflow size encoded as INT8 with value 27 \\ \hline +0x4D 0x61 0x72 0x6B 0x44 0x65 0x50 0x72 0x69 0x73 0x74 0x6F 0x57 0x6F 0x72 0x6B 0x73 0x41 0x74 0x54 0x68 0x65 0x42 0x72 0x6F 0x61 0x64 & message in ASCII \\ \hline +\end{tabular} +\vspace{0.3cm} + +Suppose you want to encode the missing value `.'. This is simply a string of size 0 = 0x07. + +\vspace{0.3cm} +In VCF there are sometimes fields of type list of strings, such as a number +field of unbounded size encoding the amino acid changes due to a mutation. +Since BCF2 doesn't directly support vectors of strings (a vector of character +is already a string) we collapse the list of strings into a single +comma-separated string, encode it as a regular BCF2 vector of characters, and +on reading explode it back into the list of strings. This works because +strings in VCF cannot contain `{ \tt ,}' (it's a field separator) and so we can +safely use `{\tt ,}' to separate the individual strings. + +% String vectors in BCF do not need to start with comma, as the number of +% values is indicated already in the definition of the tag in the header. +% +% For efficiency +% reasons we put a comma at the start of the collapsed string, so that just the +% first character can be examined to determine if the string is collapsed. +% +% To be concrete, suppose we have a info field around X=[A,B,C,D]. This is +% encoded in BCF2 as a single string ``,A,B,C,D'' of size 8, so it would have +% type byte 0x87 followed by the ASCII encoding 0x2C 0x41 0x2C 0x42 0x2C 0x43 +% 0x2C 0x44. + +\vspace{0.3cm} + +\textbf{Vectors} --- The BCF2 type byte may indicate that the upcoming data stream contains not a single value but a fixed length vector of values. The vector values occur in order (1st, 2nd, 3rd, etc) encoded as expected for the type declared in the vector's type byte. For example, a vector of 3 16-bit integers would be layed out as first the vector type byte, followed immediately by 3 2-byte values for each integer, including a total of 7 bytes. + +Missing values in vectors are handled slightly differently from atomic values. There are two possibilities for missing values: + +One (or more) of the values in the vector may be missing, but others in the vector are not. Here each value should be represented in the vector, and each corresponding BCF2 vector value either set to its present value or the type equivalent MISSING value. +Alternatively the entire vector of values may be missing. In this case the correct encoding is as a type byte with size 0 and the appropriate type MISSING. +Suppose we are encoding the record ``AC=[1,2,3]'' from the INFO field. The AC key is encoded in the standard way. This would be immediately followed by a typed 8-bit integer vector of size 3, which is encoded by the type descriptor 0x31. The type descriptor is immediately followed by the three 8-bit integer values: 0x01 0x02 0x03, for a grant total of 4 bytes: 0x31010203. + +Suppose we are at a site with many alternative alleles so AC=[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16]. Since there are 16 values, we have to use the long vector encoding. The type of this field is 8 bit integer with the size set to 15 to indicate that the size is the next stream value, so this has type of 0xF1. The next value in the stream is the size, as a typed 8-bit atomic integer: 0x11 with value 16 0x10. Each integer AC value is represented by it's value as a 8 bit integer. The grand total representation here is: + +\vspace{0.3cm} +\begin{tabular}{|p{9cm} | p{6cm}|} \hline +0xF1 0x01 0x10 & 8 bit integer vector with overflow size \\ \hline +0x01 0x02 0x03 0x04 0x05 0x06 0x07 0x08 0x09 0x0A 0x0B 0x0C 0x0D 0x0E 0x0F 0x10 & 1-16 as hexadecimal 8 bit integers \\ \hline +\end{tabular} +\vspace{0.3cm} + +Suppose this INFO field contains the ``AC=.'', indicating that the AC field is missing from a record with two alt alleles. The correct representation is as the typed pair of AC followed by a MISSING vector of type 8-bit integer: 0x01. + +\vspace{0.3cm} +\textbf{Vectors of mixed length} --- In some cases genotype fields may be vectors whose length differs among samples. For example, some CNV call sets encode different numbers of genotype likelihoods for each sample, given the large number of potential copy number states, rather padding all samples to have the same number of fields. For example, one sample could have CN0:0,CN1:10 and another CN0:0,CN1:10,CN2:10. In the situation when a genotype field contain vector values of different lengths, these are represented in BCF2 by a vector of the maximum length per sample, with all values in the each vector aligned to the left, and MISSING values assigned to all values not present in the original vector. The BCF2 encoder / decoder must automatically add and remove these MISSING values from the vectors. + +For example, suppose I have two samples, each with a FORMAT field X. Sample A has values [1], while sample B has [2,3]. In BCF2 this would be encoded as [1, MISSING] and [2, 3]. Diving into the complete details, suppose X is at offset 3 in the dictionary, which is encoded by the typed INT8 descriptor 0x11 followed by the value 0x03. Next we have the type of the each format field, which here is a 2 element INT8 vector: 0x21. Next we have the encoding for each sample, A = 0x01 0x80 followed by B = 0x02 0x03. All together we have: + +\vspace{0.3cm} +\begin{tabular}{|p{2cm} | l |} \hline +0x11 0x03 & X dictionary offset \\ \hline +0x21 & each value is a 2 element INT8 value \\ \hline +0x01 0x80 & A is [1, MISSING] \\ \hline +0x02 0x03 & B is [2, 3] \\ \hline +\end{tabular} +\vspace{0.3cm} + +Note that this means that it's illegal to encode a vector VCF field with missing values; the BCF2 codec should signal an error in this case. + +\vspace{0.3cm} +A \textbf{Genotype (GT) field} is encoded in a typed integer vector (can be 8, 16, or even 32 bit if necessary) with the number of elements equal to the maximum ploidy among all samples at a site. For one individual, each integer in the vector is organized as $(allele+1) << 1 \mid phased$ where allele is set to -1 if the allele in GT is a dot `.' (thus the higher bits are all 0). The vector is padded with missing values if the GT having fewer ploidy. + +Examples: + +\vspace{0.3cm} +\small +\begin{tabular}{|p{2.5cm} | p{10cm} | p{3cm}|} \hline +0/1 & in standard format $(0 + 1) << 1 \mid 0$ followed by $(1 + 1) << 1 \mid 0$ & 0x02 0x04 \\ \hline +0/1, 1/1, and 0/0 & three samples encoded consecutively & 0x020404040202 \\ \hline +$0\mid1$ & $(1 + 1) << 1 \mid 1$ = 0x05 preceded by the standard first byte value 0x04 & 0x0405 \\ \hline +./. & where both alleles are missing & 0x00 0x00 \\ \hline +0 & as a haploid it is represented by a single byte & 0x02 \\ \hline +1 & as a haploid it is represented by a single byte & 0x04 \\ \hline +0/1/2 & is tetraploid, with alleles & 0x02 0x04 0x06 \\ \hline +$0/1\mid2$ & is tetraploid with a single phased allele & 0x02 0x04 0x07 \\ \hline +0 and 0/1 & pad out the final allele for the haploid individual & 0x04 0x80 0x02 0x04\\ \hline +\end{tabular} +\normalsize + +\vspace{0.3cm} +The final example is something seen on chrX when we have a haploid male and a diploid female. The spare male allele is just assigned the missing value. +\vspace{0.3cm} + +\textbf{Misc. notes} + +A type byte value of 0x00 is an allowed special case meaning MISSING but without an explicit type provided. + + + +\subsection{Encoding a VCF record example} + +Let's encode a realistic (but made-up) VCF record. This is a A/C SNP in HM3 +(not really) called in~3 samples. In this section we'll build up the BCF2 +encoding for this record. +\scriptsize +\begin{verbatim} +#CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 NA00002 NA00003 +chr1 101 rs123 A C 30.1 PASS HM3;AC=3;AN=6;AA=C GT:GQ:DP:AD:PL 0/0:10:32:32,0:0,10,100 0/1:10:48:32,16:10,0,100 1/1:10:64:0,64:100,10,0 +\end{verbatim} +\normalsize + +\subsubsection{Encoding CHROM and POS} + +First, let's assume that {\tt chr1} is the second chromosome to appear in the +contig list---right after {\tt chrM} ({\tt MT}). So its offset is~1. +The {\tt POS} BCF2 field value is~101 (obviously). Because these are both +typed values in the BCF2 record, we encode both in their most compact 8-bit +value form. The type byte for an atomic 8-bit integer is 0x11. The value for +the contig offset is 1 = 0x01. The value 101 is encoded as the single byte +0x65. So in total these are represented as: + +\vspace{0.3cm} +\begin{tabular}{|l | l|} \hline +0x01000000 & CHROM offset is at 1 in 32 bit little endian \\ \hline +0x64000000 & POS in 0 base 32 bit little endian \\ \hline +0x01000000 & rlen = 1 (it's just a SNP) \\ \hline +\end{tabular} + +\subsubsection{Encoding QUAL} + +The QUAL field value is 30.1, which we encode as an untyped single precision +32-bit float: + +\vspace{0.3cm} +\begin{tabular}{|l| l|} \hline +0x41 0xF0 0xCC 0xCD & QUAL = 30.1 as 32-bit float \\ \hline +\end{tabular} + +\subsubsection{Encoding ID} + +For ID type byte would is a 5-element string (type descriptor 0x59), +which would then be followed by the five bytes for the string of +{\tt 0x72 0x73 0x31 0x32 0x33}. The full encoding is: + +\vspace{0.3cm} +\begin{tabular}{|l| l|} \hline +0x59 0x72 0x73 0x31 0x32 0x33 & ID \\ \hline +\end{tabular} + +\subsubsection{Encoding REF/ALT fields} + +We encode each of REF and ALT as typed strings, first REF followed immediately +by ALT. Each is a 1 element string (0x19), which would then be followed by the +single bytes for the bases of 0x43 and 0x41: + +\vspace{0.3cm} +\begin{tabular}{|l| l|} \hline +0x19 0x41 & REF A \\ \hline +0x19 0x43 & ALT C \\ \hline +\end{tabular} + +\vspace{0.3cm} +Just for discussion, suppose instead that ALT was ALT=C,T. The only thing that could change is that there would be another typed string following immediately after C encoding 0x19 (1 element string) with the value of 0x54. + +\subsubsection{Encoding FILTER} + +``PASS'' is implicitly encoded as the last entry in the header dictionary (see dictionary of strings). Here we encode the PASS FILTER field as a vector of size 1 of type 8-bit, which has type byte is 0x11. The value is the offset 0: + +\vspace{0.3cm} +\begin{tabular}{|l| l|} \hline +0x11 0x00 & FILTER field PASS \\ \hline +\end{tabular} + +\subsubsection{Encoding the INFO fields} + +HM3;AC=3;AN=6;AA=C +Let's assume that the header dictionary elements for HM3, AC, AN, and AA are at 80, 81, 82, and 83 respectively. All of these can be encoded by 1-element INT8 values (0x11), with associated hex values of 0x50, 0x51, 0x52, and 0x53 respectively. + +First is HM3. The entry begins with the key: 0x11 0x50. Next we have a Flag value to indicate the field is present, represented as a 1 element INT8 value of 1. Altogether we have: + +\vspace{0.3cm} +\begin{tabular}{|l| l|} \hline +0x11 0x50 0x11 0x01 & HM3 flag is present \\ \hline +\end{tabular} +\vspace{0.3cm} + +Now let's encode the two atomic 8-bit integer fields AC and AN: + +\vspace{0.3cm} +\begin{tabular}{|l| l|} \hline +0x11 0x51 & AC key \\ \hline +0x11 0x03 & with value of 3 \\ \hline +0x11 0x52 & AN key \\ \hline +0x11 0x06 & with value of 6 \\ \hline +\end{tabular} +\vspace{0.3cm} + +The ancestral allele (AA) tell us that among other primates the original allele is C, a Character here. Because we represent Characters as single element strings in BCF2 (0x19) with value 0x43 (C). So the entire key/value pair is: + +\vspace{0.3cm} +\begin{tabular}{|l |l|} \hline +0x11 0x51 & AA key \\ \hline +0x19 0x43 & with value of C \\ \hline +\end{tabular} + +\subsubsection{Encoding Genotypes} + +Continuing with our example: + +\vspace{0.3cm} +\begin{tabular}{l l l l} +FORMAT & NA00001 & NA00002 & NA00003 \\ +GT:GQ:DP:AD:PL & 0/0:10:32:32,0:0,10,100 & 0/1:10:48:32,16:10,0,100 & 1/1:10:64:0,64:100,10,0 \\ +\end{tabular} +\vspace{0.3cm} + +Here we have the specially encoded GT field. We have two integer fields GQ and DP. We have the AD field, which is a vector of 2 values per sample. And finally we have the PL field which is 3 values per sample. Let's say that the FORMAT keys for GT, GQ, DP, AD, and PL are at offsets 1, 2, 3, and 4, 5, respectively. +Now let's encode each of the genotype fields in order of the VCF record (GT, GQ, DP, AD, and then PL): + +GT triplet begins with the key: 0x1101. Next is the type of the field, which will be a 2-element (diploid) INT8 type: 0x21. This is followed by 3 2-byte arrays of values 0x0202 0x0204 0x0404 (see genotype encoding example for details). The final encoding is 0x1101 0x21 0x020202040404 + +GQ triplet begins with the key 0x1102. Because these values are small, we encode them as 8 bit atomic integers with type code 0x11. As each value is the same (10 = 0x0A) the GQ field is encoded as 0x1102 0x11 0x0A0A0A + +DP almost identical to GQ. First is the 0x1103 key, followed by 3 8-bit atomic integers encoded as 0x11 (the type) 0x20 (DP=32), 0x30 (DP=48) and 0x40 (DP=64). So we have: 0x1103 0x11203040 + +AD is more complex. The key is simple, just like the others, with 0x1104. Because the AD field is a vector of 2 values for each genotype, the value of key/value pair a vector type. Because the integer values in each AD field of each sample are small they are encoded by 8 bit values. So the value type is = 0x21. For sample one there are two values: 32,0 which are 0x30 and 0x00. Samples two and three are 0x30 0x20 and 0x00 0x40 respectively. So ultimately this field is encoded as 0x1104 0x21 0x300030200040 + +PL is just like AD but with three values per sample. The key is 0x1105. Because the PL field is a vector of 3 values for each genotype, the value of key/value pair a vector type, and because the size is 3 it's encoded in the size field of the type. Again, because the integer values in each PL field of each sample are small they are encoded by 8 bit values. So the value type 0x31. For sample one there are three values: 0, 10, and 100 which are 0x00, 0x0A, and 0x64. Samples two and three have the same values but in a slightly different order. So ultimately the PL field is encoded as 0x1105 0x31 0x000A64 0x0A0064 0x640A00 + +So the genotype block contains: + +\vspace{0.3cm} +\begin{tabular}{|l| l|} \hline +0x1101 0x21 0x020202040404 & GT \\ \hline +0x1102 0x11 0x0A0A0A & GQ \\ \hline +0x1103 0x11 0x203040 & DP \\ \hline +0x1104 0x21 0x300030200040 & AD \\ \hline +0x1105 0x31 0x000A640A0064640A00 & PL \\ \hline +\end{tabular} +\vspace{0.3cm} + +\textbf{Putting it all together} + +We need to determine a few values before writing out the final block: + +l\_shared = 54 (Data length from CHROM to the end of INFO) + +l\_indiv = 42 (Data length of FORMAT and individual genotype fields) + +n\_allele\_info = n\_allele$<<16\mid$n\_info = $2 << 16 \mid 4$ = 0x00020004 + +n\_fmt\_samples = n\_fmt$<<24\mid$n\_sample = $5 << 24 \mid 3$ = 0x05000003 + + +\vspace{0.3cm} +\begin{tabular}{|l| l|} \hline +0x36000000 & l\_shared as little endian hex \\ \hline +0x2A000000 & l\_indiv as little endian hex \\ \hline +0x01000000 & CHROM offset is at 1 in 32 bit little endian \\ \hline +0x64000000 & POS in 0 base 32 bit little endian \\ \hline +0x01000000 & rlen = 1 (it's just a SNP) \\ \hline +0x41 0xF0 0xCC 0xCD & QUAL = 30.1 as 32-bit float \\ \hline +0x00020004 & n\_allele\_info \\ \hline +0x05000003 & n\_fmt\_samples \\ \hline +0x59 0x72 0x73 0x31 0x32 0x33 & ID \\ \hline +0x19 0x41 & REF A \\ \hline +0x19 0x43 & ALT C \\ \hline +0x11 0x00 & FILTER field PASS \\ \hline +0x11 0x50 0x11 0x01 & HM3 flag is present \\ \hline +0x11 0x51 & AC key \\ \hline +0x11 0x03 & with value of 3 \\ \hline +0x11 0x52 & AN key \\ \hline +0x11 0x06 & with value of 6 \\ \hline +0x11 0x51 & AA key \\ \hline +0x19 0x43 & with value of C \\ \hline +0x1101 0x21 0x020202040404 & GT \\ \hline +0x1102 0x11 0x0A0A0A & GQ \\ \hline +0x1103 0x11 0x203040 & DP \\ \hline +0x1104 0x21 0x300030200040 & AD \\ \hline +0x1105 0x31 0x000A640A0064640A00 & PL \\ \hline +\end{tabular} +\vspace{0.3cm} + +That's quite a lot of information encoded in only 96 bytes! + +\subsection{BCF2 block gzip and indexing} + +These raw binary records may be subsequently encoded into BGZF blocks following +the BGZF compression format, section 3 of the SAM format specification. +BCF2 records can be raw, though, in cases where the decoding/encoding costs of +bgzipping the data make it reasonable to process the data uncompressed, such as +streaming BCF2s through pipes with samtools and bcftools. Here the files +should be still compressed with BGZF but with compression 0. Note that +currently the GATK generates raw BCF2 files (not BGZF compression at all) but +this will change in the near future. + +BCF2 files are expected to be indexed through the same index scheme, +section~4 as BAM files and other block-compressed files with BGZF. + +\end{document} From 2d3f041f4d3530da5839167ac65f78fe9c0ea4fb Mon Sep 17 00:00:00 2001 From: Ryan Poplin Date: Fri, 1 Aug 2014 14:12:35 -0400 Subject: [PATCH 02/34] Adding subsection about gVCF files. --- VCFv4.3.tex | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/VCFv4.3.tex b/VCFv4.3.tex index fe9a8a85b..dc61ee02a 100644 --- a/VCFv4.3.tex +++ b/VCFv4.3.tex @@ -1034,6 +1034,27 @@ \subsubsection{Phasing adjacencies in an aneuploid context} Haplotype quality & 30 & 40 & 40 \\ \end{tabular} +\subsection{Representing unknown or unspecified alternate alleles} +Inspired by the gVCF file format (\url{https://support.basespace.illumina.com/knowledgebase/articles/147078-gvcf-file}), +it is necessary, primarily in clinical settings, to report sequencing data evidence for both variant and non-variant +positions in the genome. The convention adopted here is to represent reference evidence as likelihoods against an +unknown alternate allele. Think of this as the likelihood for reference as compared to any other possible alternate +allele (both SNP, indel, or otherwise). The symbolic alternate allele $<$NON_REF$>$ is used to represent this +unspecified alternate allele. + +Example records are given below: +\begin{flushleft} +\begin{tabular}{ l l l l l l l l l l } +\#CHROM & POS & ID & REF & ALT & QUAL & FILTER & INFO & FORMAT & Sample \\ +1 & 1004370 & . & G & & . & . & END=1004383 & GT:DP:GQ:MIN_DP:PL & 0/0:25:60:23:0,60,900 \\ +1 & 1004384 & . & C & & . & . & END=1004388 & GT:DP:GQ:MIN_DP:PL & 0/0:25:45:25:0,42,630 \\ +1 & 1004389 & . & T & TC, & 213.73 & . & . & GT:AD:DP:GQ:PL & 0/1:14,9,0:23:99:251,0,362,293,392,686 \\ +1 & 1004390 & . & C & & . & . & END=1004390 & GT:DP:GQ:MIN_DP:PL & 0/0:26:0:26:0,0,315 \\ +1 & 1004391 & . & C & & . & . & END=1004395 & GT:DP:GQ:MIN_DP:PL & 0/0:27:63:27:0,63,945 \\ +1 & 1004396 & . & G & C, & 0 & . & . & GT:AD:DP:GQ:P & 0/0:22,2,0:24:52:0,52,953,66,958,972 \\ +1 & 1004397 & . & T & & . & . & END=1004416 & GT:DP:GQ:MIN_DP:PL & 0/0:22:14:22:0,15,593 \\ +\end{tabular} +\end{flushleft} \pagebreak \section{BCF specification} From 4ca34244e043fdaeaa10e510a177e8ac4cb6a3e0 Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Mon, 11 Aug 2014 11:34:19 +0200 Subject: [PATCH 03/34] Updated gVCF section, added list of changes --- VCFv4.3.tex | 57 ++++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 45 insertions(+), 12 deletions(-) diff --git a/VCFv4.3.tex b/VCFv4.3.tex index dc61ee02a..830c531c4 100644 --- a/VCFv4.3.tex +++ b/VCFv4.3.tex @@ -1034,27 +1034,34 @@ \subsubsection{Phasing adjacencies in an aneuploid context} Haplotype quality & 30 & 40 & 40 \\ \end{tabular} -\subsection{Representing unknown or unspecified alternate alleles} -Inspired by the gVCF file format (\url{https://support.basespace.illumina.com/knowledgebase/articles/147078-gvcf-file}), -it is necessary, primarily in clinical settings, to report sequencing data evidence for both variant and non-variant -positions in the genome. The convention adopted here is to represent reference evidence as likelihoods against an +{\color{red} +\subsection{Representing unspecified alleles and REF-only blocks (gVCF)} +In order to report sequencing data evidence for both variant and non-variant +positions in the genome, the VCF specification allows to represent blocks of reference-only calls in a single record +using the END INFO tag, an idea originally introduced by the gVCF file format\footnote{\url{https://support.basespace.illumina.com/knowledgebase/articles/147078-gvcf-file}}. +The convention adopted here is to represent reference evidence as likelihoods against an unknown alternate allele. Think of this as the likelihood for reference as compared to any other possible alternate -allele (both SNP, indel, or otherwise). The symbolic alternate allele $<$NON_REF$>$ is used to represent this +allele (both SNP, indel, or otherwise). A symbolic alternate allele ($<$X$>$ in +samtools and $<$NON\_REF$>$ in GATK implementation) is used to represent this unspecified alternate allele. Example records are given below: +\scriptsize \begin{flushleft} \begin{tabular}{ l l l l l l l l l l } \#CHROM & POS & ID & REF & ALT & QUAL & FILTER & INFO & FORMAT & Sample \\ -1 & 1004370 & . & G & & . & . & END=1004383 & GT:DP:GQ:MIN_DP:PL & 0/0:25:60:23:0,60,900 \\ -1 & 1004384 & . & C & & . & . & END=1004388 & GT:DP:GQ:MIN_DP:PL & 0/0:25:45:25:0,42,630 \\ -1 & 1004389 & . & T & TC, & 213.73 & . & . & GT:AD:DP:GQ:PL & 0/1:14,9,0:23:99:251,0,362,293,392,686 \\ -1 & 1004390 & . & C & & . & . & END=1004390 & GT:DP:GQ:MIN_DP:PL & 0/0:26:0:26:0,0,315 \\ -1 & 1004391 & . & C & & . & . & END=1004395 & GT:DP:GQ:MIN_DP:PL & 0/0:27:63:27:0,63,945 \\ -1 & 1004396 & . & G & C, & 0 & . & . & GT:AD:DP:GQ:P & 0/0:22,2,0:24:52:0,52,953,66,958,972 \\ -1 & 1004397 & . & T & & . & . & END=1004416 & GT:DP:GQ:MIN_DP:PL & 0/0:22:14:22:0,15,593 \\ +1 & 4370 & . & G & $<$X$>$ & . & . & END=4383 & GT:DP:GQ:MIN\_DP:PL & 0/0:25:60:23:0,60,900 \\ +1 & 4384 & . & C & $<$X$>$ & . & . & END=4388 & GT:DP:GQ:MIN\_DP:PL & 0/0:25:45:25:0,42,630 \\ +1 & 4389 & . & T & TC,$<$X$>$ & 213.73 & . & . & GT:DP:GQ:PL & 0/1:23:99:51,0,36,93,92,86 \\ +1 & 4390 & . & C & $<$X$>$ & . & . & END=4390 & GT:DP:GQ:MIN\_DP:PL & 0/0:26:0:26:0,0,315 \\ +1 & 4391 & . & C & $<$X$>$ & . & . & END=4395 & GT:DP:GQ:MIN\_DP:PL & 0/0:27:63:27:0,63,945 \\ +1 & 4396 & . & G & C,$<$X$>$ & 0 & . & . & GT:DP:GQ:P & 0/0:24:52:0,52,95,66,95,97 \\ +1 & 4397 & . & T & $<$X$>$ & . & . & END=4416 & GT:DP:GQ:MIN\_DP:PL & 0/0:22:14:22:0,15,593 \\ \end{tabular} \end{flushleft} +\normalsize +} + \pagebreak \section{BCF specification} @@ -1647,4 +1654,30 @@ \subsection{BCF2 block gzip and indexing} BCF2 files are expected to be indexed through the same index scheme, section~4 as BAM files and other block-compressed files with BGZF. +{\color{red} +\section{List of changes} + +\subsection{Changes between VCFv4.2 and VCFv4.3} + +\begin{itemize} +\item VCF compliant implementations must support both LF and CR+LF newline conventions +\item INFO and FORMAT tag names must match the regular expression \texttt{\^{}[A-Za-z\_][0-9A-Za-z\_.]*\$} +\item Spaces are allowed in INFO field values +\item Characters with special meaning (such as ';' in INFO, ':' in FORMAT, and '\%' in both) can be encoded using URL encoding +\item Character encoding is declared by the \#\#fileencoding header line. Only encodings that are a superset of US-ASCII are allowed, the default encoding is UTF-8. Also sample names can be in UTF-8. +\item The SAMPLE field can contain optional DOI URL for the source data file +\item New reserved tag "CNP" analogous to "GP" was added. Both CNP and GP use 0 to 1 encoding, which is a change from previous phred-scaled GP. +\item In order for VCF and BCF to have the same expressive power, we state explicitly that Integers and Floats are 32-bit numbers. Integers are signed. +\item We state explicitly that zero length strings are not allowed, this includes the CHROM and ID column, INFO IDs, FILTER IDs and FORMAT IDs. Meta-information lines can be in any order, with the exception of \#\#fileformat which must come first. INFO, FILTER and FORMAT IDs must be unique within that type. +\item A section about gVCF was added +\end{itemize} + +\subsection{Changes between BCFv2.1 and BCFv2.2} +\begin{itemize} +\item BCF header lines can include optional IDX field +\item We introduce end-of-vector byte and reserve 8 values for future use +\item String vectors in BCF do not need to start with comma, as the number of values is indicated already in the definition of the tag in the header. +\end{itemize} +} + \end{document} From efed15f4562920f6257cce7bdad4c3f2b0cd2b50 Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Mon, 8 Sep 2014 20:19:30 +0200 Subject: [PATCH 04/34] Added a gVCF section plus minor edits --- VCFv4.3.tex | 30 +++++++++++++++++++++++++++--- 1 file changed, 27 insertions(+), 3 deletions(-) diff --git a/VCFv4.3.tex b/VCFv4.3.tex index 830c531c4..a3ab7fb06 100644 --- a/VCFv4.3.tex +++ b/VCFv4.3.tex @@ -31,7 +31,7 @@ \section{The VCF specification} line {\color{red}(prefixed with "\#")}, and data lines each containing information about a position in the genome and genotype information on samples for each position -{\color{red}(text fields separated by tabs). Zero length fields are not allowed, a dot (".") can +{\color{red}(text fields separated by tabs). Zero length fields are not allowed, a dot (".") should be used instead. In order to ensure interoperability across platforms, VCF compliant implementations must support both LF (\texttt{\textbackslash n}) and CR+LF (\texttt{\textbackslash r\textbackslash n}) newline conventions. @@ -106,7 +106,8 @@ \subsubsection{Information field format} \end{verbatim} Possible Types for INFO fields are: Integer {\color{red} (32-bit, signed)}, Float {\color{red} (32-bit)}, Flag, Character, and -String. The Number entry is an Integer that describes the number of values that +String. {\color{red} For the Integer type, the values from $-2^{31}$ to $-2^{31}+7$ cannot be stored in binary version, see \ref{BcfTypeEncoding}.} +The Number entry is an Integer that describes the number of values that can be included with the INFO field. For example, if the INFO field contains a single number, then this value should be $1$; if the INFO field describes a pair of numbers, then this value should be $2$ and so on. There are also @@ -288,6 +289,7 @@ \subsubsection{Genotype fields} \item MQ : RMS mapping quality, similar to the version in the INFO field. (Integer) \end{itemize} + If any of the fields is missing, it is replaced with the missing value. For example if the FORMAT is GT:GQ:DP:HQ then $0\mid0:.:23:23,34$ indicates that GQ is missing. Trailing fields can be dropped (with the exception of the GT field, which should always be present if specified in the FORMAT field). See below for additional genotype fields used to encode structural variants. Additional Genotype fields can be defined in the meta-information. However, software support for such fields is not guaranteed. @@ -301,6 +303,25 @@ \section{Understanding the VCF format and the haplotype representation} \end{itemize} VCF records use a simple haplotype representation for REF and ALT alleles to describe variant haplotypes at a locus. ALT haplotypes are constructed from the REF haplotype by taking the REF allele bases at the POS in the reference genotype and replacing them with the ALT bases. In essence, the VCF record specifies a-REF-t and the alternative haplotypes are a-ALT-t for each alternative allele. +{\color{red} +\subsection{VCF tag naming conventions} +\begin{itemize} + \item The "L" suffix means "likelihood" as log-likelihood in the sampling + distribution, log10 Pr(Data$|$Model). Likelihoods are represented as log10 + scale, so has to be negative (e.g.~GL, CNL). The likelihood can be also + represented in some cases as phred-scale in a separate tag (e.g.~PL). + + \item The "P" suffix means "probability" as linear-scale probability in the + posterior distribution, which is Pr(Model$|$Data). Examples are GP, CNP. + + \item The "Q" suffix means "quality" as log-complementary-phred-scale posterior + probability, which is -10 * log10 Pr(Data$|$Model) where the model is the most + likely genotype that appears in the GT field. Examples are GQ, CNQ. The fixed + site-level QUAL field follows the same convention (represented as a + phred-scaled number). +\end{itemize} +} + \section{INFO keys used for structural variants} When the INFO keys reserved for encoding structural variants are used for imprecise variants, the values should be best estimates. When a key reflects a property of a single alt allele (e.g. SVLEN), then when there are multiple alt alleles there will be multiple values for the key corresponding to each alelle (e.g. SVLEN=-100,-110 for a deletion with two distinct alt alleles). @@ -1154,7 +1175,8 @@ \subsubsection{Dictionary of strings} to explicitly define the position which is dropped on BCF-to-VCF conversion. If not present, the implicit relative position is assumed. If the IDX field is present in one record, it must be present also in all other dictionary-defining -records. +records. The IDX tag is not necessary in newly created BCF files, but if +present, the numbering must match the implicit dictionary of tags. } Note that the dictionary encoding has the magic prefix `s' here to indicate that the field's value is actually in the dictionary entry giving by the subsequent offset. This representation isn't actually the one used in BCF2 records but it provides a clean visual guide for the above example. Note also how the contig has been recoded as a offset into the list of contig declarations. @@ -1251,6 +1273,7 @@ \subsubsection{Genotype encoding} \subsubsection{Type encoding} +\label{BcfTypeEncoding} In BCF2 values are all strongly typed in the file. The type information is encoded in a prefix byte before the value, which contains information about the low-level type of the value(s) such as int32 or float, as well as the number of elements in the value. The encoding is as follows: @@ -1670,6 +1693,7 @@ \subsection{Changes between VCFv4.2 and VCFv4.3} \item In order for VCF and BCF to have the same expressive power, we state explicitly that Integers and Floats are 32-bit numbers. Integers are signed. \item We state explicitly that zero length strings are not allowed, this includes the CHROM and ID column, INFO IDs, FILTER IDs and FORMAT IDs. Meta-information lines can be in any order, with the exception of \#\#fileformat which must come first. INFO, FILTER and FORMAT IDs must be unique within that type. \item A section about gVCF was added +\item A section about tag naming conventions was added \end{itemize} \subsection{Changes between BCFv2.1 and BCFv2.2} From a09e56feac09c831a490d97b423aac5a78960650 Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Fri, 19 Sep 2014 16:04:32 +0200 Subject: [PATCH 05/34] Forbid duplicate fields, UTF8 the only encoding. --- VCFv4.3.tex | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/VCFv4.3.tex b/VCFv4.3.tex index a3ab7fb06..41d3300ff 100644 --- a/VCFv4.3.tex +++ b/VCFv4.3.tex @@ -89,13 +89,11 @@ \subsubsection{File format} {\color{red} \subsubsection{Character encoding} -Character encoding is declared by the `fileencoding' header line. Only -encodings that are a superset of US-ASCII are allowed, the default encoding is -UTF-8. - -\begin{verbatim} -##fileencoding=UTF-8 -\end{verbatim} +The character encoding of VCF files is UTF-8. UTF-8 is a multi-byte +character encoding that is a strict superset of 7-bit ASCII and has the +property that none of the bytes in any multi-byte characters are 7-bit ASCII +bytes. As a result, most software that processes VCF files does not have +to be aware of the possible presence of multi-byte UTF-8 characters. } \subsubsection{Information field format} @@ -182,8 +180,7 @@ \subsubsection{Contig field format} ##contig= \end{verbatim} \subsubsection{Sample field format} -It is possible to define sample to genome mappings as shown below:\footnote{\color{red}Some description of SAMPLE would be nice. What -is Genomes, Mixture, $k$?} +It is possible to define sample to genome mappings as shown below: \small {\color{red} \begin{verbatim} @@ -223,17 +220,17 @@ \subsubsection{Fixed fields} \begin{enumerate} \item CHROM - chromosome: An identifier from the reference genome or an angle-bracketed ID String (``$<$ID$>$'') pointing to a contig in the assembly file (cf. the \#\#assembly line in the header). All entries for a specific CHROM should form a contiguous block within the VCF file. The colon symbol (:) must be absent from all chromosome names to avoid parsing errors when dealing with breakends. (String, no white-space permitted, Required). \item POS - position: The reference position, with the 1st base having position 1. Positions are sorted numerically, in increasing order, within each reference sequence CHROM. It is permitted to have multiple records with the same POS. Telomeres are indicated by using positions 0 or N+1, where N is the length of the corresponding chromosome or contig. (Integer, Required) - \item ID - identifier: Semi-colon separated list of unique identifiers where available. If this is a dbSNP variant it is encouraged to use the rs number(s). No identifier should be present in more than one data record. If there is no identifier available, then the missing value should be used. (String, no white-space or semi-colons permitted) + \item ID - identifier: Semi-colon separated list of unique identifiers where available. If this is a dbSNP variant it is encouraged to use the rs number(s). No identifier should be present in more than one data record. If there is no identifier available, then the missing value should be used. (String, no white-space or semi-colons permitted, {\color{red}duplicate values not allowed.}) \item REF - reference base(s): Each base must be one of A,C,G,T,N (case insensitive). Multiple bases are permitted. The value in the POS field refers to the position of the first base in the String. For simple insertions and deletions in which either the REF or one of the ALT alleles would otherwise be null/empty, the REF and ALT Strings must include the base before the event (which must be reflected in the POS field), unless the event occurs at position 1 on the contig in which case it must include the base after the event; this padding base is not required (although it is permitted) for e.g. complex substitutions or other events where all alleles have at least one base represented in their Strings. If any of the ALT alleles is a symbolic allele (an angle-bracketed ID String ``$<$ID$>$'') then the padding base is required and POS denotes the coordinate of the base preceding the polymorphism. Tools processing VCF files are not required to preserve case in the allele Strings. (String, Required). \item ALT - alternate base(s): Comma separated list of alternate non-reference alleles called on at least one of the samples. Options are base Strings made up of the bases A,C,G,T,N,*, (case insensitive) or an angle-bracketed ID String (``$<$ID$>$'') or a breakend replacement string as described in the section on breakends. The `*' allele is reserved to indicate that the allele is missing due to a upstream deletion. If there are no alternative alleles, then the missing value should be used. Tools processing VCF files are not required to preserve case in the allele String, except for IDs, which are case sensitive. (String; no whitespace, commas, or angle-brackets are permitted in the ID String itself) \item QUAL - quality: Phred-scaled quality score for the assertion made in ALT. i.e. $-10log_{10}$ prob(call in ALT is wrong). If ALT is `.' (no variant) then this is $-10log_{10}$ prob(variant), and if ALT is not `.' this is $-10log_{10}$ prob(no variant). If unknown, the missing value should be specified. (Float) - \item FILTER - filter status: PASS if this position has passed all filters, i.e. a call is made at this position. Otherwise, if the site has not passed all filters, a semicolon-separated list of codes for filters that fail. e.g. ``q10;s50'' might indicate that at this site the quality is below 10 and the number of samples with data is below 50\% of the total number of samples. `0' is reserved and should not be used as a filter String. If filters have not been applied, then this field should be set to the missing value. (String, no white-space or semi-colons permitted) + \item FILTER - filter status: PASS if this position has passed all filters, i.e. a call is made at this position. Otherwise, if the site has not passed all filters, a semicolon-separated list of codes for filters that fail. e.g. ``q10;s50'' might indicate that at this site the quality is below 10 and the number of samples with data is below 50\% of the total number of samples. `0' is reserved and should not be used as a filter String. If filters have not been applied, then this field should be set to the missing value. (String, no white-space or semi-colons permitted, {\color{red}duplicate values not allowed.}) \item INFO - additional information: (String, no semi-colons or equals-signs permitted; commas are permitted only as delimiters for lists of values; characters with special meaning can be encoded using URL encoding; space characters are allowed) INFO fields are encoded as a semicolon-separated series of short keys with optional values in the format: $<$key$>$=$<$data$>$[,data]. - {\color{red} INFO keys must match the regular expression \texttt{\^{}[A-Za-z\_][0-9A-Za-z\_.]*\$}.} + {\color{red} INFO keys must match the regular expression \texttt{\^{}[A-Za-z\_][0-9A-Za-z\_.]*\$}, duplicate fields are not allowed.} Arbitrary keys are permitted, although the following sub-fields are reserved (albeit optional): \begin{itemize} \item AA : ancestral allele @@ -261,7 +258,7 @@ \subsubsection{Fixed fields} \subsubsection{Genotype fields} If genotype information is present, then the same types of data must be present for all samples. First a FORMAT field is given specifying the data types and -order ({\color{red} colon-separated FORMAT ids matching the regular expression \texttt{\^{}[A-Za-z\_][0-9A-Za-z\_.]*\$}}). This is followed by one data block per +order ({\color{red} colon-separated FORMAT ids matching the regular expression \texttt{\^{}[A-Za-z\_][0-9A-Za-z\_.]*\$}, duplicate fields are not allowed}). This is followed by one data block per sample, with the colon-separated data corresponding to the types specified in the format. The first sub-field must always be the genotype (GT) if it is present. There are no required sub-fields. @@ -1687,11 +1684,12 @@ \subsection{Changes between VCFv4.2 and VCFv4.3} \item INFO and FORMAT tag names must match the regular expression \texttt{\^{}[A-Za-z\_][0-9A-Za-z\_.]*\$} \item Spaces are allowed in INFO field values \item Characters with special meaning (such as ';' in INFO, ':' in FORMAT, and '\%' in both) can be encoded using URL encoding -\item Character encoding is declared by the \#\#fileencoding header line. Only encodings that are a superset of US-ASCII are allowed, the default encoding is UTF-8. Also sample names can be in UTF-8. +\item The character encoding of VCF files is UTF-8. \item The SAMPLE field can contain optional DOI URL for the source data file \item New reserved tag "CNP" analogous to "GP" was added. Both CNP and GP use 0 to 1 encoding, which is a change from previous phred-scaled GP. \item In order for VCF and BCF to have the same expressive power, we state explicitly that Integers and Floats are 32-bit numbers. Integers are signed. \item We state explicitly that zero length strings are not allowed, this includes the CHROM and ID column, INFO IDs, FILTER IDs and FORMAT IDs. Meta-information lines can be in any order, with the exception of \#\#fileformat which must come first. INFO, FILTER and FORMAT IDs must be unique within that type. +\item We state explicitly that duplicate IDs, FILTER, INFO or FORMAT keys are not valid. \item A section about gVCF was added \item A section about tag naming conventions was added \end{itemize} From af36fb16692d59f7e95b6c636cf190911f111b5e Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Tue, 23 Sep 2014 17:55:26 +0200 Subject: [PATCH 06/34] Removed forgotten copy and paste error, resolves #41 --- VCFv4.3.tex | 2 -- 1 file changed, 2 deletions(-) diff --git a/VCFv4.3.tex b/VCFv4.3.tex index 41d3300ff..cd340ba9b 100644 --- a/VCFv4.3.tex +++ b/VCFv4.3.tex @@ -321,8 +321,6 @@ \subsection{VCF tag naming conventions} \section{INFO keys used for structural variants} -When the INFO keys reserved for encoding structural variants are used for imprecise variants, the values should be best estimates. When a key reflects a property of a single alt allele (e.g. SVLEN), then when there are multiple alt alleles there will be multiple values for the key corresponding to each alelle (e.g. SVLEN=-100,-110 for a deletion with two distinct alt alleles). - The following INFO keys are reserved for encoding structural variants. In general, when these keys are used by imprecise variants, the values should be best estimates. When a key reflects a property of a single alt allele (e.g. SVLEN), then when there are multiple alt alleles there will be multiple values for the key corresponding to each alelle (e.g. SVLEN=-100,-110 for a deletion with two distinct alt alleles). \footnotesize \begin{verbatim} From 4a917457cb9be262bbe22634a90ece2e0dd08f48 Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Tue, 23 Sep 2014 18:08:57 +0200 Subject: [PATCH 07/34] Introduced symbolic allele "<*>" in gVCF --- VCFv4.3.tex | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/VCFv4.3.tex b/VCFv4.3.tex index cd340ba9b..0e64eb6e7 100644 --- a/VCFv4.3.tex +++ b/VCFv4.3.tex @@ -1057,22 +1057,21 @@ \subsection{Representing unspecified alleles and REF-only blocks (gVCF)} using the END INFO tag, an idea originally introduced by the gVCF file format\footnote{\url{https://support.basespace.illumina.com/knowledgebase/articles/147078-gvcf-file}}. The convention adopted here is to represent reference evidence as likelihoods against an unknown alternate allele. Think of this as the likelihood for reference as compared to any other possible alternate -allele (both SNP, indel, or otherwise). A symbolic alternate allele ($<$X$>$ in -samtools and $<$NON\_REF$>$ in GATK implementation) is used to represent this -unspecified alternate allele. +allele (both SNP, indel, or otherwise). A symbolic alternate allele $<$*$>$ +is used to represent this unspecified alternate allele. Example records are given below: \scriptsize \begin{flushleft} \begin{tabular}{ l l l l l l l l l l } \#CHROM & POS & ID & REF & ALT & QUAL & FILTER & INFO & FORMAT & Sample \\ -1 & 4370 & . & G & $<$X$>$ & . & . & END=4383 & GT:DP:GQ:MIN\_DP:PL & 0/0:25:60:23:0,60,900 \\ -1 & 4384 & . & C & $<$X$>$ & . & . & END=4388 & GT:DP:GQ:MIN\_DP:PL & 0/0:25:45:25:0,42,630 \\ -1 & 4389 & . & T & TC,$<$X$>$ & 213.73 & . & . & GT:DP:GQ:PL & 0/1:23:99:51,0,36,93,92,86 \\ -1 & 4390 & . & C & $<$X$>$ & . & . & END=4390 & GT:DP:GQ:MIN\_DP:PL & 0/0:26:0:26:0,0,315 \\ -1 & 4391 & . & C & $<$X$>$ & . & . & END=4395 & GT:DP:GQ:MIN\_DP:PL & 0/0:27:63:27:0,63,945 \\ -1 & 4396 & . & G & C,$<$X$>$ & 0 & . & . & GT:DP:GQ:P & 0/0:24:52:0,52,95,66,95,97 \\ -1 & 4397 & . & T & $<$X$>$ & . & . & END=4416 & GT:DP:GQ:MIN\_DP:PL & 0/0:22:14:22:0,15,593 \\ +1 & 4370 & . & G & $<$*$>$ & . & . & END=4383 & GT:DP:GQ:MIN\_DP:PL & 0/0:25:60:23:0,60,900 \\ +1 & 4384 & . & C & $<$*$>$ & . & . & END=4388 & GT:DP:GQ:MIN\_DP:PL & 0/0:25:45:25:0,42,630 \\ +1 & 4389 & . & T & TC,$<$*$>$ & 213.73 & . & . & GT:DP:GQ:PL & 0/1:23:99:51,0,36,93,92,86 \\ +1 & 4390 & . & C & $<$*$>$ & . & . & END=4390 & GT:DP:GQ:MIN\_DP:PL & 0/0:26:0:26:0,0,315 \\ +1 & 4391 & . & C & $<$*$>$ & . & . & END=4395 & GT:DP:GQ:MIN\_DP:PL & 0/0:27:63:27:0,63,945 \\ +1 & 4396 & . & G & C,$<$*$>$ & 0 & . & . & GT:DP:GQ:P & 0/0:24:52:0,52,95,66,95,97 \\ +1 & 4397 & . & T & $<$*$>$ & . & . & END=4416 & GT:DP:GQ:MIN\_DP:PL & 0/0:22:14:22:0,15,593 \\ \end{tabular} \end{flushleft} \normalsize @@ -1688,7 +1687,7 @@ \subsection{Changes between VCFv4.2 and VCFv4.3} \item In order for VCF and BCF to have the same expressive power, we state explicitly that Integers and Floats are 32-bit numbers. Integers are signed. \item We state explicitly that zero length strings are not allowed, this includes the CHROM and ID column, INFO IDs, FILTER IDs and FORMAT IDs. Meta-information lines can be in any order, with the exception of \#\#fileformat which must come first. INFO, FILTER and FORMAT IDs must be unique within that type. \item We state explicitly that duplicate IDs, FILTER, INFO or FORMAT keys are not valid. -\item A section about gVCF was added +\item A section about gVCF was added, introduced the $<$*$>$ symbolic allele. \item A section about tag naming conventions was added \end{itemize} From 7789a1bfb3bf3f6798b60b2d5f4c4704d61b7c0e Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Wed, 1 Oct 2014 15:59:43 +0200 Subject: [PATCH 08/34] Add ##META definitions --- VCFv4.3.tex | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/VCFv4.3.tex b/VCFv4.3.tex index 0e64eb6e7..2b80903fd 100644 --- a/VCFv4.3.tex +++ b/VCFv4.3.tex @@ -181,13 +181,18 @@ \subsubsection{Contig field format} \end{verbatim} \subsubsection{Sample field format} It is possible to define sample to genome mappings as shown below: -\small {\color{red} +\scriptsize \begin{verbatim} -##SAMPLE= +##META= +##META= +##META= +##META= +##SAMPLE= +##SAMPLE= \end{verbatim} } -\normalsize + \subsubsection{Pedigree field format} It is possible to record relationships between genomes using the following syntax: \begin{verbatim} From 250a88c0a6f60afa02e447c00243e492aaeacc1e Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Thu, 2 Oct 2014 09:25:33 +0200 Subject: [PATCH 09/34] Include META header lines in the list of changes --- VCFv4.3.tex | 1 + 1 file changed, 1 insertion(+) diff --git a/VCFv4.3.tex b/VCFv4.3.tex index 2b80903fd..9a9d5ca09 100644 --- a/VCFv4.3.tex +++ b/VCFv4.3.tex @@ -1688,6 +1688,7 @@ \subsection{Changes between VCFv4.2 and VCFv4.3} \item Characters with special meaning (such as ';' in INFO, ':' in FORMAT, and '\%' in both) can be encoded using URL encoding \item The character encoding of VCF files is UTF-8. \item The SAMPLE field can contain optional DOI URL for the source data file +\item Introduced \#\#META header lines for defining phenotype metadata \item New reserved tag "CNP" analogous to "GP" was added. Both CNP and GP use 0 to 1 encoding, which is a change from previous phred-scaled GP. \item In order for VCF and BCF to have the same expressive power, we state explicitly that Integers and Floats are 32-bit numbers. Integers are signed. \item We state explicitly that zero length strings are not allowed, this includes the CHROM and ID column, INFO IDs, FILTER IDs and FORMAT IDs. Meta-information lines can be in any order, with the exception of \#\#fileformat which must come first. INFO, FILTER and FORMAT IDs must be unique within that type. From 1080e8a15d65f4a2b468c104243775cf7996428c Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Wed, 15 Oct 2014 14:52:11 +0200 Subject: [PATCH 10/34] Clarification: no negative values in GT array except the end-of-vector byte --- VCFv4.3.tex | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/VCFv4.3.tex b/VCFv4.3.tex index 9a9d5ca09..1670579f7 100644 --- a/VCFv4.3.tex +++ b/VCFv4.3.tex @@ -1447,7 +1447,16 @@ \subsubsection{Type encoding} Note that this means that it's illegal to encode a vector VCF field with missing values; the BCF2 codec should signal an error in this case. \vspace{0.3cm} -A \textbf{Genotype (GT) field} is encoded in a typed integer vector (can be 8, 16, or even 32 bit if necessary) with the number of elements equal to the maximum ploidy among all samples at a site. For one individual, each integer in the vector is organized as $(allele+1) << 1 \mid phased$ where allele is set to -1 if the allele in GT is a dot `.' (thus the higher bits are all 0). The vector is padded with missing values if the GT having fewer ploidy. +A \textbf{Genotype (GT) field} is encoded in a typed integer vector (can be 8, +16, or even 32 bit if necessary) with the number of elements equal to the +maximum ploidy among all samples at a site. For one individual, each integer +in the vector is organized as $(allele+1) << 1 \mid phased$ where allele is set +to -1 if the allele in GT is a dot `.' (thus the higher bits are all 0). +{\color{red} +The vector is padded with the end-of-vector values if the GT having fewer ploidy. +We note specifically that except for the end-of-vector byte, no other negative +values are allowed in the GT array. +} Examples: @@ -1701,6 +1710,7 @@ \subsection{Changes between BCFv2.1 and BCFv2.2} \begin{itemize} \item BCF header lines can include optional IDX field \item We introduce end-of-vector byte and reserve 8 values for future use +\item Clarified that except the end-of-vector byte, no other negative values are allowed in the GT array \item String vectors in BCF do not need to start with comma, as the number of values is indicated already in the definition of the tag in the header. \end{itemize} } From 66d45d5d1fd929f627717f0efe71b3db5d6ddc2d Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Mon, 5 Jan 2015 11:59:01 +0100 Subject: [PATCH 11/34] Clarified GL ordering. Resolves #58 --- VCFv4.3.tex | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/VCFv4.3.tex b/VCFv4.3.tex index 1670579f7..afd7ec009 100644 --- a/VCFv4.3.tex +++ b/VCFv4.3.tex @@ -5,6 +5,12 @@ \usepackage[margin=0.75in]{geometry} \usepackage[pdfborder={0 0 0}]{hyperref} +\usepackage{listings} +\lstset{ + basicstyle=\ttfamily, + mathescape +} + \usepackage{color} \renewcommand{\thefootnote}{\color{red}\fnsymbol{footnote}} @@ -279,7 +285,25 @@ \subsubsection{Genotype fields} \end{itemize} \item DP : read depth at this position for this sample (Integer) \item FT : sample genotype filter indicating if this genotype was ``called'' (similar in concept to the FILTER field). Again, use PASS to indicate that all filters have been passed, a semi-colon separated list of codes for filters that fail, or `.' to indicate that filters have not been applied. These values should be described in the meta-information in the same way as FILTERs (String, no white-space or semi-colons permitted) - \item GL : genotype likelihoods comprised of comma separated floating point $log_{10}$-scaled likelihoods for all possible genotypes given the set of alleles defined in the REF and ALT fields. In presence of the GT field the same ploidy is expected and the canonical order is used; without GT field, diploidy is assumed. If A is the allele in REF and B,C,... are the alleles as ordered in ALT, the ordering of genotypes for the likelihoods is given by: F(j/k) = (k*(k+1)/2)+j. In other words, for biallelic sites the ordering is: AA,AB,BB; for triallelic sites the ordering is: AA,AB,BB,AC,BC,CC, etc. For example: GT:GL 0/1:-323.03,-99.29,-802.53 (Floats) + \item GL : genotype likelihoods comprised of comma separated floating point + $log_{10}$-scaled likelihoods for all possible genotypes given the set of + alleles defined in the REF and ALT fields. {\color{red} In presence of the GT field the + same ploidy is expected; without GT field, diploidy is assumed. + If A is the allele in REF and B,C,... are the alleles as ordered in ALT, the + ordering of genotypes for the likelihoods is + AA,AB,BB for biallelic sites; AA,AB,BB,AC,BC,CC for triallelic sites, etc. + + In general case of ploidy P and N alternate alleles (0 is the REF and 1..N + the alternate alleles), the ordering of genotypes for the likelihoods can + be expressed by the following pseudocode with variable number of nested loops: + \begin{lstlisting} + for $a_1 = 0\ldots N$ + $\ldots$ + for $a_P = 0\ldots a_{P-1}$ + println $a_1 a_2 \ldots a_P$ + \end{lstlisting}} + + For example: GT:GL 0/1:-323.03,-99.29,-802.53 (Floats) \item GLE : genotype likelihoods of heterogeneous ploidy, used in presence of uncertain copy number. For example: GLE=0:-75.22,1:-223.42,0/0:-323.03,1/0:-99.29,1/1:-802.53 (String) \item PL : the phred-scaled genotype likelihoods rounded to the closest integer (and otherwise defined precisely as the GL field) (Integers) \item GP : {\color{red} genotype posterior probabilities in the range 0 to 1 using the same ordering as the GL field; one use can be to store imputed genotype probabilities (Float)} From aabe553ef5a3d8265aabff9f6a521ef3172b7d8e Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Thu, 30 Apr 2015 14:26:37 +0200 Subject: [PATCH 12/34] Fix #82 --- VCFv4.3.tex | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/VCFv4.3.tex b/VCFv4.3.tex index afd7ec009..1d3cfe887 100644 --- a/VCFv4.3.tex +++ b/VCFv4.3.tex @@ -642,7 +642,7 @@ \subsection{Encoding Structural Variants} The example shows in order: \begin{enumerate} \item A precise deletion with known breakpoint, a one base micro-homology, and a sample that is homozygous for the deletion. - \item An imprecise deletion of approximately 105 bp. + \item An imprecise deletion of approximately 205 bp. \item An imprecise deletion of an ALU element relative to the reference. \item An imprecise insertion of an L1 element relative to the reference. \item An imprecise duplication of approximately 21Kb. The sample genotype is copy number 3 (one extra copy of the duplicated sequence). From 61d0f0123e7c2cfde7fcb50f9b2cc390f5c358df Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Thu, 30 Apr 2015 14:32:27 +0200 Subject: [PATCH 13/34] Fix #82 --- VCFv4.2.tex | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/VCFv4.2.tex b/VCFv4.2.tex index b13643d25..8718ccd1c 100644 --- a/VCFv4.2.tex +++ b/VCFv4.2.tex @@ -529,7 +529,7 @@ \subsection{Encoding Structural Variants} The example shows in order: \begin{enumerate} \item A precise deletion with known breakpoint, a one base micro-homology, and a sample that is homozygous for the deletion. - \item An imprecise deletion of approximately 105 bp. + \item An imprecise deletion of approximately 205 bp. \item An imprecise deletion of an ALU element relative to the reference. \item An imprecise insertion of an L1 element relative to the reference. \item An imprecise duplication of approximately 21Kb. The sample genotype is copy number 3 (one extra copy of the duplicated sequence). From 672b512aa0b2ced39fb47ffa1b452732093c3f45 Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Tue, 19 May 2015 14:27:57 +0200 Subject: [PATCH 14/34] Fix SVLEN example, resolves #84 --- VCFv4.1.tex | 2 +- VCFv4.2.tex | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/VCFv4.1.tex b/VCFv4.1.tex index 7fec9584b..c5a7d25fd 100644 --- a/VCFv4.1.tex +++ b/VCFv4.1.tex @@ -498,7 +498,7 @@ \subsection{Encoding Structural Variants} ##FORMAT= ##FORMAT= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 -1 2827694 rs2376870 CGTGGATGCGGGGAC C . PASS SVTYPE=DEL;END=2827762;HOMLEN=1;HOMSEQ=G;SVLEN=-68 GT:GQ 1/1:13.9 +1 2827694 rs2376870 CGTGGATGCGGGGAC C . PASS SVTYPE=DEL;END=2827708;HOMLEN=1;HOMSEQ=G;SVLEN=-14 GT:GQ 1/1:13.9 2 321682 . T 6 PASS SVTYPE=DEL;END=321887;SVLEN=-205;CIPOS=-56,20;CIEND=-10,62 GT:GQ 0/1:12 2 14477084 . C 12 PASS SVTYPE=DEL;END=14477381;SVLEN=-297;CIPOS=-22,18;CIEND=-12,32 GT:GQ 0/1:12 3 9425916 . C 23 PASS SVTYPE=INS;END=9425916;SVLEN=6027;CIPOS=-16,22 GT:GQ 1/1:15 diff --git a/VCFv4.2.tex b/VCFv4.2.tex index 8718ccd1c..807bb64c8 100644 --- a/VCFv4.2.tex +++ b/VCFv4.2.tex @@ -515,7 +515,7 @@ \subsection{Encoding Structural Variants} ##FORMAT= ##FORMAT= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 -1 2827694 rs2376870 CGTGGATGCGGGGAC C . PASS SVTYPE=DEL;END=2827762;HOMLEN=1;HOMSEQ=G;SVLEN=-68 GT:GQ 1/1:13.9 +1 2827694 rs2376870 CGTGGATGCGGGGAC C . PASS SVTYPE=DEL;END=2827708;HOMLEN=1;HOMSEQ=G;SVLEN=-14 GT:GQ 1/1:13.9 2 321682 . T 6 PASS SVTYPE=DEL;END=321887;SVLEN=-205;CIPOS=-56,20;CIEND=-10,62 GT:GQ 0/1:12 2 14477084 . C 12 PASS SVTYPE=DEL;END=14477381;SVLEN=-297;CIPOS=-22,18;CIEND=-12,32 GT:GQ 0/1:12 3 9425916 . C 23 PASS SVTYPE=INS;END=9425916;SVLEN=6027;CIPOS=-16,22 GT:GQ 1/1:15 From 73482c0624395625f4f1dd47e8c1308f04f49571 Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Tue, 19 May 2015 14:29:09 +0200 Subject: [PATCH 15/34] Fix SVLEN examplein VCFv4.3, resolves #84 --- VCFv4.3.tex | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/VCFv4.3.tex b/VCFv4.3.tex index 1d3cfe887..4996fa51d 100644 --- a/VCFv4.3.tex +++ b/VCFv4.3.tex @@ -628,7 +628,7 @@ \subsection{Encoding Structural Variants} ##FORMAT= ##FORMAT= #CHROM POS ID REF ALT QUAL FILTER INFO FORMAT NA00001 -1 2827694 rs2376870 CGTGGATGCGGGGAC C . PASS SVTYPE=DEL;END=2827762;HOMLEN=1;HOMSEQ=G;SVLEN=-68 GT:GQ 1/1:13.9 +1 2827694 rs2376870 CGTGGATGCGGGGAC C . PASS SVTYPE=DEL;END=2827708;HOMLEN=1;HOMSEQ=G;SVLEN=-14 GT:GQ 1/1:13.9 2 321682 . T 6 PASS SVTYPE=DEL;END=321887;SVLEN=-205;CIPOS=-56,20;CIEND=-10,62 GT:GQ 0/1:12 2 14477084 . C 12 PASS SVTYPE=DEL;END=14477381;SVLEN=-297;CIPOS=-22,18;CIEND=-12,32 GT:GQ 0/1:12 3 9425916 . C 23 PASS SVTYPE=INS;END=9425916;SVLEN=6027;CIPOS=-16,22 GT:GQ 1/1:15 From 8f6b7cf14f94eda288f6f436f87da9842593e0d8 Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Tue, 19 May 2015 14:38:09 +0200 Subject: [PATCH 16/34] Update minor version number, resolves #63 --- VCFv4.3.tex | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/VCFv4.3.tex b/VCFv4.3.tex index 4996fa51d..e6cc9dbfe 100644 --- a/VCFv4.3.tex +++ b/VCFv4.3.tex @@ -1151,7 +1151,7 @@ \subsection{Header} The BCF2 header begins with the ``BCF2 magic'' 5 bytes that encode {\tt BCF\em XY} where {\em X} and {\em Y} are bytes indicating the major -number (currently 2) and the minor number (currently 1). This magic can be +number (currently 2) and the minor number (currently 2). This magic can be used to quickly examine the file to determine that it's a BCF2 file. Immediately following the BCF2 magic is the standard VCF header lines in text format, beginning with \verb|##fileformat=VCFvX.Y|. Because the type is From f247cee92ff161a02fa3871433ae38ffb24c30d0 Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Tue, 2 Jun 2015 10:18:42 +0200 Subject: [PATCH 17/34] List of changes: - meta-information lines must be key=value pairs (#67) - an ID attribute is required in structured header lines, unique within its type - the above point newly requires ID in the reserved PEDIGREE tag - new reserved AD, ADF, and ADR FORMAT and INFO fields added, resolves #78 - reorder list of INFO and FORMAT tags alphabetically - removed UNICODE-characters-not-supported sentence from BCF specification, in partial response to #65 --- VCFv4.3.tex | 60 +++++++++++++++++++++++++++++++++-------------------- 1 file changed, 38 insertions(+), 22 deletions(-) diff --git a/VCFv4.3.tex b/VCFv4.3.tex index e6cc9dbfe..45dd85b7d 100644 --- a/VCFv4.3.tex +++ b/VCFv4.3.tex @@ -75,16 +75,20 @@ \subsection{An example} This example shows (in order): a good simple SNP, a possible SNP that has been filtered out because its quality is below 10, a site at which two alternate alleles are called, with one of them (T) being ancestral (possibly a reference sequencing error), a site that is called monomorphic reference (i.e. with no alternate alleles), and a microsatellite with two alternative alleles, one a deletion of 2 bases (TC), and the other an insertion of one base (T). Genotype data are given for three samples, two of which are phased and the third unphased, with per sample genotype quality, depth and haplotype qualities (the latter only for the phased samples) given as well as the genotypes. The microsatellite calls are unphased. \subsection{Meta-information lines} + File meta-information is included after the \#\# string and must be key=value -pairs. It is strongly encouraged that information lines describing the -\texttt{INFO}, \texttt{FILTER}, \texttt{FORMAT} and -{\color{red}\texttt{contig}} entries used in the body of the VCF file be -included in the meta-information section. Although they are optional, if these -lines are present then they must be completely well-formed {\color{red} and -their ID must be unique within their type. +pairs. Meta-information lines are optional, but if they are present then +they must be completely well-formed. {\color{red} Note that BCF, the binary +counterpart of VCF, requires that all entries are present. It is strongly +encouraged to include meta-information lines describing the entries used in the +body of the VCF file. + +All structured lines that have their value enclosed within "$<>$" require an ID +which must be unique within their type. + Meta-information lines can be in any order with the exception of `fileformat` -which must come first. Note that BCF, the binary counterpart of VCF, requires -that all entries are present.} +which must come first.} + \subsubsection{File format} A single `fileformat' field is always required, must be the first line in the file, and details the VCF format version number. For example, for VCF version 4.2, this line should read: @@ -95,6 +99,7 @@ \subsubsection{File format} {\color{red} \subsubsection{Character encoding} +\label{character-encoding} The character encoding of VCF files is UTF-8. UTF-8 is a multi-byte character encoding that is a strict superset of 7-bit ASCII and has the property that none of the bytes in any multi-byte characters are 7-bit ASCII @@ -201,13 +206,18 @@ \subsubsection{Sample field format} \subsubsection{Pedigree field format} It is possible to record relationships between genomes using the following syntax: +{\color{red} \begin{verbatim} -##PEDIGREE= +##PEDIGREE= \end{verbatim} -or a link to a database: +} +\noindent or a link to a database: +{\color{red} \begin{verbatim} -##pedigreeDB= +##pedigreeDB=URL \end{verbatim} +} + \subsection{Header line syntax} The header line names the 8 fixed, mandatory columns. These columns are as follows: @@ -246,6 +256,7 @@ \subsubsection{Fixed fields} \begin{itemize} \item AA : ancestral allele \item AC : allele count in genotypes, for each ALT allele, in the same order as listed + \item {\color{red}AD, ADF, ADR: read depths for each allele; total (AD), on the forward (ADF) and the reverse (ADR) strand (Integer, Number=R)} \item AF : allele frequency for each ALT allele in the same order as listed: use this when estimated from primary data, not called genotypes \item AN : total number of alleles in called genotypes \item BQ : RMS base quality at this position @@ -278,13 +289,17 @@ \subsubsection{Genotype fields} \begin{itemize} \renewcommand{\labelitemii}{$\circ$} + \item {\color{red}AD, ADF, ADR: per-sample read depths for each allele; total (AD), on the forward (ADF) and the reverse (ADR) strand (Integer, Number=R)} + \item DP : read depth at this position for this sample (Integer) + \item EC : comma separated list of expected alternate allele counts for each alternate allele in the same order as listed in the ALT field (typically used in association analyses) (Integers) + \item FT : sample genotype filter indicating if this genotype was ``called'' (similar in concept to the FILTER field). Again, use PASS to indicate that all filters have been passed, a semi-colon separated list of codes for filters that fail, or `.' to indicate that filters have not been applied. These values should be described in the meta-information in the same way as FILTERs (String, no white-space or semi-colons permitted) + \item GQ : conditional genotype quality, encoded as a phred quality $-10log_{10}$ p(genotype call is wrong, conditioned on the site's being variant) (Integer) + \item GP : {\color{red} genotype posterior probabilities in the range 0 to 1 using the same ordering as the GL field; one use can be to store imputed genotype probabilities (Float)} \item GT : genotype, encoded as allele values separated by either of $/$ or $\mid$. The allele values are 0 for the reference allele (what is in the REF field), 1 for the first allele listed in ALT, 2 for the second allele list in ALT and so on. For diploid calls examples could be $0/1$, $1\mid0$, or $1/2$, etc. For haploid calls, e.g. on Y, male non-pseudoautosomal X, or mitochondrion, only one allele value should be given; a triploid call might look like $0/0/1$. If a call cannot be made for a sample at a given locus, `.' should be specified for each missing allele in the GT field (for example `$./.$' for a diploid genotype and `.' for haploid genotype). The meanings of the separators are as follows (see the PS field below for more details on incorporating phasing information into the genotypes): \begin{itemize} \item $/$ : genotype unphased \item $\mid$ : genotype phased \end{itemize} - \item DP : read depth at this position for this sample (Integer) - \item FT : sample genotype filter indicating if this genotype was ``called'' (similar in concept to the FILTER field). Again, use PASS to indicate that all filters have been passed, a semi-colon separated list of codes for filters that fail, or `.' to indicate that filters have not been applied. These values should be described in the meta-information in the same way as FILTERs (String, no white-space or semi-colons permitted) \item GL : genotype likelihoods comprised of comma separated floating point $log_{10}$-scaled likelihoods for all possible genotypes given the set of alleles defined in the REF and ALT fields. {\color{red} In presence of the GT field the @@ -305,14 +320,11 @@ \subsubsection{Genotype fields} For example: GT:GL 0/1:-323.03,-99.29,-802.53 (Floats) \item GLE : genotype likelihoods of heterogeneous ploidy, used in presence of uncertain copy number. For example: GLE=0:-75.22,1:-223.42,0/0:-323.03,1/0:-99.29,1/1:-802.53 (String) - \item PL : the phred-scaled genotype likelihoods rounded to the closest integer (and otherwise defined precisely as the GL field) (Integers) - \item GP : {\color{red} genotype posterior probabilities in the range 0 to 1 using the same ordering as the GL field; one use can be to store imputed genotype probabilities (Float)} - \item GQ : conditional genotype quality, encoded as a phred quality $-10log_{10}$ p(genotype call is wrong, conditioned on the site's being variant) (Integer) \item HQ : haplotype qualities, two comma separated phred qualities (Integers) - \item PS : phase set. A phase set is defined as a set of phased genotypes to which this genotype belongs. Phased genotypes for an individual that are on the same chromosome and have the same PS value are in the same phased set. A phase set specifies multi-marker haplotypes for the phased genotypes in the set. All phased genotypes that do not contain a PS subfield are assumed to belong to the same phased set. If the genotype in the GT field is unphased, the corresponding PS field is ignored. The recommended convention is to use the position of the first variant in the set as the PS identifier (although this is not required). (Non-negative 32-bit Integer) - \item PQ : phasing quality, the phred-scaled probability that alleles are ordered incorrectly in a heterozygote (against all other members in the phase set). We note that we have not yet included the specific measure for precisely defining ``phasing quality''; our intention for now is simply to reserve the PQ tag for future use as a measure of phasing quality. (Integer) - \item EC : comma separated list of expected alternate allele counts for each alternate allele in the same order as listed in the ALT field (typically used in association analyses) (Integers) \item MQ : RMS mapping quality, similar to the version in the INFO field. (Integer) + \item PL : the phred-scaled genotype likelihoods rounded to the closest integer (and otherwise defined precisely as the GL field) (Integers) + \item PQ : phasing quality, the phred-scaled probability that alleles are ordered incorrectly in a heterozygote (against all other members in the phase set). We note that we have not yet included the specific measure for precisely defining ``phasing quality''; our intention for now is simply to reserve the PQ tag for future use as a measure of phasing quality. (Integer) + \item PS : phase set. A phase set is defined as a set of phased genotypes to which this genotype belongs. Phased genotypes for an individual that are on the same chromosome and have the same PS value are in the same phased set. A phase set specifies multi-marker haplotypes for the phased genotypes in the set. All phased genotypes that do not contain a PS subfield are assumed to belong to the same phased set. If the genotype in the GT field is unphased, the corresponding PS field is ignored. The recommended convention is to use the position of the first variant in the set as the PS identifier (although this is not required). (Non-negative 32-bit Integer) \end{itemize} @@ -1383,7 +1395,7 @@ \subsubsection{Type encoding} \end{tabular} \vspace{0.3cm} -\textbf{Character} values are not explicitly typed in BCF2. Instead, VCF Character values should be encoded by a single character string. As with Strings, UNICODE characters are not supported. +\textbf{Character} values are not explicitly typed in BCF2. Instead, VCF Character values should be encoded by a single character string. {\color{red} See also \ref{character-encoding}.} \vspace{0.3cm} \textbf{Flags} values -- which can only appear in INFO fields -- in BCF2 should be encoded by any non-MISSING value. The recommended best practice is to encode the value as an 1-element INT8 (type 0x11) with value of 1 to indicate present. Because FLAG values can only be encoded in INFO fields, BCF2 provides no mechanism to encode FLAG values in genotypes, but could be easily extended to do so if allowed in a future VCF version. @@ -1724,10 +1736,14 @@ \subsection{Changes between VCFv4.2 and VCFv4.3} \item Introduced \#\#META header lines for defining phenotype metadata \item New reserved tag "CNP" analogous to "GP" was added. Both CNP and GP use 0 to 1 encoding, which is a change from previous phred-scaled GP. \item In order for VCF and BCF to have the same expressive power, we state explicitly that Integers and Floats are 32-bit numbers. Integers are signed. -\item We state explicitly that zero length strings are not allowed, this includes the CHROM and ID column, INFO IDs, FILTER IDs and FORMAT IDs. Meta-information lines can be in any order, with the exception of \#\#fileformat which must come first. INFO, FILTER and FORMAT IDs must be unique within that type. +\item We state explicitly that zero length strings are not allowed, this includes the CHROM and ID column, INFO IDs, FILTER IDs and FORMAT IDs. Meta-information lines can be in any order, with the exception of \#\#fileformat which must come first. +\item All header lines of the form \#\#key=$<$ID=xxx,...$>$ must have an ID value +that is unique for a given value of "key". All header lines whose value starts +with "$<$" must have an ID field. Therefore, also \#\#PEDIGREE newly requires a unique ID. \item We state explicitly that duplicate IDs, FILTER, INFO or FORMAT keys are not valid. \item A section about gVCF was added, introduced the $<$*$>$ symbolic allele. -\item A section about tag naming conventions was added +\item A section about tag naming conventions was added. +\item New reserved AD, ADF, and ADR INFO and FORMAT fields added. \end{itemize} \subsection{Changes between BCFv2.1 and BCFv2.2} From d6b46dfa98a96458820f8e7f13e8c3a71d5484e6 Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Wed, 3 Jun 2015 20:33:08 +0200 Subject: [PATCH 18/34] GL ordering made clearer as discussed in #83 --- VCFv4.3.tex | 64 ++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 53 insertions(+), 11 deletions(-) diff --git a/VCFv4.3.tex b/VCFv4.3.tex index 45dd85b7d..a47b93809 100644 --- a/VCFv4.3.tex +++ b/VCFv4.3.tex @@ -300,25 +300,67 @@ \subsubsection{Genotype fields} \item $/$ : genotype unphased \item $\mid$ : genotype phased \end{itemize} + \item GL : genotype likelihoods comprised of comma separated floating point $log_{10}$-scaled likelihoods for all possible genotypes given the set of alleles defined in the REF and ALT fields. {\color{red} In presence of the GT field the same ploidy is expected; without GT field, diploidy is assumed. - If A is the allele in REF and B,C,... are the alleles as ordered in ALT, the - ordering of genotypes for the likelihoods is - AA,AB,BB for biallelic sites; AA,AB,BB,AC,BC,CC for triallelic sites, etc. - In general case of ploidy P and N alternate alleles (0 is the REF and 1..N + \textsc{Genotype Ordering.} In general case of ploidy P and N alternate alleles (0 is the REF and 1..N the alternate alleles), the ordering of genotypes for the likelihoods can - be expressed by the following pseudocode with variable number of nested loops: + be expressed by the following pseudocode with as many nested loops as ploidy:\footnote{\color{red}Note + that we use inclusive \texttt{for} loop boundaries.} + \begingroup + \small + \begin{lstlisting} + for $a_P = 0\ldots N$ + for $a_{P-1} = 0\ldots a_P$ + $\ldots$ + for $a_1 = 0\ldots a_{2}$ + println $a_1 a_2 \ldots a_P$ + \end{lstlisting} + \endgroup + + Alternatively, the same can be achieved recursively with the following pseudocode: + \begingroup + \small + \begin{lstlisting} + Ordering($P$, $N$, suffix=""): + for $a$ in $0\ldots N$ + if ($P == 1$) println str($a$) + suffix + if ($P > 1$) Ordering($P$-1, $a$, str($a$) + suffix) + \end{lstlisting} + \endgroup + + Conversely, the index of the value corresponding to the genotype $k_1\le k_2\le\ldots\le k_P$ is + \begingroup + \small \begin{lstlisting} - for $a_1 = 0\ldots N$ - $\ldots$ - for $a_P = 0\ldots a_{P-1}$ - println $a_1 a_2 \ldots a_P$ - \end{lstlisting}} + Index($k_1/k_2/\ldots/k_P$) = $\sum_{m=1}^{P} {k_m + m - 1 \choose m}$ + \end{lstlisting} + \endgroup + + Examples: + \begin{itemize} + \item for $P$=2 and $N$=1, the ordering is 00,01,11 + \item for $P$=2 and $N$=2, the ordering is 00,01,11,02,12,22 + \item for $P$=3 and $N$=2, the ordering is 000, 001, 011, 111, 002, 012, 112, 022, 122, 222 + \item for $P$=1, the index of the genotype $a$ is $a$ + \item for $P$=2, the index of the genotype "$a/b$", where $a\le b$, is $b (b+1)/2 + a$ + \item for $P$=2 and arbitrary $N$, the ordering can be easily derived from a triangular matrix + \newline + \hbox{\hskip5em\footnotesize + \begin{tabular}{l|llll} + $b\setminus a$ & 0 & 1 & 2 & 3 \\ \hline \\[-0.5em] + 0 & 0 & & & \\ + 1 & 1 & 2 & & \\ + 2 & 3 & 4 & 5 & \\ + 3 & 6 & 7 & 8 & 9 + \end{tabular} + } + \end{itemize} + } - For example: GT:GL 0/1:-323.03,-99.29,-802.53 (Floats) \item GLE : genotype likelihoods of heterogeneous ploidy, used in presence of uncertain copy number. For example: GLE=0:-75.22,1:-223.42,0/0:-323.03,1/0:-99.29,1/1:-802.53 (String) \item HQ : haplotype qualities, two comma separated phred qualities (Integers) \item MQ : RMS mapping quality, similar to the version in the INFO field. (Integer) From 08506d8a3693401228c16099a48992280c749a64 Mon Sep 17 00:00:00 2001 From: John Marshall Date: Thu, 11 Jun 2015 16:42:45 +0100 Subject: [PATCH 19/34] Be definitive in the ##fileformat section This section specifies ##fileformat, not just gives an example. --- VCFv4.3.tex | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/VCFv4.3.tex b/VCFv4.3.tex index a47b93809..f1b405cf9 100644 --- a/VCFv4.3.tex +++ b/VCFv4.3.tex @@ -91,10 +91,10 @@ \subsection{Meta-information lines} \subsubsection{File format} -A single `fileformat' field is always required, must be the first line in the file, and details the VCF format version number. For example, for VCF version 4.2, this line should read: +A single `fileformat' line is always required, must be the first line in the file, and details the VCF format version number. For VCF version 4.3, this line should read: \begin{verbatim} -##fileformat=VCFv4.2 +##fileformat=VCFv4.3 \end{verbatim} {\color{red} From 1c5e2787daf6d7bf97d9e4d2d77c082716351750 Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Mon, 22 Jun 2015 14:45:45 +0200 Subject: [PATCH 20/34] Removed ill-defined GLE tag, resolves #90 --- VCFv4.3.tex | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/VCFv4.3.tex b/VCFv4.3.tex index f1b405cf9..eafcadfb8 100644 --- a/VCFv4.3.tex +++ b/VCFv4.3.tex @@ -361,7 +361,6 @@ \subsubsection{Genotype fields} \end{itemize} } - \item GLE : genotype likelihoods of heterogeneous ploidy, used in presence of uncertain copy number. For example: GLE=0:-75.22,1:-223.42,0/0:-323.03,1/0:-99.29,1/1:-802.53 (String) \item HQ : haplotype qualities, two comma separated phred qualities (Integers) \item MQ : RMS mapping quality, similar to the version in the INFO field. (Integer) \item PL : the phred-scaled genotype likelihoods rounded to the closest integer (and otherwise defined precisely as the GL field) (Integers) @@ -1786,6 +1785,7 @@ \subsection{Changes between VCFv4.2 and VCFv4.3} \item A section about gVCF was added, introduced the $<$*$>$ symbolic allele. \item A section about tag naming conventions was added. \item New reserved AD, ADF, and ADR INFO and FORMAT fields added. +\item Removed unused and ill-defined GLE FORMAT tag. \end{itemize} \subsection{Changes between BCFv2.1 and BCFv2.2} From db372bca317e5576a5e082e63c395ac702054dee Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Mon, 22 Jun 2015 15:05:36 +0200 Subject: [PATCH 21/34] Data Types section, describe valid float format, NaN and +/-Inf are allowed --- VCFv4.3.tex | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/VCFv4.3.tex b/VCFv4.3.tex index eafcadfb8..25bf3efc5 100644 --- a/VCFv4.3.tex +++ b/VCFv4.3.tex @@ -105,6 +105,11 @@ \subsubsection{Character encoding} property that none of the bytes in any multi-byte characters are 7-bit ASCII bytes. As a result, most software that processes VCF files does not have to be aware of the possible presence of multi-byte UTF-8 characters. + +\subsubsection{Data types} +Data types supported by VCF are: Integer (32-bit, signed), Float (32-bit, formatted +to match the regular expression \texttt{\^{}[-+]?[0-9]*\textbackslash.?[0-9]+([eE][-+]?[0-9]+)?\$}, \texttt{NaN}, or \texttt{+/-Inf}), Flag, Character, and +String. For the Integer type, the values from $-2^{31}$ to $-2^{31}+7$ cannot be stored in the binary version, see \ref{BcfTypeEncoding}. } \subsubsection{Information field format} @@ -114,8 +119,8 @@ \subsubsection{Information field format} ##INFO= \end{verbatim} -Possible Types for INFO fields are: Integer {\color{red} (32-bit, signed)}, Float {\color{red} (32-bit)}, Flag, Character, and -String. {\color{red} For the Integer type, the values from $-2^{31}$ to $-2^{31}+7$ cannot be stored in binary version, see \ref{BcfTypeEncoding}.} +Possible Types for INFO fields are: Integer, Float, Flag, Character, and +String. The Number entry is an Integer that describes the number of values that can be included with the INFO field. For example, if the INFO field contains a single number, then this value should be $1$; if the INFO field describes a From c1286807a95f7725946f75ae20e7642e8e5f28e7 Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Mon, 29 Jun 2015 10:41:44 +0200 Subject: [PATCH 22/34] Extended bits about character encoding - added list of non-printable characters - specified the percent encoding - lines should not end with TAB - last line in the file should end with new line separator --- VCFv4.3.tex | 35 ++++++++++++++++++++++++++++++----- 1 file changed, 30 insertions(+), 5 deletions(-) diff --git a/VCFv4.3.tex b/VCFv4.3.tex index 25bf3efc5..2d0a2b4d7 100644 --- a/VCFv4.3.tex +++ b/VCFv4.3.tex @@ -98,18 +98,38 @@ \subsubsection{File format} \end{verbatim} {\color{red} -\subsubsection{Character encoding} +\subsubsection{Character encoding, non-printable characters and characters with special meaning} \label{character-encoding} The character encoding of VCF files is UTF-8. UTF-8 is a multi-byte character encoding that is a strict superset of 7-bit ASCII and has the property that none of the bytes in any multi-byte characters are 7-bit ASCII bytes. As a result, most software that processes VCF files does not have to be aware of the possible presence of multi-byte UTF-8 characters. +Note that non-printable characters U+0000-U+0008, U+000B-U+000C, U+000E-U+001F are disallowed. +Line separators must be CR+LF or LF and they are allowed only as line separators at +end of line. Characters with special meaning must be represented using the (capitalized) +percent encoding + +\begingroup\footnotesize +\begin{tabular}{l l l} +\%3A & : & (colon) \\ +\%3B & ; & (semicolon) \\ +\%3D & = & (equal sign) \\ +\%25 & \% & (percent sign) \\ +\%2C & , & (comma) \\ +\%0D & CR & \\ +\%0A & LF & \\ +\%09 & TAB & +\end{tabular} +\endgroup + \subsubsection{Data types} Data types supported by VCF are: Integer (32-bit, signed), Float (32-bit, formatted to match the regular expression \texttt{\^{}[-+]?[0-9]*\textbackslash.?[0-9]+([eE][-+]?[0-9]+)?\$}, \texttt{NaN}, or \texttt{+/-Inf}), Flag, Character, and String. For the Integer type, the values from $-2^{31}$ to $-2^{31}+7$ cannot be stored in the binary version, see \ref{BcfTypeEncoding}. + + } \subsubsection{Information field format} @@ -195,6 +215,8 @@ \subsubsection{Contig field format} \begin{verbatim} ##contig= \end{verbatim} + + \subsubsection{Sample field format} It is possible to define sample to genome mappings as shown below: {\color{red} @@ -237,11 +259,14 @@ \subsection{Header line syntax} \item INFO \end{enumerate} -If genotype data is present in the file, these are followed by a FORMAT column header, then an arbitrary number of sample IDs. The header line is tab-delimited. +If genotype data is present in the file, these are followed by a FORMAT column header, then an arbitrary number of sample IDs. The header line is tab-delimited +{\color{red} and there should be no tab characters at the end of the line}. \subsection{Data lines} \subsubsection{Fixed fields} -There are 8 fixed fields per record. All data lines are tab-delimited. In all cases, missing values are specified with a dot (`.'). Fixed fields are: +There are 8 fixed fields per record. All data lines are tab-delimited +{\color{red} with no tab character at the end of the line. The last data line should end with a line separator.} In all cases, +missing values are specified with a dot (`.'). Fixed fields are: \begin{enumerate} \item CHROM - chromosome: An identifier from the reference genome or an angle-bracketed ID String (``$<$ID$>$'') pointing to a contig in the assembly file (cf. the \#\#assembly line in the header). All entries for a specific CHROM should form a contiguous block within the VCF file. The colon symbol (:) must be absent from all chromosome names to avoid parsing errors when dealing with breakends. (String, no white-space permitted, Required). @@ -253,7 +278,7 @@ \subsubsection{Fixed fields} \item FILTER - filter status: PASS if this position has passed all filters, i.e. a call is made at this position. Otherwise, if the site has not passed all filters, a semicolon-separated list of codes for filters that fail. e.g. ``q10;s50'' might indicate that at this site the quality is below 10 and the number of samples with data is below 50\% of the total number of samples. `0' is reserved and should not be used as a filter String. If filters have not been applied, then this field should be set to the missing value. (String, no white-space or semi-colons permitted, {\color{red}duplicate values not allowed.}) \item INFO - additional information: (String, no semi-colons or equals-signs permitted; commas are permitted only as delimiters for lists of - values; characters with special meaning can be encoded using URL encoding; space characters are allowed) + values; characters with special meaning can be encoded using the percent encoding, see Section~\ref{character-encoding}; space characters are allowed) INFO fields are encoded as a semicolon-separated series of short keys with optional values in the format: $<$key$>$=$<$data$>$[,data]. {\color{red} INFO keys must match the regular expression \texttt{\^{}[A-Za-z\_][0-9A-Za-z\_.]*\$}, duplicate fields are not allowed.} @@ -1776,7 +1801,7 @@ \subsection{Changes between VCFv4.2 and VCFv4.3} \item VCF compliant implementations must support both LF and CR+LF newline conventions \item INFO and FORMAT tag names must match the regular expression \texttt{\^{}[A-Za-z\_][0-9A-Za-z\_.]*\$} \item Spaces are allowed in INFO field values -\item Characters with special meaning (such as ';' in INFO, ':' in FORMAT, and '\%' in both) can be encoded using URL encoding +\item Characters with special meaning (such as ';' in INFO, ':' in FORMAT, and '\%' in both) can be encoded using the percent encoding (see Section~\ref{character-encoding}) \item The character encoding of VCF files is UTF-8. \item The SAMPLE field can contain optional DOI URL for the source data file \item Introduced \#\#META header lines for defining phenotype metadata From c50589be123e547c44e1208b39f9ba28dd181a5f Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Mon, 29 Jun 2015 10:51:39 +0200 Subject: [PATCH 23/34] Chromosome names cannot use reserved symbolic alleles and contain characters used by breakpoints --- VCFv4.3.tex | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/VCFv4.3.tex b/VCFv4.3.tex index 2d0a2b4d7..ba97103cf 100644 --- a/VCFv4.3.tex +++ b/VCFv4.3.tex @@ -210,12 +210,19 @@ \subsubsection{Assembly field format} \end{verbatim} The URL field specifies the location of a fasta file containing breakpoint assemblies referenced in the VCF records for structural variants via the BKPTID INFO key. + \subsubsection{Contig field format} +\label{sec-contig-field} As with chromosomal sequences it is highly recommended (but not required) that the header include tags describing the contigs referred to in the VCF file. This furthermore allows these contigs to come from different files. The format is identical to that of a reference sequence, but with an additional URL tag to indicate where that sequence can be found. For example:. \begin{verbatim} ##contig= \end{verbatim} +\noindent +{\color{red}Valid contig names must follow the reference sequence names allowed by the SAM format, +excluding the characters "\texttt{\textless\textgreater[]:*}". The contig names must not use a +reserved symbolic allele name. +} \subsubsection{Sample field format} It is possible to define sample to genome mappings as shown below: @@ -1816,6 +1823,7 @@ \subsection{Changes between VCFv4.2 and VCFv4.3} \item A section about tag naming conventions was added. \item New reserved AD, ADF, and ADR INFO and FORMAT fields added. \item Removed unused and ill-defined GLE FORMAT tag. +\item Chromosome names cannot use reserved symbolic alleles and contain characters used by breakpoints (Section~\ref{sec-contig-field}). \end{itemize} \subsection{Changes between BCFv2.1 and BCFv2.2} From 4f49af05caeab4a73f15e8711ee2015b48a6c405 Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Mon, 29 Jun 2015 11:03:43 +0200 Subject: [PATCH 24/34] Reference SV INFO and FORMAT section from the main text --- VCFv4.3.tex | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/VCFv4.3.tex b/VCFv4.3.tex index ba97103cf..e20c98e9f 100644 --- a/VCFv4.3.tex +++ b/VCFv4.3.tex @@ -310,6 +310,7 @@ \subsubsection{Fixed fields} \item SOMATIC : indicates that the record is a somatic mutation, for cancer genomics \item VALIDATED : validated by follow-up experiment \item 1000G : membership in 1000 Genomes + \item $\ldots$ see Section~\ref{sv-info-keys} for a list of INFO keys reserved for structural variants. \end{itemize} \end{enumerate} The exact format of each INFO sub-field should be specified in the meta-information (as described above). @@ -403,6 +404,7 @@ \subsubsection{Genotype fields} \item PL : the phred-scaled genotype likelihoods rounded to the closest integer (and otherwise defined precisely as the GL field) (Integers) \item PQ : phasing quality, the phred-scaled probability that alleles are ordered incorrectly in a heterozygote (against all other members in the phase set). We note that we have not yet included the specific measure for precisely defining ``phasing quality''; our intention for now is simply to reserve the PQ tag for future use as a measure of phasing quality. (Integer) \item PS : phase set. A phase set is defined as a set of phased genotypes to which this genotype belongs. Phased genotypes for an individual that are on the same chromosome and have the same PS value are in the same phased set. A phase set specifies multi-marker haplotypes for the phased genotypes in the set. All phased genotypes that do not contain a PS subfield are assumed to belong to the same phased set. If the genotype in the GT field is unphased, the corresponding PS field is ignored. The recommended convention is to use the position of the first variant in the set as the PS identifier (although this is not required). (Non-negative 32-bit Integer) + \item $\ldots$ see Section~\ref{sv-format-keys} for a list of genotype keys reserved for structural variants. \end{itemize} @@ -440,6 +442,7 @@ \subsection{VCF tag naming conventions} \section{INFO keys used for structural variants} +\label{sv-info-keys} The following INFO keys are reserved for encoding structural variants. In general, when these keys are used by imprecise variants, the values should be best estimates. When a key reflects a property of a single alt allele (e.g. SVLEN), then when there are multiple alt alleles there will be multiple values for the key corresponding to each alelle (e.g. SVLEN=-100,-110 for a deletion with two distinct alt alleles). \footnotesize \begin{verbatim} @@ -492,6 +495,7 @@ \section{INFO keys used for structural variants} \normalsize \section{FORMAT keys used for structural variants} +\label{sv-format-keys} \footnotesize \begin{verbatim} ##FORMAT= From 6cc560491695492e1c4b38dfef52204f2e2429f6 Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Mon, 29 Jun 2015 11:47:23 +0200 Subject: [PATCH 25/34] IUPAC ambiguity codes, as per #54 --- VCFv4.3.tex | 63 ++++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 53 insertions(+), 10 deletions(-) diff --git a/VCFv4.3.tex b/VCFv4.3.tex index e20c98e9f..128b835e9 100644 --- a/VCFv4.3.tex +++ b/VCFv4.3.tex @@ -84,7 +84,18 @@ \subsection{Meta-information lines} body of the VCF file. All structured lines that have their value enclosed within "$<>$" require an ID -which must be unique within their type. +which must be unique within their type. For all of the structured lines (\#\#INFO, \#\#FORMAT, +\#\#FILTER, etc.), extra fields can be included after the default fields. For example: +\begin{verbatim} +##INFO= +\end{verbatim} +In the above example, the extra fields of ``Source'' and ``Version'' are +provided. Optional fields should be stored as strings even for numeric values. + +It is highly recommended (but not required) that the header +include tags describing the reference and contigs backing the data contained in +the file. These tags are based on the SQ field from the SAM spec; all tags are +optional (see the VCF example above). Meta-information lines can be in any order with the exception of `fileformat` which must come first.} @@ -173,12 +184,17 @@ \subsubsection{Individual format field format} Possible Types for FORMAT fields are: Integer, Float, Character, and String (this field is otherwise defined precisely as the INFO field). \subsubsection{Alternative allele field format} -Symbolic alternate alleles for imprecise structural variants: - +Symbolic alternate alleles should be described as follows: \begin{verbatim} ##ALT= \end{verbatim} -The ID field indicates the type of structural variant, and can be a colon-separated list of types and subtypes. ID values are case sensitive strings and may not contain whitespace or angle brackets. The first level type must be one of the following: + +\noindent \textbf{Structural Variants} \newline +In symbolic alternate alleles for imprecise structural variants, +the ID field indicates the type of structural variant, and can be a +colon-separated list of types and subtypes. ID values are case sensitive +strings and may not contain whitespace or angle brackets. The first level type +must be one of the following: \begin{itemize} \item DEL Deletion relative to the reference \item INS Insertion of novel sequence relative to the reference @@ -194,14 +210,17 @@ \subsubsection{Alternative allele field format} \item INS:ME Insertion of a mobile element relative to the reference \end{itemize} -In addition, it is highly recommended (but not required) that the header include tags describing the reference and contigs backing the data contained in the file. These tags are based on the SQ field from the SAM spec; all tags are optional (see the VCF example above). - -For all of the \#\#INFO, \#\#FORMAT, \#\#FILTER, and \#\#ALT metainformation, extra fields can be included after the default fields. For example: +\bigskip +{\color{red} +\noindent \textbf{IUPAC ambiguity codes} \newline +Symbolic alleles can be used also to represent genuinely ambiguous data in VCF, for example: \begin{verbatim} -##INFO= + ##ALT= + ##ALT= \end{verbatim} -In the above example, the extra fields of ``Source'' and ``Version'' are provided. Optional fields should be stored as strings even for numeric values. +} + \subsubsection{Assembly field format} Breakpoint assemblies for structural variations may use an external file: @@ -279,7 +298,29 @@ \subsubsection{Fixed fields} \item CHROM - chromosome: An identifier from the reference genome or an angle-bracketed ID String (``$<$ID$>$'') pointing to a contig in the assembly file (cf. the \#\#assembly line in the header). All entries for a specific CHROM should form a contiguous block within the VCF file. The colon symbol (:) must be absent from all chromosome names to avoid parsing errors when dealing with breakends. (String, no white-space permitted, Required). \item POS - position: The reference position, with the 1st base having position 1. Positions are sorted numerically, in increasing order, within each reference sequence CHROM. It is permitted to have multiple records with the same POS. Telomeres are indicated by using positions 0 or N+1, where N is the length of the corresponding chromosome or contig. (Integer, Required) \item ID - identifier: Semi-colon separated list of unique identifiers where available. If this is a dbSNP variant it is encouraged to use the rs number(s). No identifier should be present in more than one data record. If there is no identifier available, then the missing value should be used. (String, no white-space or semi-colons permitted, {\color{red}duplicate values not allowed.}) - \item REF - reference base(s): Each base must be one of A,C,G,T,N (case insensitive). Multiple bases are permitted. The value in the POS field refers to the position of the first base in the String. For simple insertions and deletions in which either the REF or one of the ALT alleles would otherwise be null/empty, the REF and ALT Strings must include the base before the event (which must be reflected in the POS field), unless the event occurs at position 1 on the contig in which case it must include the base after the event; this padding base is not required (although it is permitted) for e.g. complex substitutions or other events where all alleles have at least one base represented in their Strings. If any of the ALT alleles is a symbolic allele (an angle-bracketed ID String ``$<$ID$>$'') then the padding base is required and POS denotes the coordinate of the base preceding the polymorphism. Tools processing VCF files are not required to preserve case in the allele Strings. (String, Required). + \item REF - reference base(s): Each base must be one of A,C,G,T,N (case + insensitive). Multiple bases are permitted. The value in the POS field refers + to the position of the first base in the String. For simple insertions and + deletions in which either the REF or one of the ALT alleles would otherwise + be null/empty, the REF and ALT Strings must include the base before the event + (which must be reflected in the POS field), unless the event occurs at + position 1 on the contig in which case it must include the base after the + event; this padding base is not required (although it is permitted) for e.g. + complex substitutions or other events where all alleles have at least one + base represented in their Strings. If any of the ALT alleles is a symbolic + allele (an angle-bracketed ID String ``$<$ID$>$'') then the padding base is + required and POS denotes the coordinate of the base preceding the + polymorphism. Tools processing VCF files are not required to preserve case in + the allele Strings. (String, Required). + + {\color{red} + If the reference sequence contains IUPAC ambiguity codes not + allowed by this specification (such as R = A/G), the ambiguous reference base + can be reduced to a concrete base by using the one that is first alphabetically + (thus R as a reference base is converted to A in VCF.) + } + + \item ALT - alternate base(s): Comma separated list of alternate non-reference alleles called on at least one of the samples. Options are base Strings made up of the bases A,C,G,T,N,*, (case insensitive) or an angle-bracketed ID String (``$<$ID$>$'') or a breakend replacement string as described in the section on breakends. The `*' allele is reserved to indicate that the allele is missing due to a upstream deletion. If there are no alternative alleles, then the missing value should be used. Tools processing VCF files are not required to preserve case in the allele String, except for IDs, which are case sensitive. (String; no whitespace, commas, or angle-brackets are permitted in the ID String itself) \item QUAL - quality: Phred-scaled quality score for the assertion made in ALT. i.e. $-10log_{10}$ prob(call in ALT is wrong). If ALT is `.' (no variant) then this is $-10log_{10}$ prob(variant), and if ALT is not `.' this is $-10log_{10}$ prob(no variant). If unknown, the missing value should be specified. (Float) \item FILTER - filter status: PASS if this position has passed all filters, i.e. a call is made at this position. Otherwise, if the site has not passed all filters, a semicolon-separated list of codes for filters that fail. e.g. ``q10;s50'' might indicate that at this site the quality is below 10 and the number of samples with data is below 50\% of the total number of samples. `0' is reserved and should not be used as a filter String. If filters have not been applied, then this field should be set to the missing value. (String, no white-space or semi-colons permitted, {\color{red}duplicate values not allowed.}) @@ -1828,6 +1869,8 @@ \subsection{Changes between VCFv4.2 and VCFv4.3} \item New reserved AD, ADF, and ADR INFO and FORMAT fields added. \item Removed unused and ill-defined GLE FORMAT tag. \item Chromosome names cannot use reserved symbolic alleles and contain characters used by breakpoints (Section~\ref{sec-contig-field}). +\item IUPAC ambiguity codes should be converted to a concrete base. +\item Symbolic ALTs for IUPAC codes. \end{itemize} \subsection{Changes between BCFv2.1 and BCFv2.2} From 4b381fa4d55b3bd2551035939c092ae0d5d97646 Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Tue, 30 Jun 2015 12:55:46 +0200 Subject: [PATCH 26/34] Fixed the wording of trailing tabs and IUPAC codes from "should" to "must" --- VCFv4.3.tex | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/VCFv4.3.tex b/VCFv4.3.tex index 128b835e9..024a5c0c8 100644 --- a/VCFv4.3.tex +++ b/VCFv4.3.tex @@ -286,7 +286,7 @@ \subsection{Header line syntax} \end{enumerate} If genotype data is present in the file, these are followed by a FORMAT column header, then an arbitrary number of sample IDs. The header line is tab-delimited -{\color{red} and there should be no tab characters at the end of the line}. +{\color{red} and there must be no tab characters at the end of the line}. \subsection{Data lines} \subsubsection{Fixed fields} @@ -316,7 +316,7 @@ \subsubsection{Fixed fields} {\color{red} If the reference sequence contains IUPAC ambiguity codes not allowed by this specification (such as R = A/G), the ambiguous reference base - can be reduced to a concrete base by using the one that is first alphabetically + must be reduced to a concrete base by using the one that is first alphabetically (thus R as a reference base is converted to A in VCF.) } From 31bd4ccf3a30ae916f753bfff0ea9d83df745f38 Mon Sep 17 00:00:00 2001 From: Bernie Pope Date: Tue, 14 Jul 2015 10:10:49 +1000 Subject: [PATCH 27/34] Remove repeated text about INFO fields for imprecise structural variants. Insert missing 'of' into sentence about precise variants consensus sequence --- VCFv4.1.tex | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/VCFv4.1.tex b/VCFv4.1.tex index c5a7d25fd..5623e5b3d 100644 --- a/VCFv4.1.tex +++ b/VCFv4.1.tex @@ -231,7 +231,7 @@ \section{Understanding the VCF format and the haplotype representation} \section{INFO keys used for structural variants} When the INFO keys reserved for encoding structural variants are used for imprecise variants, the values should be best estimates. When a key reflects a property of a single alt allele (e.g. SVLEN), then when there are multiple alt alleles there will be multiple values for the key corresponding to each alelle (e.g. SVLEN=-100,-110 for a deletion with two distinct alt alleles). -The following INFO keys are reserved for encoding structural variants. In general, when these keys are used by imprecise variants, the values should be best estimates. When a key reflects a property of a single alt allele (e.g. SVLEN), then when there are multiple alt alleles there will be multiple values for the key corresponding to each alelle (e.g. SVLEN=-100,-110 for a deletion with two distinct alt alleles). +The following INFO keys are reserved for encoding structural variants. \footnotesize \begin{verbatim} ##INFO= @@ -261,7 +261,7 @@ \section{INFO keys used for structural variants} ##INFO= \end{verbatim} \normalsize -For precise variants, the consensus sequence the alternate allele assembly is derivable from the REF and ALT fields. However, the alternate allele assembly file may contain additional information about the characteristics of the alt allele contigs. +For precise variants, the consensus sequence of the alternate allele assembly is derivable from the REF and ALT fields. However, the alternate allele assembly file may contain additional information about the characteristics of the alt allele contigs. \footnotesize \begin{verbatim} ##INFO= From 51b4cb6dfc99ca9735e1dece206ad659379cec1e Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Wed, 29 Jul 2015 17:30:01 +0200 Subject: [PATCH 28/34] PEDIGREE VCF header lines now require ID tag, resolves #96 Plus minor change of wording in ALT * description, "overlapping" rather than "upstream" deletion. --- VCFv4.3.tex | 33 ++++++++++++++++++++++++--------- 1 file changed, 24 insertions(+), 9 deletions(-) diff --git a/VCFv4.3.tex b/VCFv4.3.tex index 024a5c0c8..efc1a60d2 100644 --- a/VCFv4.3.tex +++ b/VCFv4.3.tex @@ -261,7 +261,10 @@ \subsubsection{Pedigree field format} It is possible to record relationships between genomes using the following syntax: {\color{red} \begin{verbatim} -##PEDIGREE= +##PEDIGREE= +##PEDIGREE= +##PEDIGREE= +##PEDIGREE= \end{verbatim} } \noindent or a link to a database: @@ -269,8 +272,11 @@ \subsubsection{Pedigree field format} \begin{verbatim} ##pedigreeDB=URL \end{verbatim} + +\noindent See \ref{PedigreeInDetail} for details. } + \subsection{Header line syntax} The header line names the 8 fixed, mandatory columns. These columns are as follows: @@ -321,7 +327,7 @@ \subsubsection{Fixed fields} } - \item ALT - alternate base(s): Comma separated list of alternate non-reference alleles called on at least one of the samples. Options are base Strings made up of the bases A,C,G,T,N,*, (case insensitive) or an angle-bracketed ID String (``$<$ID$>$'') or a breakend replacement string as described in the section on breakends. The `*' allele is reserved to indicate that the allele is missing due to a upstream deletion. If there are no alternative alleles, then the missing value should be used. Tools processing VCF files are not required to preserve case in the allele String, except for IDs, which are case sensitive. (String; no whitespace, commas, or angle-brackets are permitted in the ID String itself) + \item ALT - alternate base(s): Comma separated list of alternate non-reference alleles called on at least one of the samples. Options are base Strings made up of the bases A,C,G,T,N,*, (case insensitive) or an angle-bracketed ID String (``$<$ID$>$'') or a breakend replacement string as described in the section on breakends. The `*' allele is reserved to indicate that the allele is {\color{red}missing due to a an overlapping deletion}. If there are no alternative alleles, then the missing value should be used. Tools processing VCF files are not required to preserve case in the allele String, except for IDs, which are case sensitive. (String; no whitespace, commas, or angle-brackets are permitted in the ID String itself) \item QUAL - quality: Phred-scaled quality score for the assertion made in ALT. i.e. $-10log_{10}$ prob(call in ALT is wrong). If ALT is `.' (no variant) then this is $-10log_{10}$ prob(variant), and if ALT is not `.' this is $-10log_{10}$ prob(no variant). If unknown, the missing value should be specified. (Float) \item FILTER - filter status: PASS if this position has passed all filters, i.e. a call is made at this position. Otherwise, if the site has not passed all filters, a semicolon-separated list of codes for filters that fail. e.g. ``q10;s50'' might indicate that at this site the quality is below 10 and the number of samples with data is below 50\% of the total number of samples. `0' is reserved and should not be used as a filter String. If filters have not been applied, then this field should be set to the missing value. (String, no white-space or semi-colons permitted, {\color{red}duplicate values not allowed.}) \item INFO - additional information: (String, no semi-colons or @@ -1121,6 +1127,7 @@ \subsubsection{Sample mixtures} \normalsize \subsubsection{Clonal derivation relationships} +\label{PedigreeInDetail} In cancer, each VCF file represents several genomes from a patient, but one genome is special in that it represents the germline genome of the patient. This genome is contrasted to a second genome, the cancer tumor genome. In the simplest case the VCF file for a single patient contains only these two genomes. This is assumed in most of the discussion of the sections below. In general there may be several tumor genomes from the same patient in the VCF file. Some of these may be secondary tumors derived from an original primary tumor. We suggest the derivation relationships between genomes in a cancer VCF file be represented in the header with PEDIGREE tags. @@ -1129,15 +1136,19 @@ \subsubsection{Clonal derivation relationships} The general format of a PEDIGREE line describing asexual, clonal derivation is: +{\color{red} \begin{verbatim} -PEDIGREE= +PEDIGREE= \end{verbatim} +} This line asserts that the DNA in genome is asexually or clonally derived with mutations from the DNA in genome. This is the asexual analog of the VCF format that has been proposed for family relationships between genomes, i.e. there is one entry per of the form: +{\color{red} \begin{verbatim} -PEDIGREE= +PEDIGREE= \end{verbatim} +} Let's consider a cancer patient VCF file with 4 genomes: germline, primary\_tumor, secondary\_tumor1, and secondary\_tumor2 as illustrated in Figure 10. The primary\_tumor is derived from the germline and the secondary tumors are each derived independently from the primary tumor, in all cases by clonal derivation with mutations. The PEDIGREE lines would look like: @@ -1147,11 +1158,13 @@ \subsubsection{Clonal derivation relationships} \caption{Pegigree example} \end{figure} +{\color{red} \begin{verbatim} -##PEDIGREE= -##PEDIGREE= -##PEDIGREE= +##PEDIGREE= +##PEDIGREE= +##PEDIGREE= \end{verbatim} +} Alternately, if data on the genomes is compiled in a database, a simple pointer can be provided: @@ -1161,11 +1174,13 @@ \subsubsection{Clonal derivation relationships} The most general form of a pedigree line is: +{\color{red} \begin{verbatim} -##PEDIGREE= +##PEDIGREE= \end{verbatim} +} -This means that the genome Name\_0 is derived from the N $\ge$ 1 genomes Name\_1, ..., Name\_N. Based on these derivation relationships two new pieces of information can be specified. +This means that the genome SampleID is derived from the N $\ge$ 1 genomes Ancestor1, ..., AncestorN. Based on these derivation relationships two new pieces of information can be specified. Firstly, we wish to express the knowledge that a variant is novel to a genome, with respect to its parent genome. Ideally, this could be derived by simply comparing the features on either genomes. However, insufficient data or sample mixtures might prevent us from clearly determining at which stage a given variant appeared. This would be represented by a mutation quality score. From 56e8baee015cfe4b2926794b98170b19c43d0009 Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Mon, 24 Aug 2015 15:07:28 +0200 Subject: [PATCH 29/34] Clarifications made as requested by https://github.com/samtools/hts-specs/pull/88#issuecomment-127329756 --- VCFv4.3.tex | 82 ++++++++++++++++++++++++++++------------------------- 1 file changed, 44 insertions(+), 38 deletions(-) diff --git a/VCFv4.3.tex b/VCFv4.3.tex index efc1a60d2..f9762b443 100644 --- a/VCFv4.3.tex +++ b/VCFv4.3.tex @@ -73,6 +73,40 @@ \subsection{An example} \end{verbatim} \normalsize This example shows (in order): a good simple SNP, a possible SNP that has been filtered out because its quality is below 10, a site at which two alternate alleles are called, with one of them (T) being ancestral (possibly a reference sequencing error), a site that is called monomorphic reference (i.e. with no alternate alleles), and a microsatellite with two alternative alleles, one a deletion of 2 bases (TC), and the other an insertion of one base (T). Genotype data are given for three samples, two of which are phased and the third unphased, with per sample genotype quality, depth and haplotype qualities (the latter only for the phased samples) given as well as the genotypes. The microsatellite calls are unphased. + +{\color{red} +\subsection{Character encoding, non-printable characters and characters with special meaning} +\label{character-encoding} +The character encoding of VCF files is UTF-8. UTF-8 is a multi-byte +character encoding that is a strict superset of 7-bit ASCII and has the +property that none of the bytes in any multi-byte characters are 7-bit ASCII +bytes. As a result, most software that processes VCF files does not have +to be aware of the possible presence of multi-byte UTF-8 characters. +Note that non-printable characters U+0000-U+0008, U+000B-U+000C, U+000E-U+001F are disallowed. +Line separators must be CR+LF or LF and they are allowed only as line separators at +end of line. +Characters with special meaning (such as field delimiters ';' in INFO +or ':' FORMAT fields) must be represented using the capitalized percent encoding: + +\begingroup\footnotesize +\begin{tabular}{l l l} +\%3A & : & (colon) \\ +\%3B & ; & (semicolon) \\ +\%3D & = & (equal sign) \\ +\%25 & \% & (percent sign) \\ +\%2C & , & (comma) \\ +\%0D & CR & \\ +\%0A & LF & \\ +\%09 & TAB & +\end{tabular} +\endgroup + + +\subsection{Data types} +Data types supported by VCF are: Integer (32-bit, signed), Float (32-bit, formatted +to match the regular expression \texttt{\^{}[-+]?[0-9]*\textbackslash.?[0-9]+([eE][-+]?[0-9]+)?\$}, \texttt{NaN}, or \texttt{+/-Inf}), Flag, Character, and +String. For the Integer type, the values from $-2^{31}$ to $-2^{31}+7$ cannot be stored in the binary version, see \ref{BcfTypeEncoding}. +} \subsection{Meta-information lines} @@ -108,41 +142,8 @@ \subsubsection{File format} ##fileformat=VCFv4.3 \end{verbatim} -{\color{red} -\subsubsection{Character encoding, non-printable characters and characters with special meaning} -\label{character-encoding} -The character encoding of VCF files is UTF-8. UTF-8 is a multi-byte -character encoding that is a strict superset of 7-bit ASCII and has the -property that none of the bytes in any multi-byte characters are 7-bit ASCII -bytes. As a result, most software that processes VCF files does not have -to be aware of the possible presence of multi-byte UTF-8 characters. -Note that non-printable characters U+0000-U+0008, U+000B-U+000C, U+000E-U+001F are disallowed. -Line separators must be CR+LF or LF and they are allowed only as line separators at -end of line. Characters with special meaning must be represented using the (capitalized) -percent encoding - -\begingroup\footnotesize -\begin{tabular}{l l l} -\%3A & : & (colon) \\ -\%3B & ; & (semicolon) \\ -\%3D & = & (equal sign) \\ -\%25 & \% & (percent sign) \\ -\%2C & , & (comma) \\ -\%0D & CR & \\ -\%0A & LF & \\ -\%09 & TAB & -\end{tabular} -\endgroup - - -\subsubsection{Data types} -Data types supported by VCF are: Integer (32-bit, signed), Float (32-bit, formatted -to match the regular expression \texttt{\^{}[-+]?[0-9]*\textbackslash.?[0-9]+([eE][-+]?[0-9]+)?\$}, \texttt{NaN}, or \texttt{+/-Inf}), Flag, Character, and -String. For the Integer type, the values from $-2^{31}$ to $-2^{31}+7$ cannot be stored in the binary version, see \ref{BcfTypeEncoding}. -} - \subsubsection{Information field format} INFO fields should be described as follows (first four keys are required, source and version are recommended): @@ -232,17 +233,22 @@ \subsubsection{Assembly field format} \subsubsection{Contig field format} \label{sec-contig-field} -As with chromosomal sequences it is highly recommended (but not required) that the header include tags describing the contigs referred to in the VCF file. This furthermore allows these contigs to come from different files. The format is identical to that of a reference sequence, but with an additional URL tag to indicate where that sequence can be found. For example:. +It is highly recommended (and required for BCF) that the header includes tags +describing the contigs referred to in the VCF file. The structured \texttt{contig} +field must include the ID attribute and typically includes also +sequence length, MD5 checksum, URL tag to indicate where the sequence can be +found, etc. For example: \begin{verbatim} -##contig= +##contig= \end{verbatim} \noindent -{\color{red}Valid contig names must follow the reference sequence names allowed by the SAM format, -excluding the characters "\texttt{\textless\textgreater[]:*}". The contig names must not use a -reserved symbolic allele name. +{\color{red}Valid contig names must follow the reference sequence names allowed by the SAM format +("{\tt [!-)+-\char60\char62-\char126][!-\char126]*}") excluding the characters "\texttt{\textless\textgreater[]:*}" to avoid clashes with +symbolic alleles and breakend notation. The contig names must not use a reserved symbolic allele name. } + \subsubsection{Sample field format} It is possible to define sample to genome mappings as shown below: {\color{red} From 187745fe48c340935f61ff90cb52ee313be71fd5 Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Wed, 9 Sep 2015 09:47:00 +0200 Subject: [PATCH 30/34] Fix inconsistent description of PASS in BCF. The implicit filter PASS was described inconsistently throughout BCFv2.1. It is encoded as the first entry in the dictionary, not the last. --- VCFv4.3.tex | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/VCFv4.3.tex b/VCFv4.3.tex index f9762b443..b3c2d88f5 100644 --- a/VCFv4.3.tex +++ b/VCFv4.3.tex @@ -1332,7 +1332,7 @@ \subsubsection{Dictionary of strings} 20 10228 rs143255646 TA T . PASS ASP;RSPOS=10229;dbSNPBuildID=134 \end{verbatim} \normalsize -would be encoded inline in BCF2 by reference to the relative position of the header line in the header (ASP=1, RSPOS=2, dbSNPBuildID=3, and PASS implicitly encoded in the last offset PASS=4) +would be encoded inline in BCF2 by reference to the relative position of the header line in the header (ASP=1, RSPOS=2, dbSNPBuildID=3, and PASS implicitly encoded in the {\color{red}first offset PASS=0}). \small \begin{verbatim} @@ -1732,7 +1732,7 @@ \subsubsection{Encoding REF/ALT fields} \subsubsection{Encoding FILTER} -``PASS'' is implicitly encoded as the last entry in the header dictionary (see dictionary of strings). Here we encode the PASS FILTER field as a vector of size 1 of type 8-bit, which has type byte is 0x11. The value is the offset 0: +``PASS'' is implicitly encoded as the {\color{red}first} entry in the header dictionary (see dictionary of strings). Here we encode the PASS FILTER field as a vector of size 1 of type 8-bit, which has type byte is 0x11. The value is the offset 0: \vspace{0.3cm} \begin{tabular}{|l| l|} \hline @@ -1900,6 +1900,7 @@ \subsection{Changes between BCFv2.1 and BCFv2.2} \item We introduce end-of-vector byte and reserve 8 values for future use \item Clarified that except the end-of-vector byte, no other negative values are allowed in the GT array \item String vectors in BCF do not need to start with comma, as the number of values is indicated already in the definition of the tag in the header. +\item The implicit filter PASS was described inconsistently throughout BCFv2.1: It is encoded as the first entry in the dictionary, not the last. \end{itemize} } From d3a28e273fe02474c39099cea47a2813e93ad97f Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Thu, 24 Sep 2015 09:29:42 +0200 Subject: [PATCH 31/34] Removed red coloring from the text, VCFv4.3 is ready to be merged --- VCFv4.3.tex | 92 ++++++++++++++++------------------------------------- 1 file changed, 28 insertions(+), 64 deletions(-) diff --git a/VCFv4.3.tex b/VCFv4.3.tex index b3c2d88f5..9073f3d53 100644 --- a/VCFv4.3.tex +++ b/VCFv4.3.tex @@ -12,7 +12,7 @@ } \usepackage{color} -\renewcommand{\thefootnote}{\color{red}\fnsymbol{footnote}} +\renewcommand{\thefootnote}{\fnsymbol{footnote}} \begin{document} \input{VCFv4.3.ver} @@ -33,15 +33,14 @@ \section{The VCF specification} VCF is a text file format (most likely stored in a compressed manner). -It contains meta-information lines {\color{red}(prefixed with "\#\#")}, a header -line {\color{red}(prefixed with "\#")}, and data lines +It contains meta-information lines (prefixed with "\#\#"), a header +line (prefixed with "\#"), and data lines each containing information about a position in the genome and genotype information on samples for each position -{\color{red}(text fields separated by tabs). Zero length fields are not allowed, a dot (".") should +(text fields separated by tabs). Zero length fields are not allowed, a dot (".") should be used instead. In order to ensure interoperability across platforms, VCF compliant implementations must support both LF (\texttt{\textbackslash n}) and CR+LF (\texttt{\textbackslash r\textbackslash n}) newline conventions. -} \subsection{An example} \scriptsize @@ -74,7 +73,6 @@ \subsection{An example} \normalsize This example shows (in order): a good simple SNP, a possible SNP that has been filtered out because its quality is below 10, a site at which two alternate alleles are called, with one of them (T) being ancestral (possibly a reference sequencing error), a site that is called monomorphic reference (i.e. with no alternate alleles), and a microsatellite with two alternative alleles, one a deletion of 2 bases (TC), and the other an insertion of one base (T). Genotype data are given for three samples, two of which are phased and the third unphased, with per sample genotype quality, depth and haplotype qualities (the latter only for the phased samples) given as well as the genotypes. The microsatellite calls are unphased. -{\color{red} \subsection{Character encoding, non-printable characters and characters with special meaning} \label{character-encoding} The character encoding of VCF files is UTF-8. UTF-8 is a multi-byte @@ -106,13 +104,13 @@ \subsection{Data types} Data types supported by VCF are: Integer (32-bit, signed), Float (32-bit, formatted to match the regular expression \texttt{\^{}[-+]?[0-9]*\textbackslash.?[0-9]+([eE][-+]?[0-9]+)?\$}, \texttt{NaN}, or \texttt{+/-Inf}), Flag, Character, and String. For the Integer type, the values from $-2^{31}$ to $-2^{31}+7$ cannot be stored in the binary version, see \ref{BcfTypeEncoding}. -} + \subsection{Meta-information lines} File meta-information is included after the \#\# string and must be key=value pairs. Meta-information lines are optional, but if they are present then -they must be completely well-formed. {\color{red} Note that BCF, the binary +they must be completely well-formed. Note that BCF, the binary counterpart of VCF, requires that all entries are present. It is strongly encouraged to include meta-information lines describing the entries used in the body of the VCF file. @@ -132,7 +130,7 @@ \subsection{Meta-information lines} optional (see the VCF example above). Meta-information lines can be in any order with the exception of `fileformat` -which must come first.} +which must come first. \subsubsection{File format} @@ -213,14 +211,12 @@ \subsubsection{Alternative allele field format} \bigskip -{\color{red} \noindent \textbf{IUPAC ambiguity codes} \newline Symbolic alleles can be used also to represent genuinely ambiguous data in VCF, for example: \begin{verbatim} ##ALT= ##ALT= \end{verbatim} -} \subsubsection{Assembly field format} @@ -243,16 +239,14 @@ \subsubsection{Contig field format} \end{verbatim} \noindent -{\color{red}Valid contig names must follow the reference sequence names allowed by the SAM format +Valid contig names must follow the reference sequence names allowed by the SAM format ("{\tt [!-)+-\char60\char62-\char126][!-\char126]*}") excluding the characters "\texttt{\textless\textgreater[]:*}" to avoid clashes with symbolic alleles and breakend notation. The contig names must not use a reserved symbolic allele name. -} \subsubsection{Sample field format} It is possible to define sample to genome mappings as shown below: -{\color{red} -\scriptsize +{\scriptsize \begin{verbatim} ##META= ##META= @@ -260,27 +254,22 @@ \subsubsection{Sample field format} ##META= ##SAMPLE= ##SAMPLE= -\end{verbatim} -} +\end{verbatim}} \subsubsection{Pedigree field format} It is possible to record relationships between genomes using the following syntax: -{\color{red} \begin{verbatim} ##PEDIGREE= ##PEDIGREE= ##PEDIGREE= ##PEDIGREE= \end{verbatim} -} \noindent or a link to a database: -{\color{red} \begin{verbatim} ##pedigreeDB=URL \end{verbatim} \noindent See \ref{PedigreeInDetail} for details. -} \subsection{Header line syntax} @@ -298,18 +287,18 @@ \subsection{Header line syntax} \end{enumerate} If genotype data is present in the file, these are followed by a FORMAT column header, then an arbitrary number of sample IDs. The header line is tab-delimited -{\color{red} and there must be no tab characters at the end of the line}. +and there must be no tab characters at the end of the line. \subsection{Data lines} \subsubsection{Fixed fields} There are 8 fixed fields per record. All data lines are tab-delimited -{\color{red} with no tab character at the end of the line. The last data line should end with a line separator.} In all cases, +with no tab character at the end of the line. The last data line should end with a line separator. In all cases, missing values are specified with a dot (`.'). Fixed fields are: \begin{enumerate} \item CHROM - chromosome: An identifier from the reference genome or an angle-bracketed ID String (``$<$ID$>$'') pointing to a contig in the assembly file (cf. the \#\#assembly line in the header). All entries for a specific CHROM should form a contiguous block within the VCF file. The colon symbol (:) must be absent from all chromosome names to avoid parsing errors when dealing with breakends. (String, no white-space permitted, Required). \item POS - position: The reference position, with the 1st base having position 1. Positions are sorted numerically, in increasing order, within each reference sequence CHROM. It is permitted to have multiple records with the same POS. Telomeres are indicated by using positions 0 or N+1, where N is the length of the corresponding chromosome or contig. (Integer, Required) - \item ID - identifier: Semi-colon separated list of unique identifiers where available. If this is a dbSNP variant it is encouraged to use the rs number(s). No identifier should be present in more than one data record. If there is no identifier available, then the missing value should be used. (String, no white-space or semi-colons permitted, {\color{red}duplicate values not allowed.}) + \item ID - identifier: Semi-colon separated list of unique identifiers where available. If this is a dbSNP variant it is encouraged to use the rs number(s). No identifier should be present in more than one data record. If there is no identifier available, then the missing value should be used. (String, no white-space or semi-colons permitted, duplicate values not allowed.) \item REF - reference base(s): Each base must be one of A,C,G,T,N (case insensitive). Multiple bases are permitted. The value in the POS field refers to the position of the first base in the String. For simple insertions and @@ -325,28 +314,26 @@ \subsubsection{Fixed fields} polymorphism. Tools processing VCF files are not required to preserve case in the allele Strings. (String, Required). - {\color{red} If the reference sequence contains IUPAC ambiguity codes not allowed by this specification (such as R = A/G), the ambiguous reference base must be reduced to a concrete base by using the one that is first alphabetically (thus R as a reference base is converted to A in VCF.) - } - \item ALT - alternate base(s): Comma separated list of alternate non-reference alleles called on at least one of the samples. Options are base Strings made up of the bases A,C,G,T,N,*, (case insensitive) or an angle-bracketed ID String (``$<$ID$>$'') or a breakend replacement string as described in the section on breakends. The `*' allele is reserved to indicate that the allele is {\color{red}missing due to a an overlapping deletion}. If there are no alternative alleles, then the missing value should be used. Tools processing VCF files are not required to preserve case in the allele String, except for IDs, which are case sensitive. (String; no whitespace, commas, or angle-brackets are permitted in the ID String itself) + \item ALT - alternate base(s): Comma separated list of alternate non-reference alleles called on at least one of the samples. Options are base Strings made up of the bases A,C,G,T,N,*, (case insensitive) or an angle-bracketed ID String (``$<$ID$>$'') or a breakend replacement string as described in the section on breakends. The `*' allele is reserved to indicate that the allele is missing due to a an overlapping deletion. If there are no alternative alleles, then the missing value should be used. Tools processing VCF files are not required to preserve case in the allele String, except for IDs, which are case sensitive. (String; no whitespace, commas, or angle-brackets are permitted in the ID String itself) \item QUAL - quality: Phred-scaled quality score for the assertion made in ALT. i.e. $-10log_{10}$ prob(call in ALT is wrong). If ALT is `.' (no variant) then this is $-10log_{10}$ prob(variant), and if ALT is not `.' this is $-10log_{10}$ prob(no variant). If unknown, the missing value should be specified. (Float) - \item FILTER - filter status: PASS if this position has passed all filters, i.e. a call is made at this position. Otherwise, if the site has not passed all filters, a semicolon-separated list of codes for filters that fail. e.g. ``q10;s50'' might indicate that at this site the quality is below 10 and the number of samples with data is below 50\% of the total number of samples. `0' is reserved and should not be used as a filter String. If filters have not been applied, then this field should be set to the missing value. (String, no white-space or semi-colons permitted, {\color{red}duplicate values not allowed.}) + \item FILTER - filter status: PASS if this position has passed all filters, i.e. a call is made at this position. Otherwise, if the site has not passed all filters, a semicolon-separated list of codes for filters that fail. e.g. ``q10;s50'' might indicate that at this site the quality is below 10 and the number of samples with data is below 50\% of the total number of samples. `0' is reserved and should not be used as a filter String. If filters have not been applied, then this field should be set to the missing value. (String, no white-space or semi-colons permitted, duplicate values not allowed.) \item INFO - additional information: (String, no semi-colons or equals-signs permitted; commas are permitted only as delimiters for lists of values; characters with special meaning can be encoded using the percent encoding, see Section~\ref{character-encoding}; space characters are allowed) INFO fields are encoded as a semicolon-separated series of short keys with optional values in the format: $<$key$>$=$<$data$>$[,data]. - {\color{red} INFO keys must match the regular expression \texttt{\^{}[A-Za-z\_][0-9A-Za-z\_.]*\$}, duplicate fields are not allowed.} + INFO keys must match the regular expression \texttt{\^{}[A-Za-z\_][0-9A-Za-z\_.]*\$}, duplicate fields are not allowed. Arbitrary keys are permitted, although the following sub-fields are reserved (albeit optional): \begin{itemize} \item AA : ancestral allele \item AC : allele count in genotypes, for each ALT allele, in the same order as listed - \item {\color{red}AD, ADF, ADR: read depths for each allele; total (AD), on the forward (ADF) and the reverse (ADR) strand (Integer, Number=R)} + \item AD, ADF, ADR: read depths for each allele; total (AD), on the forward (ADF) and the reverse (ADR) strand (Integer, Number=R) \item AF : allele frequency for each ALT allele in the same order as listed: use this when estimated from primary data, not called genotypes \item AN : total number of alleles in called genotypes \item BQ : RMS base quality at this position @@ -371,7 +358,7 @@ \subsubsection{Fixed fields} \subsubsection{Genotype fields} If genotype information is present, then the same types of data must be present for all samples. First a FORMAT field is given specifying the data types and -order ({\color{red} colon-separated FORMAT ids matching the regular expression \texttt{\^{}[A-Za-z\_][0-9A-Za-z\_.]*\$}, duplicate fields are not allowed}). This is followed by one data block per +order (colon-separated FORMAT ids matching the regular expression \texttt{\^{}[A-Za-z\_][0-9A-Za-z\_.]*\$}, duplicate fields are not allowed). This is followed by one data block per sample, with the colon-separated data corresponding to the types specified in the format. The first sub-field must always be the genotype (GT) if it is present. There are no required sub-fields. @@ -380,12 +367,12 @@ \subsubsection{Genotype fields} \begin{itemize} \renewcommand{\labelitemii}{$\circ$} - \item {\color{red}AD, ADF, ADR: per-sample read depths for each allele; total (AD), on the forward (ADF) and the reverse (ADR) strand (Integer, Number=R)} + \item AD, ADF, ADR: per-sample read depths for each allele; total (AD), on the forward (ADF) and the reverse (ADR) strand (Integer, Number=R) \item DP : read depth at this position for this sample (Integer) \item EC : comma separated list of expected alternate allele counts for each alternate allele in the same order as listed in the ALT field (typically used in association analyses) (Integers) \item FT : sample genotype filter indicating if this genotype was ``called'' (similar in concept to the FILTER field). Again, use PASS to indicate that all filters have been passed, a semi-colon separated list of codes for filters that fail, or `.' to indicate that filters have not been applied. These values should be described in the meta-information in the same way as FILTERs (String, no white-space or semi-colons permitted) \item GQ : conditional genotype quality, encoded as a phred quality $-10log_{10}$ p(genotype call is wrong, conditioned on the site's being variant) (Integer) - \item GP : {\color{red} genotype posterior probabilities in the range 0 to 1 using the same ordering as the GL field; one use can be to store imputed genotype probabilities (Float)} + \item GP : genotype posterior probabilities in the range 0 to 1 using the same ordering as the GL field; one use can be to store imputed genotype probabilities (Float) \item GT : genotype, encoded as allele values separated by either of $/$ or $\mid$. The allele values are 0 for the reference allele (what is in the REF field), 1 for the first allele listed in ALT, 2 for the second allele list in ALT and so on. For diploid calls examples could be $0/1$, $1\mid0$, or $1/2$, etc. For haploid calls, e.g. on Y, male non-pseudoautosomal X, or mitochondrion, only one allele value should be given; a triploid call might look like $0/0/1$. If a call cannot be made for a sample at a given locus, `.' should be specified for each missing allele in the GT field (for example `$./.$' for a diploid genotype and `.' for haploid genotype). The meanings of the separators are as follows (see the PS field below for more details on incorporating phasing information into the genotypes): \begin{itemize} \item $/$ : genotype unphased @@ -394,13 +381,12 @@ \subsubsection{Genotype fields} \item GL : genotype likelihoods comprised of comma separated floating point $log_{10}$-scaled likelihoods for all possible genotypes given the set of - alleles defined in the REF and ALT fields. {\color{red} In presence of the GT field the + alleles defined in the REF and ALT fields. In presence of the GT field the same ploidy is expected; without GT field, diploidy is assumed. \textsc{Genotype Ordering.} In general case of ploidy P and N alternate alleles (0 is the REF and 1..N the alternate alleles), the ordering of genotypes for the likelihoods can - be expressed by the following pseudocode with as many nested loops as ploidy:\footnote{\color{red}Note - that we use inclusive \texttt{for} loop boundaries.} + be expressed by the following pseudocode with as many nested loops as ploidy:\footnote{Note that we use inclusive \texttt{for} loop boundaries.} \begingroup \small \begin{lstlisting} @@ -450,7 +436,6 @@ \subsubsection{Genotype fields} \end{tabular} } \end{itemize} - } \item HQ : haplotype qualities, two comma separated phred qualities (Integers) \item MQ : RMS mapping quality, similar to the version in the INFO field. (Integer) @@ -474,7 +459,6 @@ \section{Understanding the VCF format and the haplotype representation} \end{itemize} VCF records use a simple haplotype representation for REF and ALT alleles to describe variant haplotypes at a locus. ALT haplotypes are constructed from the REF haplotype by taking the REF allele bases at the POS in the reference genotype and replacing them with the ALT bases. In essence, the VCF record specifies a-REF-t and the alternative haplotypes are a-ALT-t for each alternative allele. -{\color{red} \subsection{VCF tag naming conventions} \begin{itemize} \item The "L" suffix means "likelihood" as log-likelihood in the sampling @@ -491,7 +475,6 @@ \subsection{VCF tag naming conventions} site-level QUAL field follows the same convention (represented as a phred-scaled number). \end{itemize} -} \section{INFO keys used for structural variants} @@ -566,9 +549,9 @@ \section{FORMAT keys used for structural variants} the integer copy number of the variant in this sample. CNQ is encoded as a phred quality $-10log_{10}$ p(copy number genotype call is wrong). CNL specifies a list of $log_{10}$ likelihoods for each potential copy number, -starting from zero. {\color{red} CNP is 0 to 1-scaled copy number posterior +starting from zero. CNP is 0 to 1-scaled copy number posterior probabilities (and otherwise defined precisely as the CNL field), intended to -store imputed genotype probabilities.} When possible, GT/GQ/GL/GP should be used +store imputed genotype probabilities. When possible, GT/GQ/GL/GP should be used instead of (or in addition to) these keys. \section{Representing variation in VCF records} @@ -1142,19 +1125,15 @@ \subsubsection{Clonal derivation relationships} The general format of a PEDIGREE line describing asexual, clonal derivation is: -{\color{red} \begin{verbatim} PEDIGREE= \end{verbatim} -} This line asserts that the DNA in genome is asexually or clonally derived with mutations from the DNA in genome. This is the asexual analog of the VCF format that has been proposed for family relationships between genomes, i.e. there is one entry per of the form: -{\color{red} \begin{verbatim} PEDIGREE= \end{verbatim} -} Let's consider a cancer patient VCF file with 4 genomes: germline, primary\_tumor, secondary\_tumor1, and secondary\_tumor2 as illustrated in Figure 10. The primary\_tumor is derived from the germline and the secondary tumors are each derived independently from the primary tumor, in all cases by clonal derivation with mutations. The PEDIGREE lines would look like: @@ -1164,13 +1143,11 @@ \subsubsection{Clonal derivation relationships} \caption{Pegigree example} \end{figure} -{\color{red} \begin{verbatim} ##PEDIGREE= ##PEDIGREE= ##PEDIGREE= \end{verbatim} -} Alternately, if data on the genomes is compiled in a database, a simple pointer can be provided: @@ -1180,11 +1157,9 @@ \subsubsection{Clonal derivation relationships} The most general form of a pedigree line is: -{\color{red} \begin{verbatim} ##PEDIGREE= \end{verbatim} -} This means that the genome SampleID is derived from the N $\ge$ 1 genomes Ancestor1, ..., AncestorN. Based on these derivation relationships two new pieces of information can be specified. @@ -1235,7 +1210,6 @@ \subsubsection{Phasing adjacencies in an aneuploid context} Haplotype quality & 30 & 40 & 40 \\ \end{tabular} -{\color{red} \subsection{Representing unspecified alleles and REF-only blocks (gVCF)} In order to report sequencing data evidence for both variant and non-variant positions in the genome, the VCF specification allows to represent blocks of reference-only calls in a single record @@ -1260,7 +1234,6 @@ \subsection{Representing unspecified alleles and REF-only blocks (gVCF)} \end{tabular} \end{flushleft} \normalsize -} \pagebreak \section{BCF specification} @@ -1332,7 +1305,7 @@ \subsubsection{Dictionary of strings} 20 10228 rs143255646 TA T . PASS ASP;RSPOS=10229;dbSNPBuildID=134 \end{verbatim} \normalsize -would be encoded inline in BCF2 by reference to the relative position of the header line in the header (ASP=1, RSPOS=2, dbSNPBuildID=3, and PASS implicitly encoded in the {\color{red}first offset PASS=0}). +would be encoded inline in BCF2 by reference to the relative position of the header line in the header (ASP=1, RSPOS=2, dbSNPBuildID=3, and PASS implicitly encoded in the first offset PASS=0). \small \begin{verbatim} @@ -1346,7 +1319,6 @@ \subsubsection{Dictionary of strings} \end{verbatim} \normalsize -{\color{red} Defined this way, the dictionary of strings depends on the order and the presence of all preceding header lines. If an existing tag needs to be removed from a BCF, also all consequent tags throughout the whole BCF would have to be @@ -1356,7 +1328,6 @@ \subsubsection{Dictionary of strings} present in one record, it must be present also in all other dictionary-defining records. The IDX tag is not necessary in newly created BCF files, but if present, the numbering must match the implicit dictionary of tags. -} Note that the dictionary encoding has the magic prefix `s' here to indicate that the field's value is actually in the dictionary entry giving by the subsequent offset. This representation isn't actually the one used in BCF2 records but it provides a clean visual guide for the above example. Note also how the contig has been recoded as a offset into the list of contig declarations. @@ -1489,12 +1460,11 @@ \subsubsection{Type encoding} \textbf{Integers} may be encoded as 8, 16, or 32 bit values, in little-endian order. It is up to the encoder to determine the appropriate ranged value to use when writing the BCF2 file. -{\color{red} For integer types, the values 0x80, 0x8000, 0x80000000 are interpreted as missing values and 0x81, 0x8001, 0x80000001 as end-of-vector indicators (for 8, 16, and 32 bit values, respectively). Note that the end-of-vector byte is not part of the vector itself and only end-of-vector bytes can follow. -In total, eight values are reserved for future use: 0x80-0x87, 0x8000-0x8007, 0x80000000-0x80000007. } +In total, eight values are reserved for future use: 0x80-0x87, 0x8000-0x8007, 0x80000000-0x80000007. \vspace{0.3cm} @@ -1520,12 +1490,10 @@ \subsubsection{Type encoding} \noindent A good way to understand these values is to play around with the IEEE encoder webiste. \vspace{0.3cm} -{\color{red} \noindent Similarly to integers, the float value of 0x7F800001 is interpreted as a missing value and 0x7F800002 as the end-of-vector indicator. Note that the end-of-vector byte is not part of the vector itself and only end-of-vector bytes can follow. In total, eight values are reservd for future use: -} \vspace{0.1cm} \begin{tabular}{| l | c | l |} \hline @@ -1539,7 +1507,7 @@ \subsubsection{Type encoding} \end{tabular} \vspace{0.3cm} -\textbf{Character} values are not explicitly typed in BCF2. Instead, VCF Character values should be encoded by a single character string. {\color{red} See also \ref{character-encoding}.} +\textbf{Character} values are not explicitly typed in BCF2. Instead, VCF Character values should be encoded by a single character string. See also \ref{character-encoding}. \vspace{0.3cm} \textbf{Flags} values -- which can only appear in INFO fields -- in BCF2 should be encoded by any non-MISSING value. The recommended best practice is to encode the value as an 1-element INT8 (type 0x11) with value of 1 to indicate present. Because FLAG values can only be encoded in INFO fields, BCF2 provides no mechanism to encode FLAG values in genotypes, but could be easily extended to do so if allowed in a future VCF version. @@ -1632,11 +1600,9 @@ \subsubsection{Type encoding} maximum ploidy among all samples at a site. For one individual, each integer in the vector is organized as $(allele+1) << 1 \mid phased$ where allele is set to -1 if the allele in GT is a dot `.' (thus the higher bits are all 0). -{\color{red} The vector is padded with the end-of-vector values if the GT having fewer ploidy. We note specifically that except for the end-of-vector byte, no other negative values are allowed in the GT array. -} Examples: @@ -1732,7 +1698,7 @@ \subsubsection{Encoding REF/ALT fields} \subsubsection{Encoding FILTER} -``PASS'' is implicitly encoded as the {\color{red}first} entry in the header dictionary (see dictionary of strings). Here we encode the PASS FILTER field as a vector of size 1 of type 8-bit, which has type byte is 0x11. The value is the offset 0: +``PASS'' is implicitly encoded as the first entry in the header dictionary (see dictionary of strings). Here we encode the PASS FILTER field as a vector of size 1 of type 8-bit, which has type byte is 0x11. The value is the offset 0: \vspace{0.3cm} \begin{tabular}{|l| l|} \hline @@ -1865,7 +1831,6 @@ \subsection{BCF2 block gzip and indexing} BCF2 files are expected to be indexed through the same index scheme, section~4 as BAM files and other block-compressed files with BGZF. -{\color{red} \section{List of changes} \subsection{Changes between VCFv4.2 and VCFv4.3} @@ -1902,6 +1867,5 @@ \subsection{Changes between BCFv2.1 and BCFv2.2} \item String vectors in BCF do not need to start with comma, as the number of values is indicated already in the definition of the tag in the header. \item The implicit filter PASS was described inconsistently throughout BCFv2.1: It is encoded as the first entry in the dictionary, not the last. \end{itemize} -} \end{document} From 861e23543f3bfd501f800942f8f1d0dbd2958379 Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Sat, 10 Oct 2015 13:49:58 +0200 Subject: [PATCH 32/34] Modified example, coordinates must be ordered --- VCFv4.3.tex | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/VCFv4.3.tex b/VCFv4.3.tex index 9073f3d53..3878c67b0 100644 --- a/VCFv4.3.tex +++ b/VCFv4.3.tex @@ -572,16 +572,19 @@ \subsubsection{Example 1} Representing these as VCF records would be done as follows: \begin{enumerate} - \item A SNP polymorphism of C/G $\rightarrow \{C,G\} \rightarrow$ C is the reference allele - \item A single base deletion of C $\rightarrow \{tC,t\} \rightarrow$ tC is the reference allele - \item A single base insertion of A $\rightarrow \{tC,tCA\} \rightarrow$ tC is the reference allele + \item A SNP polymorphism of C/G at position 3 where C is the reference base and G is the alternate base becomes REF=C, ALT=G + \item A single base deletion of C at position 3 becomes REF=TC, ALT=T + \item A single base insertion of A after position 3 becomes REF=C, ALT=CA \end{enumerate} +Note that the positions must be sorted in increasing order: + +\vspace{0.5em} \begin{tabular}{ l l l l l l l l} \#CHROM & POS & ID & REF & ALT & QUAL & FILTER & INFO \\ - $20$ & $3$ & . & C & G & . & PASS & DP=100 \\ $20$ & $2$ & . & TC & T & . & PASS & DP=100 \\ - $20$ & $2$ & . & TC & TCA & . & PASS & DP=100 \\ + $20$ & $3$ & . & C & G & . & PASS & DP=100 \\ + $20$ & $3$ & . & C & CA & . & PASS & DP=100 \\ \end{tabular} \subsubsection{Example 2} From ad14194e353b563a44bb7a9366c2757827b6d8c3 Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Sat, 10 Oct 2015 14:22:01 +0200 Subject: [PATCH 33/34] Move sentences which apply to all data lines from 1.6.1 to 1.6 --- VCFv4.3.tex | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/VCFv4.3.tex b/VCFv4.3.tex index 3878c67b0..32d322456 100644 --- a/VCFv4.3.tex +++ b/VCFv4.3.tex @@ -290,10 +290,12 @@ \subsection{Header line syntax} and there must be no tab characters at the end of the line. \subsection{Data lines} -\subsubsection{Fixed fields} -There are 8 fixed fields per record. All data lines are tab-delimited +All data lines are tab-delimited with no tab character at the end of the line. The last data line should end with a line separator. In all cases, -missing values are specified with a dot (`.'). Fixed fields are: +missing values are specified with a dot (`.'). + +\subsubsection{Fixed fields} +There are 8 fixed fields per record. Fixed fields are: \begin{enumerate} \item CHROM - chromosome: An identifier from the reference genome or an angle-bracketed ID String (``$<$ID$>$'') pointing to a contig in the assembly file (cf. the \#\#assembly line in the header). All entries for a specific CHROM should form a contiguous block within the VCF file. The colon symbol (:) must be absent from all chromosome names to avoid parsing errors when dealing with breakends. (String, no white-space permitted, Required). From fe0222c6e9628a31087597f772e9a5b8b6c0c6f4 Mon Sep 17 00:00:00 2001 From: Petr Danecek Date: Sat, 10 Oct 2015 14:25:34 +0200 Subject: [PATCH 34/34] Stronger wording about disallowed INT32 values --- VCFv4.3.tex | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/VCFv4.3.tex b/VCFv4.3.tex index 32d322456..2d258f96e 100644 --- a/VCFv4.3.tex +++ b/VCFv4.3.tex @@ -103,7 +103,8 @@ \subsection{Character encoding, non-printable characters and characters with spe \subsection{Data types} Data types supported by VCF are: Integer (32-bit, signed), Float (32-bit, formatted to match the regular expression \texttt{\^{}[-+]?[0-9]*\textbackslash.?[0-9]+([eE][-+]?[0-9]+)?\$}, \texttt{NaN}, or \texttt{+/-Inf}), Flag, Character, and -String. For the Integer type, the values from $-2^{31}$ to $-2^{31}+7$ cannot be stored in the binary version, see \ref{BcfTypeEncoding}. +String. For the Integer type, the values from $-2^{31}$ to $-2^{31}+7$ cannot be stored in the binary version and therefore +are disallowed in both VCF and BCF, see \ref{BcfTypeEncoding}. \subsection{Meta-information lines}