-
Notifications
You must be signed in to change notification settings - Fork 243
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
VCFHeader and VCFHeaderLine rewrite/refactoring #1581
base: master
Are you sure you want to change the base?
Changes from all commits
ce7cdcf
4d08d5d
210adb2
f3b9001
2fe930c
88bdf78
d93546f
f9a0c08
29c854f
5700958
5e09eb3
860cab6
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -53,6 +53,13 @@ public SAMSequenceDictionary(final List<SAMSequenceRecord> list) { | |
setSequences(list); | ||
} | ||
|
||
//TODO: this returns sequences in the internal list order instead of | ||
// honoring each sequence's contigIndex | ||
/** | ||
* Get a list of sequences for this dictionary. | ||
* @return the list of sequences for this dictionary in internal order (the order in which the sequences | ||
* were added to this dictionary) | ||
*/ | ||
public List<SAMSequenceRecord> getSequences() { | ||
return Collections.unmodifiableList(mSequences); | ||
} | ||
|
@@ -75,6 +82,14 @@ public void setSequences(final List<SAMSequenceRecord> list) { | |
list.forEach(this::addSequence); | ||
} | ||
|
||
/** | ||
* Add a sequence to the dictionary. | ||
* @param sequenceRecord the sequence record to add - note that this method mutates the contig | ||
* index of the sequenceRecord to match the newly added record's relative | ||
* order in the list | ||
*/ | ||
//TODO: this method ignores (and actually mutates) the sequenceRecord's contig index to make it match | ||
// the record's relative placement in the dictionary's internal list | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Wow, that's a nasty side effect that we've been living with forever. |
||
public void addSequence(final SAMSequenceRecord sequenceRecord) { | ||
if (mSequenceMap.containsKey(sequenceRecord.getSequenceName())) { | ||
throw new IllegalArgumentException("Cannot add sequence that already exists in SAMSequenceDictionary: " + | ||
|
Large diffs are not rendered by default.
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -27,7 +27,11 @@ | |
|
||
import htsjdk.samtools.util.FileExtensions; | ||
import htsjdk.tribble.TribbleException; | ||
import htsjdk.variant.vcf.*; | ||
import htsjdk.variant.vcf.VCFConstants; | ||
import htsjdk.variant.vcf.VCFHeader; | ||
import htsjdk.variant.vcf.VCFHeaderLine; | ||
import htsjdk.variant.vcf.VCFIDHeaderLine; | ||
import htsjdk.variant.vcf.VCFSimpleHeaderLine; | ||
|
||
import java.io.File; | ||
import java.io.FileNotFoundException; | ||
|
@@ -93,10 +97,15 @@ public static ArrayList<String> makeDictionary(final VCFHeader header) { | |
// set up the strings dictionary | ||
for ( VCFHeaderLine line : header.getMetaDataInInputOrder() ) { | ||
if ( line.shouldBeAddedToDictionary() ) { | ||
final VCFIDHeaderLine idLine = (VCFIDHeaderLine)line; | ||
if ( ! seen.contains(idLine.getID())) { | ||
dict.add(idLine.getID()); | ||
seen.add(idLine.getID()); | ||
if (!line.isIDHeaderLine()) { | ||
//is there a better way to ensure that shouldBeAddedToDictionary==true only when isIDHeaderLine==true | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm wondering whether There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes! There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We should do that in the BCF branch. |
||
throw new TribbleException(String.format( | ||
"The header line %s cannot be added to the BCF dictionary since its not an ID header line", | ||
line)); | ||
} | ||
if ( ! seen.contains(line.getID())) { | ||
dict.add(line.getID()); | ||
seen.add(line.getID()); | ||
} | ||
} | ||
} | ||
|
@@ -291,7 +300,7 @@ else if ( o.getClass().isArray() ) { | |
* Are the elements and their order in the output and input headers consistent so that | ||
* we can write out the raw genotypes block without decoding and recoding it? | ||
* | ||
* If the order of INFO, FILTER, or contrig elements in the output header is different than | ||
* If the order of INFO, FILTER, or contig elements in the output header is different than | ||
* in the input header we must decode the blocks using the input header and then recode them | ||
* based on the new output order. | ||
* | ||
|
@@ -308,15 +317,15 @@ public static boolean headerLinesAreOrderedConsistently(final VCFHeader outputHe | |
if ( ! nullAsEmpty(outputHeader.getSampleNamesInOrder()).equals(nullAsEmpty(genotypesBlockHeader.getSampleNamesInOrder())) ) | ||
return false; | ||
|
||
final Iterator<? extends VCFIDHeaderLine> outputLinesIt = outputHeader.getIDHeaderLines().iterator(); | ||
final Iterator<? extends VCFIDHeaderLine> inputLinesIt = genotypesBlockHeader.getIDHeaderLines().iterator(); | ||
final Iterator<VCFSimpleHeaderLine> outputLinesIt = outputHeader.getIDHeaderLines().iterator(); | ||
final Iterator<VCFSimpleHeaderLine> inputLinesIt = genotypesBlockHeader.getIDHeaderLines().iterator(); | ||
|
||
while ( inputLinesIt.hasNext() ) { | ||
if ( ! outputLinesIt.hasNext() ) // missing lines in output | ||
return false; | ||
|
||
final VCFIDHeaderLine outputLine = outputLinesIt.next(); | ||
final VCFIDHeaderLine inputLine = inputLinesIt.next(); | ||
final VCFSimpleHeaderLine outputLine = outputLinesIt.next(); | ||
final VCFSimpleHeaderLine inputLine = inputLinesIt.next(); | ||
|
||
if ( ! inputLine.getClass().equals(outputLine.getClass()) || ! inputLine.getID().equals(outputLine.getID()) ) | ||
return false; | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -27,15 +27,19 @@ | |
|
||
import htsjdk.samtools.SAMSequenceDictionary; | ||
import htsjdk.samtools.util.IOUtil; | ||
import htsjdk.samtools.util.Log; | ||
import htsjdk.samtools.util.RuntimeIOException; | ||
import htsjdk.tribble.TribbleException; | ||
import htsjdk.tribble.index.IndexCreator; | ||
import htsjdk.utils.ValidationUtils; | ||
import htsjdk.variant.variantcontext.VariantContext; | ||
import htsjdk.variant.variantcontext.VariantContextBuilder; | ||
import htsjdk.variant.vcf.VCFConstants; | ||
import htsjdk.variant.vcf.VCFEncoder; | ||
import htsjdk.variant.vcf.VCFHeader; | ||
import htsjdk.variant.vcf.VCFHeaderLine; | ||
import htsjdk.variant.vcf.VCFHeaderVersion; | ||
import htsjdk.variant.vcf.VCFUtils; | ||
|
||
import java.io.BufferedWriter; | ||
import java.io.ByteArrayOutputStream; | ||
|
@@ -45,14 +49,15 @@ | |
import java.io.OutputStreamWriter; | ||
import java.io.Writer; | ||
import java.nio.file.Path; | ||
import java.util.stream.Collectors; | ||
|
||
/** | ||
* this class writes VCF files | ||
*/ | ||
class VCFWriter extends IndexingVariantContextWriter { | ||
protected final static Log logger = Log.getInstance(VCFWriter.class); | ||
|
||
private static final String VERSION_LINE = | ||
VCFHeader.METADATA_INDICATOR + VCFHeaderVersion.VCF4_2.getFormatString() + "=" + VCFHeaderVersion.VCF4_2.getVersionString(); | ||
private static final String DEFAULT_VERSION_LINE = VCFHeader.DEFAULT_VCF_VERSION.toHeaderVersionLine(); | ||
|
||
// Initialized when the header is written to the output stream | ||
private VCFEncoder vcfEncoder = null; | ||
|
@@ -164,7 +169,7 @@ public void writeHeader(final VCFHeader header) { | |
} | ||
|
||
public static String getVersionLine() { | ||
return VERSION_LINE; | ||
return DEFAULT_VERSION_LINE; | ||
} | ||
|
||
public static VCFHeader writeHeader(VCFHeader header, | ||
|
@@ -175,12 +180,18 @@ public static VCFHeader writeHeader(VCFHeader header, | |
try { | ||
rejectVCFV43Headers(header); | ||
|
||
// the file format field needs to be written first | ||
// Validate that the file version we're writing is version-compatible with this header's version. | ||
validateHeaderVersion(header, versionLine); | ||
|
||
// The file format field needs to be written first; below, any file format lines | ||
// embedded in the header will be removed | ||
writer.write(versionLine + "\n"); | ||
|
||
for (final VCFHeaderLine line : header.getMetaDataInSortedOrder() ) { | ||
if ( VCFHeaderVersion.isFormatString(line.getKey()) ) | ||
// Remove the fileformat header lines | ||
if ( VCFHeaderVersion.isFormatString(line.getKey()) ) { | ||
continue; | ||
} | ||
|
||
writer.write(VCFHeader.METADATA_INDICATOR); | ||
writer.write(line.toString()); | ||
|
@@ -189,14 +200,9 @@ public static VCFHeader writeHeader(VCFHeader header, | |
|
||
// write out the column line | ||
writer.write(VCFHeader.HEADER_INDICATOR); | ||
boolean isFirst = true; | ||
for (final VCFHeader.HEADER_FIELDS field : header.getHeaderFields() ) { | ||
if ( isFirst ) | ||
isFirst = false; // don't write out a field separator | ||
else | ||
writer.write(VCFConstants.FIELD_SEPARATOR); | ||
writer.write(field.toString()); | ||
} | ||
writer.write(header.getHeaderFields().stream() | ||
.map(f -> f.name()) | ||
.collect(Collectors.joining(VCFConstants.FIELD_SEPARATOR)).toString()); | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think this `toString()` call is unnecessary? |
||
|
||
if ( header.hasGenotypingData() ) { | ||
writer.write(VCFConstants.FIELD_SEPARATOR); | ||
|
@@ -274,6 +280,28 @@ private static void rejectVCFV43Headers(final VCFHeader targetHeader) { | |
if (targetHeader.getVCFHeaderVersion() != null && targetHeader.getVCFHeaderVersion().isAtLeastAsRecentAs(VCFHeaderVersion.VCF4_3)) { | ||
throw new IllegalArgumentException(String.format("Writing VCF version %s is not implemented", targetHeader.getVCFHeaderVersion())); | ||
} | ||
} | ||
|
||
// Given a header and a requested target output version, see if the header's version is compatible with the | ||
// requested version (where compatible means it's OK to just declare that the header has the requested | ||
// version). | ||
private static void validateHeaderVersion(final VCFHeader header, final String requestedVersionLine) { | ||
ValidationUtils.nonNull(header); | ||
ValidationUtils.nonNull(requestedVersionLine); | ||
|
||
final VCFHeaderVersion vcfCurrentVersion = header.getVCFHeaderVersion(); | ||
final VCFHeaderVersion vcfRequestedVersion = VCFHeaderVersion.fromHeaderVersionLine(requestedVersionLine); | ||
if (!vcfCurrentVersion.equals(vcfRequestedVersion)) { | ||
if (!VCFHeaderVersion.versionsAreCompatible(VCFHeaderVersion.fromHeaderVersionLine(requestedVersionLine), vcfCurrentVersion)) { | ||
final String message = String.format("Attempting to write a %s VCF header to a %s VCFWriter", | ||
vcfRequestedVersion, | ||
vcfCurrentVersion.getVersionString()); | ||
if (VCFUtils.isStrictVCFVersionValidation()) { | ||
throw new TribbleException(message); | ||
} | ||
logger.warn(message); | ||
} | ||
} | ||
} | ||
|
||
} |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I would make the property name also "strict_vcf_version_validation".