Skip to content

Commit

Permalink
Added a UCSC version number to GencodeGtfFeature.
Browse files Browse the repository at this point in the history
  • Loading branch information
jonn-smith committed Sep 12, 2017
1 parent 33c2855 commit 28037e4
Show file tree
Hide file tree
Showing 3 changed files with 290 additions and 16 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,39 @@ final public class GencodeGtfCodec extends AbstractFeatureCodec<GencodeGtfFeatur

// ============================================================================================================

/**
 * Translates a gencode release number into the name of the corresponding UCSC genome build.
 * The version equivalences were taken from:
 *
 *   https://genome.ucsc.edu/FAQ/FAQreleases.html
 *   https://www.gencodegenes.org/releases/
 *
 * NOTE(review): releases 20 through 24 are mapped to "hg19" below. Please confirm against the
 * gencode release list linked above which genome assembly those releases actually target.
 *
 * @param gencodeVersion The gencode version to convert to UCSC version.
 * @return The UCSC version in a {@link String} corresponding to the given gencode version.
 * @throws GATKException if {@code gencodeVersion} lies outside
 *         [{@code GENCODE_GTF_MIN_VERSION_NUM_INCLUSIVE}, {@code GENCODE_GTF_MAX_VERSION_NUM_INCLUSIVE}]
 *         or has no mapping in the table below.
 */
private static String getUcscVersionFromGencodeVersion(final int gencodeVersion) {
    final String outOfRangeMessage = "Gencode version is out of expected range. Cannot decode: " + gencodeVersion;

    // Reject anything outside of the supported window before consulting the table.
    final boolean isWithinSupportedRange =
            (gencodeVersion >= GENCODE_GTF_MIN_VERSION_NUM_INCLUSIVE) &&
            (gencodeVersion <= GENCODE_GTF_MAX_VERSION_NUM_INCLUSIVE);
    if (!isWithinSupportedRange) {
        throw new GATKException(outOfRangeMessage);
    }

    switch (gencodeVersion) {
        case 19:
        case 20:
        case 21:
        case 22:
        case 23:
        case 24:
            return "hg19";

        case 25:
        case 26:
            return "hg38";

        default:
            // Only reachable if the supported window is widened without extending this table.
            throw new GATKException(outOfRangeMessage);
    }
}

// ============================================================================================================

/**
 * Creates a new codec, passing {@code GencodeGtfFeature.class} to the superclass constructor.
 */
public GencodeGtfCodec() {
super(GencodeGtfFeature.class);
}
Expand Down Expand Up @@ -156,7 +189,7 @@ public GencodeGtfFeature decode(final LineIterator lineIterator) {
// Split the line into different GTF Fields
// Note that we're using -1 as the limit so that empty tokens will still be counted
// (as opposed to discarded).
String[] splitLine = line.split(GTF_FIELD_DELIMITER, -1);
final String[] splitLine = line.split(GTF_FIELD_DELIMITER, -1);

// Ensure the file is at least trivially well-formed:
if (splitLine.length != NUM_COLUMNS) {
Expand All @@ -168,11 +201,14 @@ public GencodeGtfFeature decode(final LineIterator lineIterator) {
final GencodeGtfFeature.FeatureType featureType = GencodeGtfFeature.FeatureType.getEnum( splitLine[FEATURE_TYPE_FIELD_INDEX] );

// Create a baseline feature to add into our data:
GencodeGtfFeature feature = GencodeGtfFeature.create(splitLine);
final GencodeGtfFeature feature = GencodeGtfFeature.create(splitLine);

// Make sure we keep track of the line number for if and when we need to write the file back out:
feature.setFeatureOrderNumber(currentLineNum);

// Set our UCSC version number:
feature.setUcscGenomeVersion(getUcscVersionFromGencodeVersion(versionNumber));

// Once we see another gene we take all accumulated records and combine them into the
// current GencodeGtfFeature.
// We then break out of the loop and return the last full gene object.
Expand Down Expand Up @@ -256,7 +292,7 @@ else if ((transcript != null) && (featureType == GencodeGtfFeature.FeatureType.T
logger.error("Gene Feature Aggregation: leaf feature store not empty: " + leafFeatureStore.toString());
}

String msg = "Aggregated data left over after parsing complete: Exons: " + exonStore.size() + " ; LeafFeatures: " + leafFeatureStore.size();
final String msg = "Aggregated data left over after parsing complete: Exons: " + exonStore.size() + " ; LeafFeatures: " + leafFeatureStore.size();
throw new GATKException.ShouldNeverReachHereException(msg);
}

Expand All @@ -275,7 +311,7 @@ else if ((transcript != null) && (featureType == GencodeGtfFeature.FeatureType.T
* @param reader The {@link LineIterator} from which to read the header.
* @return The header as read from the {@code reader}
*/
private List<String> readActualHeader(LineIterator reader) {
private List<String> readActualHeader(final LineIterator reader) {

// Make sure we start with a clear header:
header.clear();
Expand Down Expand Up @@ -432,7 +468,7 @@ static public boolean validateGencodeGtfFeature(final GencodeGtfFeature feature,
}

@Override
public boolean canDecode(String inputFilePath) {
public boolean canDecode(final String inputFilePath) {

boolean canDecode;
try {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
package org.broadinstitute.hellbender.utils.codecs.GENCODE;

import com.google.common.annotations.VisibleForTesting;
import htsjdk.samtools.util.Locatable;
import htsjdk.tribble.Feature;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.broadinstitute.hellbender.exceptions.GATKException;
import org.broadinstitute.hellbender.exceptions.UserException;
import org.broadinstitute.hellbender.utils.SimpleInterval;
import org.broadinstitute.hellbender.utils.Utils;
Expand Down Expand Up @@ -71,6 +71,7 @@ public abstract class GencodeGtfFeature implements Feature, Comparable<GencodeGt

// Matches a run of one or more decimal digits.
private static final Pattern NUMBER_PATTERN = Pattern.compile("\\d\\d*");

// UCSC genome version associated with this feature; stays null until setUcscGenomeVersion is called.
private String ucscGenomeVersion = null;
// Backing data for this feature (holds, among other things, the genomic position and feature order number).
private final GencodeGtfFeatureBaseData baseData;

// ================================================================================================
Expand Down Expand Up @@ -303,7 +304,8 @@ public int getEnd() {
* This is useful to get any subfeatures included in this {@link GencodeGtfFeature}.
* @return A {@link List} of the features represented in this {@link GencodeGtfFeature}.
*/
protected List<GencodeGtfFeature> getAllFeatures() {
@VisibleForTesting
List<GencodeGtfFeature> getAllFeatures() {
final List<GencodeGtfFeature> list = new ArrayList<>();
list.add(this);
return list;
Expand Down Expand Up @@ -428,6 +430,14 @@ public String toString() {

// ================================================================================================

/**
 * @return The UCSC genome version stored on this feature, or {@code null} if none has been set.
 */
public String getUcscGenomeVersion() {
    return this.ucscGenomeVersion;
}

/**
 * Stores the given UCSC genome version on this feature, replacing any previous value.
 *
 * @param ucscGenomeVersion The UCSC genome version to store; no validation is performed here.
 */
public void setUcscGenomeVersion(final String ucscGenomeVersion) {
this.ucscGenomeVersion = ucscGenomeVersion;
}

/**
 * @return The genomic position held in this feature's base data.
 */
public SimpleInterval getGenomicPosition() {
    return baseData.genomicPosition;
}

/**
 * @return The feature order number held in this feature's base data.
 */
public int getFeatureOrderNumber() {
    return baseData.featureOrderNumber;
}
Expand Down Expand Up @@ -548,6 +558,10 @@ else if ( this == that ) {
if (isEqual) {
    final GencodeGtfFeature thatFeature = (GencodeGtfFeature) that;

    // ucscGenomeVersion is initialized to null and only populated through its setter, so it
    // must be compared null-safely: calling equals() on it directly would throw a
    // NullPointerException for any feature whose version has not been set yet.
    isEqual = Objects.equals(baseData, thatFeature.baseData) &&
            Objects.equals(ucscGenomeVersion, thatFeature.getUcscGenomeVersion());
}

return isEqual;
Expand Down
Loading

0 comments on commit 28037e4

Please sign in to comment.