Issue #7159 Create tool for producing genomic regions (as a BED file) (

#8942) * Initial commit and basic code to read gtf * add: code to write to bed & integration test * fix: make getAllFeatures public and use the nesting of features to get to transcripts * add: filtering transcripts by basic tag * add: sorts by contig and start (need to fix - sorting lexicographically) * fix: now sorts by contig then start & output is correct * fix: make dictionary an arg * add: comments + simplified CompareGtfInfo * refactor: apply method test: add separate tests for gene and transcript * refactor: onTraversalSuccess and writeToBed * add: more tests * fix: test files in correct dir pt1. (files are too large) * fix: test files in correct dir pt2. * add: compareFiles and ground truth bed files * fix: runGtfToBed assert * add: comments to GtfToBed * fix: error handling for different versions of gtf and dictionary * fix: edited some bad conventions * fix: remove spaces from input file fullName * add: gtf file with MYT1L and MAPK1 * add: many transcripts unit test and refactoring * add: tiebreaker sorting by id * add: make sort by basic optional * add: html doc comment * fix: dictionary arg * fix: add "Gencode" to description * add: sample mouse gencode testing * fix: Remove arg shortnames * fix: rename and move CompareGtfInfo * fix: kebab-case args * fix: update html doc * fix: use IntegrationTestSpec.assertEqualTextFiles() * fix: remove unnecessary test of pik3ca * fix: remove set functions in GtfInfo * fix: style of comparator * fix: style of comparator * fix: use Files.newOutputStream() to write and logger for errors * fix: use getBestAvailableSequenceDictionary() * fix: use dataProvider for integration tests * fix: better encapsulation * fix: move mapk1.gtf to large dir * fix: arg names * fix: rename reference dict. * fix: sequence-dictionary arg javadoc * add: javadoc to GtfInfo * add: dictionary exception and corresponding test * add: test with fasta file as reference arg * add: javadoc for fasta file * fix: javadoc and onTraversalStart exception
broadinstitute · Aug 29, 2024 · 3c1430d · 3c1430d
1 parent 9f2fbb5
commit 3c1430d
Show file tree

Hide file tree

Showing 22 changed files with 8,675 additions and 3 deletions.
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/conversion/GtfInfo.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/conversion/GtfInfo.java
@@ -0,0 +1,66 @@
+package org.broadinstitute.hellbender.tools.walkers.conversion;
+
+import htsjdk.samtools.util.Interval;
+
+/**
+ * A class that represents information extracted from a feature in a GTF file.
+ * The {@code GtfInfo} object encapsulates details about a specific gene or transcript,
+ * including its type, gene name, and interval.
+ *
+ * <p>The {@code GtfInfo.Type} enum specifies whether the features is a
+ * {@code GENE} or a {@code TRANSCRIPT}.</p>
+ *
+ * <p>The interval specifies the feature's contig (chromosome), start position,
+ * and end position, providing the precise location of the gene or transcript
+ * on the genome.</p>
+ *
+ * <p>Example usage:</p>
+ * <pre>
+ *     Interval interval = new Interval("chr1", 1000, 2000);
+ *     GtfInfo gtfInfo = new GtfInfo(interval, GtfInfo.Type.GENE, "MAPK1");
+ * </pre>
+ **/
+
+public class GtfInfo {
+
+    public enum Type {
+        GENE,
+        TRANSCRIPT
+    }
+
+    private final Type type;
+    private final String geneName;
+    private final Interval interval;
+
+    public GtfInfo(Interval interval, Type type, String geneName) {
+        this.interval = interval;
+        this.type = type;
+        this.geneName = geneName;
+    }
+
+    public Type getType() {
+        return type;
+    }
+
+    public String getGeneName() {
+        return geneName;
+    }
+
+    public Interval getInterval() {
+        return interval;
+    }
+
+    public Integer getStart() {
+        return interval.getStart();
+    }
+
+    public Integer getEnd() {
+        return interval.getEnd();
+    }
+
+    @Override
+    public String toString() {
+        return "GtfInfo{ " + "type = " + type + " geneName = " + geneName + "interval = " + interval;
+    }
+
+}
diff --git a/src/main/java/org/broadinstitute/hellbender/tools/walkers/conversion/GtfToBed.java b/src/main/java/org/broadinstitute/hellbender/tools/walkers/conversion/GtfToBed.java
@@ -0,0 +1,322 @@
+package org.broadinstitute.hellbender.tools.walkers.conversion;
+
+import htsjdk.samtools.SAMSequenceDictionary;
+import htsjdk.samtools.util.*;
+import htsjdk.tribble.Feature;
+import org.apache.log4j.LogManager;
+import org.apache.log4j.Logger;
+import org.broadinstitute.barclay.argparser.Argument;
+import org.broadinstitute.barclay.argparser.CommandLineProgramProperties;
+import org.broadinstitute.barclay.argparser.WorkflowProperties;
+import org.broadinstitute.barclay.help.DocumentedFeature;
+import org.broadinstitute.hellbender.cmdline.StandardArgumentDefinitions;
+import org.broadinstitute.hellbender.cmdline.programgroups.ShortVariantDiscoveryProgramGroup;
+import org.broadinstitute.hellbender.engine.*;
+import org.broadinstitute.hellbender.exceptions.GATKException;
+import org.broadinstitute.hellbender.exceptions.UserException;
+import org.broadinstitute.hellbender.utils.Utils;
+import org.broadinstitute.hellbender.utils.codecs.gtf.GencodeGtfFeature;
+
+import java.io.IOException;
+import java.io.OutputStream;
+import java.nio.file.Files;
+import java.util.*;
+
+/**
+ * <p>Convert Gencode GTF files to BED format with options for gene and transcript level processing.
+ * This tool allows for the extraction of gene and transcript information from Gencode GTF files and
+ * outputs the data in BED format.</p>
+ *
+ *
+ * <p>The conversion process includes sorting entries
+ * by karyotype, providing flexibility in the selection of either gene or transcript level
+ * data, and an option to only use basic tags. It ensures that the BED output is sorted and formatted correctly for subsequent use.
+ * Note that it has been tested for both human and mouse Gencode GTFs. </p>
+ *
+ * <h3>Usage examples</h3>
+ * <p>Example commands to run GtfToBed for typical scenarios:</p>
+ *
+ * <h4>(i) Convert GTF to BED with gene level data</h4>
+ * <p>This mode extracts and converts gene data from the input GTF file to BED format:</p>
+ *
+ * <pre>
+ *     java -jar GtfToBed.jar \
+ *     -gtf-path input.gtf \
+ *     -sequence-dictionary dictionary.dict \
+= *    -output output.bed \
+ * </pre>
+ *
+ * <h4>(ii) Convert GTF to BED with transcript level data</h4>
+ * <p>This mode extracts and converts transcript data from the input GTF file to BED format:</p>
+ *
+ * <pre>
+ *     java -jar GtfToBed.jar \
+ *     -gtf-path input.gtf \
+ *     -sequence-dictionary dictionary.dict \
+ *     -sort-by-transcript \
+ *     -output output.bed \
+ * </pre>
+ *
+ * <h4>(iii) Convert GTF to BED with transcript level data, filtering for only those with the basic tag</h4>
+ * <p>This mode extracts and converts basic transcript data from the input GTF file to BED format:</p>
+ *
+ * <pre>
+ *     java -jar GtfToBed.jar \
+ *     -gtf-path input.gtf \
+ *     -sequence-dictionary dictionary.dict \
+ *     -sort-by-transcript \
+ *     -use-basic-transcript \
+ *     -output output.bed \
+ * </pre>
+ *
+ * <h4>(iiii) </h4> Convert GTF to BED with transcript level data using a reference FASTA instead of DICT file.
+ * <p>This mode extracts and converts transcript data from the input GTF file to BED format using a FASTA file:</p>
+ *
+ * <pre>
+ *     java -jar GtfToBed.jar \
+ *     -gtf-path input.gtf \
+ *     -reference ref.fa \
+ *     -sort-by-transcript \
+ *     -output output.bed \
+ * </pre>
+ */
+
+@CommandLineProgramProperties(
+        summary = "Converts Gencode GTF files to Bed file format with each row of bed file being either a gene or a transcript.",
+        oneLineSummary = "Gencode GTF to BED",
+        programGroup = ShortVariantDiscoveryProgramGroup.class
+)
+
+@DocumentedFeature
+@WorkflowProperties
+public class GtfToBed extends FeatureWalker<GencodeGtfFeature> {
+    public static final String SORT_BY_TRANSCRIPT_LONG_NAME = "sort-by-transcript";
+    public static final String USE_BASIC_TRANSCRIPT_LONG_NAME = "use-basic-transcript";
+    public static final String INPUT_LONG_NAME = "gtf-path";
+    protected final Logger logger = LogManager.getLogger(this.getClass());
+
+    @Argument(fullName = INPUT_LONG_NAME, doc = "Path to Gencode GTF file")
+    public GATKPath inputFile;
+
+    @Argument(fullName = StandardArgumentDefinitions.OUTPUT_LONG_NAME , doc = "Output BED file")
+    public GATKPath outputFile;
+
+    @Argument(fullName = SORT_BY_TRANSCRIPT_LONG_NAME, doc = "Make each row of BED file sorted by transcript", optional = true)
+    public boolean sortByTranscript = false;
+
+    @Argument(fullName = USE_BASIC_TRANSCRIPT_LONG_NAME, doc = "Only use basic transcripts")
+    public boolean sortByBasic = false;
+
+    //stores either gene or transcript ID and summary information about the feature
+    private final Map<String, GtfInfo> featureInfoMap = new HashMap<>();
+
+    //Sequence Dictionary
+    private SAMSequenceDictionary sequenceDictionary = null;
+
+    @Override
+    protected boolean isAcceptableFeatureType(Class<? extends Feature> featureType) {
+        return featureType.isAssignableFrom(GencodeGtfFeature.class);
+    }
+
+    // runs per line of gtf file
+    @Override
+    public void apply(GencodeGtfFeature feature, ReadsContext readsContext, ReferenceContext referenceContext, FeatureContext featureContext) {
+        // list of all features of the gene
+        List<GencodeGtfFeature> geneFeatures = feature.getAllFeatures();
+
+        // process each gtf feature in the list of gene features
+        for (GencodeGtfFeature gtfFeature : geneFeatures) {
+            // the basic tag is in optional fields
+            List<GencodeGtfFeature.OptionalField<?>> optionalFields = getOptionalFields(gtfFeature);
+
+            // if the gtf feature is a Gene
+            if (gtfFeature.getFeatureType() == GencodeGtfFeature.FeatureType.GENE) {
+                processGeneFeature(gtfFeature);
+            }
+
+            //  check if the gtf feature is a transcript. If user only wants basic transcripts check that it has the basic tag
+            else if (gtfFeature.getFeatureType() == GencodeGtfFeature.FeatureType.TRANSCRIPT) {
+                if (sortByBasic) {
+                    for (GencodeGtfFeature.OptionalField<?> field : optionalFields) {
+                        if ("basic".equals(field.getValue())) {
+                            processTranscriptFeature(gtfFeature);
+                        }
+                    }
+                } else {
+                    processTranscriptFeature(gtfFeature);
+                }
+            }
+        }
+    }
+
+    // gets the tag out of the list of optional fields
+    private List<GencodeGtfFeature.OptionalField<?>> getOptionalFields(GencodeGtfFeature gtfFeature) {
+        List<GencodeGtfFeature.OptionalField<?>> optionalFields = null;
+        try {
+            optionalFields = gtfFeature.getOptionalField("tag");
+        } catch (Exception e) {
+            logger.error("Could not retrieve optional fields: ", e);
+        }
+        return optionalFields;
+    }
+
+    // stores the gene ID and Interval info in hashmap
+    private void processGeneFeature(GencodeGtfFeature gtfFeature) {
+        final int geneStart = gtfFeature.getStart();
+        final int geneEnd = gtfFeature.getEnd();
+        final Interval interval = new Interval(gtfFeature.getContig(), geneStart, geneEnd);
+
+        // put the interval, type as gene, and the name of gene
+        final GtfInfo gtfInfo = new GtfInfo(interval, GtfInfo.Type.GENE, gtfFeature.getGeneName());
+
+        // store in hashmap with key as geneId
+        featureInfoMap.put(gtfFeature.getGeneId(), gtfInfo);
+    }
+
+    // stores the transcript ID and Interval info in hashmap
+    private void processTranscriptFeature(GencodeGtfFeature gtfFeature) {
+        //get interval and put the interval, type as transcript, and the name of the gene it's in
+        final Interval interval = new Interval(gtfFeature.getContig(), gtfFeature.getStart(), gtfFeature.getEnd());
+        final GtfInfo gtfInfo = new GtfInfo(interval, GtfInfo.Type.TRANSCRIPT, gtfFeature.getGeneName());
+
+        //store in hashmap with key as transcriptId
+        featureInfoMap.put(gtfFeature.getTranscriptId(), gtfInfo);
+
+        //update start/end of corresponding gene if needed
+        updateGeneStart(gtfFeature);
+        updateGeneEnd(gtfFeature);
+    }
+
+    // update the gene interval start position based on the transcript
+    private void updateGeneStart(GencodeGtfFeature gtfFeature) {
+        // get the start value of the gene
+        int geneStart = featureInfoMap.get(gtfFeature.getGeneId()).getStart();
+
+        // if the transcript start is less than the gene start
+        if (gtfFeature.getStart() < geneStart) {
+            // set the gene start to be the transcript start
+            geneStart = gtfFeature.getStart();
+            updateGeneInterval(gtfFeature, geneStart, featureInfoMap.get(gtfFeature.getGeneId()).getEnd());
+        }
+    }
+
+    // update the gene interval end position based on the transcript
+    private void updateGeneEnd(GencodeGtfFeature gtfFeature) {
+        // get the end value of the gene
+        int geneEnd = featureInfoMap.get(gtfFeature.getGeneId()).getEnd();
+
+        // if the transcript end is greater than the gene end
+        if (gtfFeature.getEnd() > geneEnd) {
+            // set the gene end to be the transcript end
+            geneEnd = gtfFeature.getEnd();
+            updateGeneInterval(gtfFeature, featureInfoMap.get(gtfFeature.getGeneId()).getStart(), geneEnd);
+        }
+    }
+
+    // updates an interval of the gene if it needs to be changed
+    private void updateGeneInterval(GencodeGtfFeature gtfFeature, int geneStart, int geneEnd) {
+        Interval geneInterval = new Interval(gtfFeature.getContig(), geneStart, geneEnd);
+        GtfInfo gtfGeneInfo = new GtfInfo(geneInterval, GtfInfo.Type.GENE, gtfFeature.getGeneName());
+        featureInfoMap.put(gtfFeature.getGeneId(), gtfGeneInfo);
+    }
+
+    @Override
+    public void onTraversalStart() {
+        sequenceDictionary = getBestAvailableSequenceDictionary();
+        if(sequenceDictionary == null){
+            throw new UserException("Sequence Dictionary must be specified (" + StandardArgumentDefinitions.SEQUENCE_DICTIONARY_NAME + ").");
+        }
+    }
+
+    // runs immediately after it has gone through each line of gtf (apply method)
+    @Override
+    public Object onTraversalSuccess() {
+        // create linked hash map to store sorted values of idToInfo
+        LinkedHashMap<String, GtfInfo> karyotypeIdToInfo = getSortedMap(sequenceDictionary);
+
+        // if user wants to sort by transcript only use transcripts else only use genes
+        GtfInfo.Type selectedType = sortByTranscript ? GtfInfo.Type.TRANSCRIPT : GtfInfo.Type.GENE;
+        writeToBed(selectedType, karyotypeIdToInfo);
+
+        return null;
+    }
+
+    //Compare GtfInfo objects positionally by contig and start position. If transcripts have the same contig and start, compare by TranscriptId
+    public static class GtfInfoComparator implements Comparator<Map.Entry<String, GtfInfo>> {
+
+        private final SAMSequenceDictionary dictionary;
+
+        GtfInfoComparator(SAMSequenceDictionary dictionary) {
+            this.dictionary = dictionary;
+        }
+
+        // compare two entries of a map where key = geneId or transcriptId and value = gtfInfo object
+        @Override
+        public int compare(Map.Entry<String, GtfInfo> e1, Map.Entry<String, GtfInfo> e2) {
+            final Interval e1Interval = e1.getValue().getInterval();
+            final Interval e2Interval = e2.getValue().getInterval();
+
+            Utils.nonNull(dictionary.getSequence(e1Interval.getContig()), "could not get sequence for " + e1Interval.getContig());
+            Utils.nonNull(dictionary.getSequence(e2Interval.getContig()), "could not get sequence for " + e2Interval.getContig());
+
+            //compare by contig, then start, then by key
+            return Comparator
+                    .comparingInt((Map.Entry<String, GtfInfo> e) ->
+                            dictionary.getSequence(e.getValue().getInterval().getContig()).getSequenceIndex())
+                    .thenComparingInt(e -> e.getValue().getInterval().getStart())
+                    .thenComparing(Map.Entry::getKey)
+                    .compare(e1,e2);
+
+
+        }
+    }
+
+    // sorts the map containing the features based on contig and start position
+    private LinkedHashMap<String, GtfInfo> getSortedMap(SAMSequenceDictionary sequenceDictionary) {
+        // create a list that has the keys and values of idToInfo and sort the list using GtfInfoComparator
+        List<Map.Entry<String, GtfInfo>> entries = new ArrayList<>(featureInfoMap.entrySet());
+        entries.sort(new GtfInfoComparator(sequenceDictionary));
+
+        // put each (sorted) entry in the list into a linked hashmap
+        LinkedHashMap<String, GtfInfo> karyotypeIdToInfo = new LinkedHashMap<>();
+        for (Map.Entry<String, GtfInfo> entry : entries) {
+            karyotypeIdToInfo.put(entry.getKey(), entry.getValue());
+        }
+
+        return karyotypeIdToInfo;
+    }
+
+    // writes to bed file
+    private void writeToBed(GtfInfo.Type type, Map<String, GtfInfo> sortedMap) {
+        try (final OutputStream writer = Files.newOutputStream(outputFile.toPath())) {
+            for (Map.Entry<String, GtfInfo> entry : sortedMap.entrySet()) {
+                if (entry.getValue().getType() == type) {
+                    String line = formatBedLine(entry, type);
+                    writer.write((line + System.lineSeparator()).getBytes());
+                }
+            }
+        } catch (IOException e) {
+            throw new GATKException("Error writing to BED file", e);
+        }
+    }
+
+    // formats each line of the bed file depending on whether user has selected gene or transcript
+    private String formatBedLine(Map.Entry<String, GtfInfo> entry, GtfInfo.Type type) {
+        GtfInfo info = entry.getValue();
+        String line = info.getInterval().getContig() + "\t" +
+                info.getInterval().getStart() + "\t" +
+                info.getInterval().getEnd() + "\t" +
+                info.getGeneName();
+
+        if (type == GtfInfo.Type.TRANSCRIPT) {
+            line += "," + entry.getKey();
+        }
+
+        return line;
+    }
+
+    @Override
+    public GATKPath getDrivingFeaturePath() {
+        return inputFile;
+    }
+}
diff --git a/src/main/java/org/broadinstitute/hellbender/utils/codecs/gtf/GencodeGtfExonFeature.java b/src/main/java/org/broadinstitute/hellbender/utils/codecs/gtf/GencodeGtfExonFeature.java
@@ -77,7 +77,7 @@ public void setStopCodon(final GencodeGtfStopCodonFeature stopCodon) {
 
 
     @Override
-    protected List<GencodeGtfFeature> getAllFeatures() {
+    public List<GencodeGtfFeature> getAllFeatures() {
         final ArrayList<GencodeGtfFeature> list = new ArrayList<>();
         list.add(this);