diff --git a/core/src/main/java/org/mskcc/cbio/portal/scripts/ImportCopyNumberSegmentData.java b/core/src/main/java/org/mskcc/cbio/portal/scripts/ImportCopyNumberSegmentData.java index 41c66815ca4..9a9a4ae8bd9 100644 --- a/core/src/main/java/org/mskcc/cbio/portal/scripts/ImportCopyNumberSegmentData.java +++ b/core/src/main/java/org/mskcc/cbio/portal/scripts/ImportCopyNumberSegmentData.java @@ -54,85 +54,90 @@ private void importData(File file, int cancerStudyId) throws IOException, DaoExc FileReader reader = new FileReader(file); BufferedReader buf = new BufferedReader(reader); try { - String line = buf.readLine(); // skip header line - long segId = DaoCopyNumberSegment.getLargestId(); - while ((line=buf.readLine()) != null) { - ProgressMonitor.incrementCurValue(); - ConsoleUtil.showProgress(); - - String[] strs = line.split("\t"); - if (strs.length<6) { - System.err.println("wrong format: "+line); - } - - CancerStudy cancerStudy = DaoCancerStudy.getCancerStudyByInternalId(cancerStudyId); - //TODO - lines below should be removed. Agreed with JJ to remove this as soon as MSK moves to new validation - //procedure. In this new procedure, Patients and Samples should only be added - //via the corresponding ImportClinicalData process. Furthermore, the code below is wrong as it assumes one - //sample per patient, which is not always the case. - String barCode = strs[0]; - Sample sample = DaoSample.getSampleByCancerStudyAndSampleId(cancerStudyId, + String line = buf.readLine(); // skip header line + long segId = DaoCopyNumberSegment.getLargestId(); + while ((line=buf.readLine()) != null) { + ProgressMonitor.incrementCurValue(); + ConsoleUtil.showProgress(); + + String[] strs = line.split("\t"); + if (strs.length<6) { + System.err.println("wrong format: "+line); + } + + CancerStudy cancerStudy = DaoCancerStudy.getCancerStudyByInternalId(cancerStudyId); + //TODO - lines below should be removed. Agreed with JJ to remove this as soon as MSK moves to new validation + //procedure. In this new procedure, Patients and Samples should only be added + //via the corresponding ImportClinicalData process. Furthermore, the code below is wrong as it assumes one + //sample per patient, which is not always the case. + String barCode = strs[0]; + Sample sample = DaoSample.getSampleByCancerStudyAndSampleId(cancerStudyId, StableIdUtil.getSampleId(barCode)); if (sample == null ) { - ImportDataUtil.addPatients(new String[] { barCode }, cancerStudy); - ImportDataUtil.addSamples(new String[] { barCode }, cancerStudy); - ProgressMonitor.logWarning("WARNING: Sample added on the fly because it was missing in clinical data"); - } - - String sampleId = StableIdUtil.getSampleId(barCode); - String chrom = strs[1].trim(); - //validate in same way as GistitReader: - ValidationUtils.validateChromosome(chrom); - - long start = Double.valueOf(strs[2]).longValue(); - long end = Double.valueOf(strs[3]).longValue(); - if (start >= end) { - //workaround to skip with warning, according to https://github.com/cBioPortal/cbioportal/issues/839#issuecomment-203452415 - ProgressMonitor.logWarning("Start position of segment is not lower than end position. Skipping this entry."); - entriesSkipped++; - continue; - } - int numProbes = new BigDecimal((strs[4])).intValue(); - double segMean = Double.parseDouble(strs[5]); - - Sample s = DaoSample.getSampleByCancerStudyAndSampleId(cancerStudyId, sampleId); - if (s == null) { - assert StableIdUtil.isNormal(sampleId); - entriesSkipped++; - continue; - } - CopyNumberSegment cns = new CopyNumberSegment(cancerStudyId, s.getInternalId(), chrom, start, end, numProbes, segMean); - cns.setSegId(++segId); + ImportDataUtil.addPatients(new String[] { barCode }, cancerStudy); + ImportDataUtil.addSamples(new String[] { barCode }, cancerStudy); + ProgressMonitor.logWarning("WARNING: Sample added on the fly because it was missing in clinical data"); + } + + String sampleId = StableIdUtil.getSampleId(barCode); + String chrom = strs[1].trim(); + //validate in same way as GistitReader: + ValidationUtils.validateChromosome(chrom); + + long start = Double.valueOf(strs[2]).longValue(); + long end = Double.valueOf(strs[3]).longValue(); + if (start >= end) { + //workaround to skip with warning, according to https://github.com/cBioPortal/cbioportal/issues/839#issuecomment-203452415 + ProgressMonitor.logWarning("Start position of segment is not lower than end position. Skipping this entry."); + entriesSkipped++; + continue; + } + int numProbes = new BigDecimal((strs[4])).intValue(); + double segMean = Double.parseDouble(strs[5]); + + Sample s = DaoSample.getSampleByCancerStudyAndSampleId(cancerStudyId, sampleId); + if (s == null) { + if (StableIdUtil.isNormal(sampleId)) { + entriesSkipped++; + continue; + } + else { + //this likely will not be reached since samples are added on the fly above if not known to database + throw new RuntimeException("Unknown sample id '" + sampleId + "' found in seg file: " + file.getCanonicalPath()); + } + } + CopyNumberSegment cns = new CopyNumberSegment(cancerStudyId, s.getInternalId(), chrom, start, end, numProbes, segMean); + cns.setSegId(++segId); DaoCopyNumberSegment.addCopyNumberSegment(cns); - } - MySQLbulkLoader.flushAll(); + } + MySQLbulkLoader.flushAll(); } finally { - buf.close(); + buf.close(); } } public void run() { try { - String description = "Import 'segment data' files"; - - OptionSet options = ConsoleUtil.parseStandardDataAndMetaOptions(args, description, true); - String dataFile = (String) options.valueOf("data"); - File descriptorFile = new File((String) options.valueOf("meta")); + String description = "Import 'segment data' files"; + + OptionSet options = ConsoleUtil.parseStandardDataAndMetaOptions(args, description, true); + String dataFile = (String) options.valueOf("data"); + File descriptorFile = new File((String) options.valueOf("meta")); - Properties properties = new Properties(); - properties.load(new FileInputStream(descriptorFile)); + Properties properties = new Properties(); + properties.load(new FileInputStream(descriptorFile)); ProgressMonitor.setCurrentMessage("Reading data from: " + dataFile); - SpringUtil.initDataSource(); - CancerStudy cancerStudy = getCancerStudy(properties); - - if (segmentDataExistsForCancerStudy(cancerStudy)) { - throw new IllegalArgumentException("Seg data for cancer study " + cancerStudy.getCancerStudyStableId() + " has already been imported: " + dataFile); - } - - importCopyNumberSegmentFileMetadata(cancerStudy, properties); + SpringUtil.initDataSource(); + CancerStudy cancerStudy = getCancerStudy(properties); + + if (segmentDataExistsForCancerStudy(cancerStudy)) { + throw new IllegalArgumentException("Seg data for cancer study " + cancerStudy.getCancerStudyStableId() + " has already been imported: " + dataFile); + } + + importCopyNumberSegmentFileMetadata(cancerStudy, properties); importCopyNumberSegmentFileData(cancerStudy, dataFile); importFractionGenomeAltered(cancerStudy); } catch (RuntimeException e) { diff --git a/core/src/main/java/org/mskcc/cbio/portal/scripts/ImportExtendedMutationData.java b/core/src/main/java/org/mskcc/cbio/portal/scripts/ImportExtendedMutationData.java index a15037936be..e7280352429 100644 --- a/core/src/main/java/org/mskcc/cbio/portal/scripts/ImportExtendedMutationData.java +++ b/core/src/main/java/org/mskcc/cbio/portal/scripts/ImportExtendedMutationData.java @@ -154,11 +154,15 @@ public void importData() throws IOException, DaoException { // can be null in case of 'normal' sample: // (if data files are run through validator, this condition should be minimal) if (sample == null) { - assert StableIdUtil.isNormal(barCode); - //if new sample: - if (sampleSet.add(barCode)) - samplesSkipped++; - continue; + if (StableIdUtil.isNormal(barCode)) { + //if new sample: + if (sampleSet.add(barCode)) + samplesSkipped++; + continue; + } + else { + throw new RuntimeException("Unknown sample id '" + StableIdUtil.getSampleId(barCode) + "' found in MAF file: " + this.mutationFile.getCanonicalPath()); + } } String validationStatus = record.getValidationStatus(); diff --git a/core/src/main/java/org/mskcc/cbio/portal/scripts/ImportFusionData.java b/core/src/main/java/org/mskcc/cbio/portal/scripts/ImportFusionData.java index fd29a3c5aca..cad204d7c0b 100644 --- a/core/src/main/java/org/mskcc/cbio/portal/scripts/ImportFusionData.java +++ b/core/src/main/java/org/mskcc/cbio/portal/scripts/ImportFusionData.java @@ -102,9 +102,13 @@ public void importData() throws IOException, DaoException { StableIdUtil.getSampleId(barCode)); // can be null in case of 'normal' sample: if (sample == null) { - assert StableIdUtil.isNormal(barCode); - line = buf.readLine(); - continue; + if (StableIdUtil.isNormal(barCode)) { + line = buf.readLine(); + continue; + } + else { + throw new RuntimeException("Unknown sample id '" + StableIdUtil.getSampleId(barCode) + "' found in fusion file: " + this.fusionFile.getCanonicalPath()); + } } // Assume we are dealing with Entrez Gene Ids (this is the best / most stable option) String geneSymbol = record.getHugoGeneSymbol(); diff --git a/core/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java b/core/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java index 7dda90c20d1..ed9319291b6 100644 --- a/core/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java +++ b/core/src/main/java/org/mskcc/cbio/portal/scripts/ImportTabDelimData.java @@ -56,7 +56,7 @@ public class ImportTabDelimData { public static final String CNA_VALUE_ZERO = "0"; private HashSet importSetOfGenes = new HashSet(); private HashSet importedGeneticEntitySet = new HashSet<>(); - private File mutationFile; + private File dataFile; private String targetLine; private int geneticProfileId; private GeneticProfile geneticProfile; @@ -76,7 +76,7 @@ public class ImportTabDelimData { * @deprecated : TODO shall we deprecate this feature (i.e. the targetLine)? */ public ImportTabDelimData(File dataFile, String targetLine, int geneticProfileId, String genePanelID) { - this.mutationFile = dataFile; + this.dataFile = dataFile; this.targetLine = targetLine; this.geneticProfileId = geneticProfileId; this.genePanelID = genePanelID; @@ -89,7 +89,7 @@ public ImportTabDelimData(File dataFile, String targetLine, int geneticProfileId * @param geneticProfileId GeneticProfile ID. */ public ImportTabDelimData(File dataFile, int geneticProfileId, String genePanelID) { - this.mutationFile = dataFile; + this.dataFile = dataFile; this.geneticProfileId = geneticProfileId; this.genePanelID = genePanelID; } @@ -104,7 +104,7 @@ public void importData(int numLines) throws IOException, DaoException { geneticProfile = DaoGeneticProfile.getGeneticProfileById(geneticProfileId); - FileReader reader = new FileReader(mutationFile); + FileReader reader = new FileReader(dataFile); BufferedReader buf = new BufferedReader(reader); String headerLine = buf.readLine(); String parts[] = headerLine.split("\t"); @@ -123,92 +123,96 @@ public void importData(int numLines) throws IOException, DaoException { int numRecordsToAdd = 0; int samplesSkipped = 0; try { - int hugoSymbolIndex = getHugoSymbolIndex(parts); - int entrezGeneIdIndex = getEntrezGeneIdIndex(parts); - int rppaGeneRefIndex = getRppaGeneRefIndex(parts); - int genesetIdIndex = getGenesetIdIndex(parts); - int sampleStartIndex = getStartIndex(parts, hugoSymbolIndex, entrezGeneIdIndex, rppaGeneRefIndex, genesetIdIndex); - if (rppaProfile) { - if (rppaGeneRefIndex == -1) { - throw new RuntimeException("Error: the following column should be present for RPPA data: Composite.Element.Ref"); - } - } else if (gsvaProfile) { - if (genesetIdIndex == -1) { - throw new RuntimeException("Error: the following column should be present for gene set score data: geneset_id"); - } - } else if (hugoSymbolIndex == -1 && entrezGeneIdIndex == -1) { - throw new RuntimeException("Error: at least one of the following columns should be present: Hugo_Symbol or Entrez_Gene_Id"); - } - - - String sampleIds[]; - sampleIds = new String[parts.length - sampleStartIndex]; - System.arraycopy(parts, sampleStartIndex, sampleIds, 0, parts.length - sampleStartIndex); + int hugoSymbolIndex = getHugoSymbolIndex(parts); + int entrezGeneIdIndex = getEntrezGeneIdIndex(parts); + int rppaGeneRefIndex = getRppaGeneRefIndex(parts); + int genesetIdIndex = getGenesetIdIndex(parts); + int sampleStartIndex = getStartIndex(parts, hugoSymbolIndex, entrezGeneIdIndex, rppaGeneRefIndex, genesetIdIndex); + if (rppaProfile) { + if (rppaGeneRefIndex == -1) { + throw new RuntimeException("Error: the following column should be present for RPPA data: Composite.Element.Ref"); + } + } else if (gsvaProfile) { + if (genesetIdIndex == -1) { + throw new RuntimeException("Error: the following column should be present for gene set score data: geneset_id"); + } + } else if (hugoSymbolIndex == -1 && entrezGeneIdIndex == -1) { + throw new RuntimeException("Error: at least one of the following columns should be present: Hugo_Symbol or Entrez_Gene_Id"); + } + + + String sampleIds[]; + sampleIds = new String[parts.length - sampleStartIndex]; + System.arraycopy(parts, sampleStartIndex, sampleIds, 0, parts.length - sampleStartIndex); - int nrUnknownSamplesAdded = 0; - ProgressMonitor.setCurrentMessage(" --> total number of samples: " + sampleIds.length); - - // link Samples to the genetic profile - ArrayList orderedSampleList = new ArrayList(); - ArrayList filteredSampleIndices = new ArrayList(); - for (int i = 0; i < sampleIds.length; i++) { - // backwards compatible part (i.e. in the new process, the sample should already be there. TODO - replace this workaround later with an exception: - Sample sample = DaoSample.getSampleByCancerStudyAndSampleId(geneticProfile.getCancerStudyId(), - StableIdUtil.getSampleId(sampleIds[i])); - if (sample == null ) { - //TODO - as stated above, this part should be removed. Agreed with JJ to remove this as soon as MSK moves to new validation - //procedure. In this new procedure, Patients and Samples should only be added - //via the corresponding ImportClinicalData process. Furthermore, the code below is wrong as it assumes one - //sample per patient, which is not always the case. - ImportDataUtil.addPatients(new String[] { sampleIds[i] }, geneticProfileId); - // add the sample (except if it is a 'normal' sample): - nrUnknownSamplesAdded += ImportDataUtil.addSamples(new String[] { sampleIds[i] }, geneticProfileId); - } - // check again (repeated because of workaround above): - sample = DaoSample.getSampleByCancerStudyAndSampleId(geneticProfile.getCancerStudyId(), + int nrUnknownSamplesAdded = 0; + ProgressMonitor.setCurrentMessage(" --> total number of samples: " + sampleIds.length); + + // link Samples to the genetic profile + ArrayList orderedSampleList = new ArrayList(); + ArrayList filteredSampleIndices = new ArrayList(); + for (int i = 0; i < sampleIds.length; i++) { + // backwards compatible part (i.e. in the new process, the sample should already be there. TODO - replace this workaround later with an exception: + Sample sample = DaoSample.getSampleByCancerStudyAndSampleId(geneticProfile.getCancerStudyId(), + StableIdUtil.getSampleId(sampleIds[i])); + if (sample == null ) { + //TODO - as stated above, this part should be removed. Agreed with JJ to remove this as soon as MSK moves to new validation + //procedure. In this new procedure, Patients and Samples should only be added + //via the corresponding ImportClinicalData process. Furthermore, the code below is wrong as it assumes one + //sample per patient, which is not always the case. + ImportDataUtil.addPatients(new String[] { sampleIds[i] }, geneticProfileId); + // add the sample (except if it is a 'normal' sample): + nrUnknownSamplesAdded += ImportDataUtil.addSamples(new String[] { sampleIds[i] }, geneticProfileId); + } + // check again (repeated because of workaround above): + sample = DaoSample.getSampleByCancerStudyAndSampleId(geneticProfile.getCancerStudyId(), StableIdUtil.getSampleId(sampleIds[i])); - // can be null in case of 'normal' sample: - if (sample == null) { - assert StableIdUtil.isNormal(sampleIds[i]); - filteredSampleIndices.add(i); - samplesSkipped++; - continue; - } + // can be null in case of 'normal' sample: + if (sample == null) { + if (StableIdUtil.isNormal(sampleIds[i])) { + filteredSampleIndices.add(i); + samplesSkipped++; + continue; + } + else { + throw new RuntimeException("Unknown sample id '" + StableIdUtil.getSampleId(sampleIds[i]) + "' found in tab-delimited file: " + this.dataFile.getCanonicalPath()); + } + } ImportDataUtil.addSampleProfile(sample, geneticProfileId, genePanelID); - orderedSampleList.add(sample.getInternalId()); - } - if (nrUnknownSamplesAdded > 0) { - ProgressMonitor.logWarning("WARNING: Number of samples added on the fly because they were missing in clinical data: " + nrUnknownSamplesAdded); - } - if (samplesSkipped > 0) { - ProgressMonitor.setCurrentMessage(" --> total number of samples skipped (normal samples): " + samplesSkipped); - } - ProgressMonitor.setCurrentMessage(" --> total number of data lines: " + (numLines-1)); - - DaoGeneticProfileSamples.addGeneticProfileSamples(geneticProfileId, orderedSampleList); - - //Gene cache: - DaoGeneOptimized daoGene = DaoGeneOptimized.getInstance(); - - //Object to insert records in the generic 'genetic_alteration' table: - DaoGeneticAlteration daoGeneticAlteration = DaoGeneticAlteration.getInstance(); - - //cache for data found in cna_event' table: - Map existingCnaEvents = null; - if (discretizedCnaProfile) { - existingCnaEvents = new HashMap(); - for (CnaEvent.Event event : DaoCnaEvent.getAllCnaEvents()) { - existingCnaEvents.put(event, event); - } - MySQLbulkLoader.bulkLoadOn(); - } - - int lenParts = parts.length; - - String line = buf.readLine(); - while (line != null) { - ProgressMonitor.incrementCurValue(); - ConsoleUtil.showProgress(); + orderedSampleList.add(sample.getInternalId()); + } + if (nrUnknownSamplesAdded > 0) { + ProgressMonitor.logWarning("WARNING: Number of samples added on the fly because they were missing in clinical data: " + nrUnknownSamplesAdded); + } + if (samplesSkipped > 0) { + ProgressMonitor.setCurrentMessage(" --> total number of samples skipped (normal samples): " + samplesSkipped); + } + ProgressMonitor.setCurrentMessage(" --> total number of data lines: " + (numLines-1)); + + DaoGeneticProfileSamples.addGeneticProfileSamples(geneticProfileId, orderedSampleList); + + //Gene cache: + DaoGeneOptimized daoGene = DaoGeneOptimized.getInstance(); + + //Object to insert records in the generic 'genetic_alteration' table: + DaoGeneticAlteration daoGeneticAlteration = DaoGeneticAlteration.getInstance(); + + //cache for data found in cna_event' table: + Map existingCnaEvents = null; + if (discretizedCnaProfile) { + existingCnaEvents = new HashMap(); + for (CnaEvent.Event event : DaoCnaEvent.getAllCnaEvents()) { + existingCnaEvents.put(event, event); + } + MySQLbulkLoader.bulkLoadOn(); + } + + int lenParts = parts.length; + + String line = buf.readLine(); + while (line != null) { + ProgressMonitor.incrementCurValue(); + ConsoleUtil.showProgress(); boolean recordAdded = false; // either parse line as geneset or gene for importing into 'genetic_alteration' table @@ -232,40 +236,40 @@ public void importData(int numLines) throws IOException, DaoException { else { entriesSkipped++; } - - line = buf.readLine(); - } - if (MySQLbulkLoader.isBulkLoad()) { - MySQLbulkLoader.flushAll(); - } - - if (rppaProfile) { - ProgressMonitor.setCurrentMessage(" --> total number of extra records added because of multiple genes in one line: " + nrExtraRecords); - } - if (entriesSkipped > 0) { - ProgressMonitor.setCurrentMessage(" --> total number of data entries skipped (see table below): " + entriesSkipped); - } + + line = buf.readLine(); + } + if (MySQLbulkLoader.isBulkLoad()) { + MySQLbulkLoader.flushAll(); + } + + if (rppaProfile) { + ProgressMonitor.setCurrentMessage(" --> total number of extra records added because of multiple genes in one line: " + nrExtraRecords); + } + if (entriesSkipped > 0) { + ProgressMonitor.setCurrentMessage(" --> total number of data entries skipped (see table below): " + entriesSkipped); + } - if (numRecordsToAdd == 0) { - throw new DaoException ("Something has gone wrong! I did not save any records" + - " to the database!"); - } + if (numRecordsToAdd == 0) { + throw new DaoException ("Something has gone wrong! I did not save any records" + + " to the database!"); + } } finally { - buf.close(); + buf.close(); } } private boolean parseLine(String line, int nrColumns, int sampleStartIndex, - int hugoSymbolIndex, int entrezGeneIdIndex, int rppaGeneRefIndex, - boolean rppaProfile, boolean discretizedCnaProfile, - DaoGeneOptimized daoGene, - List filteredSampleIndices, List orderedSampleList, - Map existingCnaEvents, DaoGeneticAlteration daoGeneticAlteration - ) throws DaoException { + int hugoSymbolIndex, int entrezGeneIdIndex, int rppaGeneRefIndex, + boolean rppaProfile, boolean discretizedCnaProfile, + DaoGeneOptimized daoGene, + List filteredSampleIndices, List orderedSampleList, + Map existingCnaEvents, DaoGeneticAlteration daoGeneticAlteration + ) throws DaoException { + + boolean recordStored = false; - boolean recordStored = false; - // Ignore lines starting with # if (!line.startsWith("#") && line.trim().length() > 0) { String[] parts = line.split("\t",-1); @@ -282,18 +286,18 @@ private boolean parseLine(String line, int nrColumns, int sampleStartIndex, String geneSymbol = null; if (hugoSymbolIndex != -1) { - geneSymbol = parts[hugoSymbolIndex]; + geneSymbol = parts[hugoSymbolIndex]; } //RPPA: //TODO - we should split up the RPPA scenario from this code...too many if/else because of this if (rppaGeneRefIndex != -1) { - geneSymbol = parts[rppaGeneRefIndex]; + geneSymbol = parts[rppaGeneRefIndex]; } if (geneSymbol!=null && geneSymbol.isEmpty()) { geneSymbol = null; } if (rppaProfile && geneSymbol == null) { - ProgressMonitor.logWarning("Ignoring line with no Composite.Element.REF value"); - return false; + ProgressMonitor.logWarning("Ignoring line with no Composite.Element.REF value"); + return false; } //get entrez String entrez = null; @@ -301,20 +305,20 @@ private boolean parseLine(String line, int nrColumns, int sampleStartIndex, entrez = parts[entrezGeneIdIndex]; } if (entrez!=null) { - if (entrez.isEmpty()) { - entrez = null; - } - else if (!entrez.matches("[0-9]+")) { - //TODO - would be better to give an exception in some cases, like negative Entrez values - ProgressMonitor.logWarning("Ignoring line with invalid Entrez_Id " + entrez); - return false; - } + if (entrez.isEmpty()) { + entrez = null; + } + else if (!entrez.matches("[0-9]+")) { + //TODO - would be better to give an exception in some cases, like negative Entrez values + ProgressMonitor.logWarning("Ignoring line with invalid Entrez_Id " + entrez); + return false; + } } //If all are empty, skip line: if (geneSymbol == null && entrez == null) { - ProgressMonitor.logWarning("Ignoring line with no Hugo_Symbol or Entrez_Id value"); - return false; + ProgressMonitor.logWarning("Ignoring line with no Hugo_Symbol or Entrez_Id value"); + return false; } else { if (geneSymbol != null && (geneSymbol.contains("///") || geneSymbol.contains("---"))) { @@ -335,7 +339,7 @@ else if (!entrez.matches("[0-9]+")) { //will be null when there is a parse error in this case, so we //can return here and avoid duplicated messages: return false; - } + } } else { //try entrez: @@ -387,8 +391,8 @@ else if (!entrez.matches("[0-9]+")) { return false; } } else if (genes.size()==1) { - List cnaEventsToAdd = new ArrayList(); - + List cnaEventsToAdd = new ArrayList(); + if (discretizedCnaProfile) { long entrezGeneId = genes.get(0).getEntrezGeneId(); for (int i = 0; i < values.length; i++) { @@ -411,49 +415,49 @@ else if (!entrez.matches("[0-9]+")) { recordStored = storeGeneticAlterations(values, daoGeneticAlteration, genes.get(0), geneSymbol); //only add extra CNA related records if the step above worked, otherwise skip: if (recordStored) { - for (CnaEvent cnaEvent : cnaEventsToAdd) { - if (existingCnaEvents.containsKey(cnaEvent.getEvent())) { - cnaEvent.setEventId(existingCnaEvents.get(cnaEvent.getEvent()).getEventId()); - DaoCnaEvent.addCaseCnaEvent(cnaEvent, false); - } else { - //cnaEvent.setEventId(++cnaEventId); not needed anymore, column now has AUTO_INCREMENT - DaoCnaEvent.addCaseCnaEvent(cnaEvent, true); - existingCnaEvents.put(cnaEvent.getEvent(), cnaEvent.getEvent()); - } - } + for (CnaEvent cnaEvent : cnaEventsToAdd) { + if (existingCnaEvents.containsKey(cnaEvent.getEvent())) { + cnaEvent.setEventId(existingCnaEvents.get(cnaEvent.getEvent()).getEventId()); + DaoCnaEvent.addCaseCnaEvent(cnaEvent, false); + } else { + //cnaEvent.setEventId(++cnaEventId); not needed anymore, column now has AUTO_INCREMENT + DaoCnaEvent.addCaseCnaEvent(cnaEvent, true); + existingCnaEvents.put(cnaEvent.getEvent(), cnaEvent.getEvent()); + } + } } } else { - int otherCase = 0; + int otherCase = 0; for (CanonicalGene gene : genes) { - if (gene.isMicroRNA() || rppaProfile) { // for micro rna or protein data, duplicate the data - boolean result = storeGeneticAlterations(values, daoGeneticAlteration, gene, geneSymbol); - if (result == true) { - recordStored = true; - nrExtraRecords++; - } - } - else { - otherCase++; - } + if (gene.isMicroRNA() || rppaProfile) { // for micro rna or protein data, duplicate the data + boolean result = storeGeneticAlterations(values, daoGeneticAlteration, gene, geneSymbol); + if (result == true) { + recordStored = true; + nrExtraRecords++; + } + } + else { + otherCase++; + } } if (recordStored) { - //skip one, to avoid double counting: - nrExtraRecords--; + //skip one, to avoid double counting: + nrExtraRecords--; } if (!recordStored) { - if (otherCase == 0) { - // this means that miRNA or RPPA could not be stored - ProgressMonitor.logWarning("Could not store miRNA or RPPA data"); //TODO detect the type of of data and give specific warning - } - else if (otherCase > 1) { - // this means that genes.size() > 1 and data was not rppa or microRNA, so it is not defined how to deal with - // the ambiguous alias list. Report this: - ProgressMonitor.logWarning("Gene symbol " + geneSymbol + " found to be ambigous. Record will be skipped for this gene."); - } - else { - //should not occur. It would mean something is wrong in preceding logic (see else if (genes.size()==1) ) or a configuration problem, e.g. where a symbol maps to both a miRNA and a normal gene: - throw new RuntimeException("Unexpected error: unable to process row with gene " + geneSymbol); - } + if (otherCase == 0) { + // this means that miRNA or RPPA could not be stored + ProgressMonitor.logWarning("Could not store miRNA or RPPA data"); //TODO detect the type of of data and give specific warning + } + else if (otherCase > 1) { + // this means that genes.size() > 1 and data was not rppa or microRNA, so it is not defined how to deal with + // the ambiguous alias list. Report this: + ProgressMonitor.logWarning("Gene symbol " + geneSymbol + " found to be ambigous. Record will be skipped for this gene."); + } + else { + //should not occur. It would mean something is wrong in preceding logic (see else if (genes.size()==1) ) or a configuration problem, e.g. where a symbol maps to both a miRNA and a normal gene: + throw new RuntimeException("Unexpected error: unable to process row with gene " + geneSymbol); + } } } } @@ -461,7 +465,7 @@ else if (otherCase > 1) { } } return recordStored; - } + } /** * Parses line for gene set record and stores record in 'genetic_alteration' table. @@ -504,30 +508,30 @@ private boolean parseGenesetLine(String line, int nrColumns, int sampleStartInde return storedRecord; } - private boolean storeGeneticAlterations(String[] values, DaoGeneticAlteration daoGeneticAlteration, + private boolean storeGeneticAlterations(String[] values, DaoGeneticAlteration daoGeneticAlteration, CanonicalGene gene, String geneSymbol) throws DaoException { - // Check that we have not already imported information regarding this gene. + // Check that we have not already imported information regarding this gene. // This is an important check, because a GISTIC or RAE file may contain // multiple rows for the same gene, and we only want to import the first row. - try { - if (!importSetOfGenes.contains(gene.getEntrezGeneId())) { - daoGeneticAlteration.addGeneticAlterations(geneticProfileId, gene.getEntrezGeneId(), values); - importSetOfGenes.add(gene.getEntrezGeneId()); - return true; - } - else { - //TODO - review this part - maybe it should be an Exception instead of just a warning. - String geneSymbolMessage = ""; - if (geneSymbol != null && !geneSymbol.equalsIgnoreCase(gene.getHugoGeneSymbolAllCaps())) - geneSymbolMessage = "(given as alias in your file as: " + geneSymbol + ") "; - ProgressMonitor.logWarning("Gene " + gene.getHugoGeneSymbolAllCaps() + " (" + gene.getEntrezGeneId() + ")" + geneSymbolMessage + " found to be duplicated in your file. Duplicated row will be ignored!"); - return false; - } - } - catch (Exception e) - { - throw new RuntimeException("Aborted: Error found for row starting with " + geneSymbol + ": " + e.getMessage()); - } + try { + if (!importSetOfGenes.contains(gene.getEntrezGeneId())) { + daoGeneticAlteration.addGeneticAlterations(geneticProfileId, gene.getEntrezGeneId(), values); + importSetOfGenes.add(gene.getEntrezGeneId()); + return true; + } + else { + //TODO - review this part - maybe it should be an Exception instead of just a warning. + String geneSymbolMessage = ""; + if (geneSymbol != null && !geneSymbol.equalsIgnoreCase(gene.getHugoGeneSymbolAllCaps())) + geneSymbolMessage = "(given as alias in your file as: " + geneSymbol + ") "; + ProgressMonitor.logWarning("Gene " + gene.getHugoGeneSymbolAllCaps() + " (" + gene.getEntrezGeneId() + ")" + geneSymbolMessage + " found to be duplicated in your file. Duplicated row will be ignored!"); + return false; + } + } + catch (Exception e) + { + throw new RuntimeException("Aborted: Error found for row starting with " + geneSymbol + ": " + e.getMessage()); + } } /** @@ -557,55 +561,55 @@ private boolean storeGeneticEntityGeneticAlterations(String[] values, DaoGenetic } } - /** - * Tries to parse the genes and look them up in DaoGeneOptimized - * - * @param antibodyWithGene - * @return returns null if something was wrong, e.g. could not parse the antibodyWithGene string; returns - * a list with 0 or more elements otherwise. - * @throws DaoException - */ - private List parseRPPAGenes(String antibodyWithGene) throws DaoException { + /** + * Tries to parse the genes and look them up in DaoGeneOptimized + * + * @param antibodyWithGene + * @return returns null if something was wrong, e.g. could not parse the antibodyWithGene string; returns + * a list with 0 or more elements otherwise. + * @throws DaoException + */ + private List parseRPPAGenes(String antibodyWithGene) throws DaoException { DaoGeneOptimized daoGene = DaoGeneOptimized.getInstance(); String[] parts = antibodyWithGene.split("\\|"); //validate: if (parts.length < 2) { - ProgressMonitor.logWarning("Could not parse Composite.Element.Ref value " + antibodyWithGene + ". Record will be skipped."); - //return null when there was a parse error: - return null; + ProgressMonitor.logWarning("Could not parse Composite.Element.Ref value " + antibodyWithGene + ". Record will be skipped."); + //return null when there was a parse error: + return null; } String[] symbols = parts[0].split(" "); String arrayId = parts[1]; //validate arrayId: if arrayId if duplicated, warn: if (!arrayIdSet.add(arrayId)) { - ProgressMonitor.logWarning("Id " + arrayId + " in [" + antibodyWithGene + "] found to be duplicated. Record will be skipped."); - return null; + ProgressMonitor.logWarning("Id " + arrayId + " in [" + antibodyWithGene + "] found to be duplicated. Record will be skipped."); + return null; } List symbolsNotFound = new ArrayList(); List genes = new ArrayList(); for (String symbol : symbols) { - if (symbol.equalsIgnoreCase("NA")) { - //workaround because of bug in firehose. See https://github.com/cBioPortal/cbioportal/issues/839#issuecomment-203523078 - ProgressMonitor.logWarning("Gene " + symbol + " will be interpreted as 'Not Available' in this case. Record will be skipped for this gene."); - } - else { - CanonicalGene gene = daoGene.getNonAmbiguousGene(symbol, null); - if (gene!=null) { - genes.add(gene); - } - else { - symbolsNotFound.add(symbol); - } - } + if (symbol.equalsIgnoreCase("NA")) { + //workaround because of bug in firehose. See https://github.com/cBioPortal/cbioportal/issues/839#issuecomment-203523078 + ProgressMonitor.logWarning("Gene " + symbol + " will be interpreted as 'Not Available' in this case. Record will be skipped for this gene."); + } + else { + CanonicalGene gene = daoGene.getNonAmbiguousGene(symbol, null); + if (gene!=null) { + genes.add(gene); + } + else { + symbolsNotFound.add(symbol); + } + } } if (genes.size() == 0) { - //return empty list: - return genes; + //return empty list: + return genes; } //So one or more genes were found, but maybe some were not found. If any //is not found, report it here: for (String symbol : symbolsNotFound) { - ProgressMonitor.logWarning("Gene " + symbol + " not found in DB. Record will be skipped for this gene."); + ProgressMonitor.logWarning("Gene " + symbol + " not found in DB. Record will be skipped for this gene."); } Pattern p = Pattern.compile("(p[STY][0-9]+(?:_[STY][0-9]+)*)"); @@ -632,7 +636,7 @@ private List importPhosphoGene(List genes, String String phosphoSymbol = gene.getStandardSymbol()+"_"+residue; CanonicalGene phosphoGene = daoGene.getGene(phosphoSymbol); if (phosphoGene==null) { - ProgressMonitor.logWarning("Phosphoprotein " + phosphoSymbol + " not yet known in DB. Adding it to `gene` table with 3 aliases in `gene_alias` table."); + ProgressMonitor.logWarning("Phosphoprotein " + phosphoSymbol + " not yet known in DB. Adding it to `gene` table with 3 aliases in `gene_alias` table."); phosphoGene = new CanonicalGene(phosphoSymbol, aliases); phosphoGene.setType(CanonicalGene.PHOSPHOPROTEIN_TYPE); phosphoGene.setCytoband(gene.getCytoband()); @@ -654,7 +658,7 @@ private int getGenesetIdIndex(String[] headers) { } private int getHugoSymbolIndex(String[] headers) { - for (int i = 0; i hugoSymbolIndex && i > entrezGeneIdIndex && i > rppaGeneRefIndex && i > genesetIdIndex) { - //then we consider this the start of the sample columns: - startIndex = i; - break; - } + //and the column is found after hugoSymbolIndex and entrezGeneIdIndex: + if (i > hugoSymbolIndex && i > entrezGeneIdIndex && i > rppaGeneRefIndex && i > genesetIdIndex) { + //then we consider this the start of the sample columns: + startIndex = i; + break; + } } } if (startIndex == -1) - throw new RuntimeException("Could not find a sample column in the file"); + throw new RuntimeException("Could not find a sample column in the file"); return startIndex; }