/**
 * Looks up the numeric ID registered for a contig name in the reference dictionary.
 *
 * @param contig
 *            contig name to look up
 * @return the ID mapped to <code>contig</code>, or <code>UNKNOWN_CHROMOSOME</code> when the dictionary has no
 *         entry for that name
 */
private Integer getIntValueOfChromosomeOrZero(String contig) {
    final Integer contigId = referenceDictionary.getContigNameToID().get(contig);
    if (contigId == null) {
        return UNKNOWN_CHROMOSOME;
    }
    return contigId;
}
/**
 * Copies the three contig maps of a {@link ReferenceDictionary} into its protobuf counterpart.
 *
 * @param referenceDictionary
 *            dictionary whose name-to-ID, ID-to-length and ID-to-name maps are copied
 * @return the built {@link JannovarProto.ReferenceDictionary} message
 */
private static JannovarProto.ReferenceDictionary toProtoReferenceDictionary(ReferenceDictionary referenceDictionary) {
    final JannovarProto.ReferenceDictionary.Builder protoBuilder = JannovarProto.ReferenceDictionary.newBuilder();
    protoBuilder.putAllContigNameToId(referenceDictionary.getContigNameToID());
    protoBuilder.putAllContigIdToLength(referenceDictionary.getContigIDToLength());
    protoBuilder.putAllContigIdToName(referenceDictionary.getContigIDToName());
    return protoBuilder.build();
}
/**
 * Finalizes the three accumulated builders (contig ID, contig name, contig length) and wraps their results in a
 * new {@link ReferenceDictionary}.
 *
 * @return newly constructed {@link ReferenceDictionary}
 */
public ReferenceDictionary build() {
    return new ReferenceDictionary(
            contigID.build(),
            contigName.build(),
            contigLength.build());
}
/**
 * Resolves this object's chromosome ID to its name through the reference dictionary.
 *
 * @return name that the reference dictionary maps to the chromosome ID, e.g., <code>"chr2"</code>
 */
public String getChromosomeName() {
    return this.refDict.getContigIDToName().get(this.chrID);
}
/**
 * Copy-construct a genome position from <code>other</code>, expressed on the given <code>strand</code>.
 *
 * @param other
 *            position to copy reference dictionary, chromosome and coordinate from
 * @param strand
 *            strand the new position is expressed on
 */
public GenomePosition(GenomePosition other, Strand strand) {
    this.refDict = other.refDict;
    this.strand = strand;
    this.chr = other.chr;

    if (strand != other.strand) {
        // Strand flips: mirror the coordinate within the contig, i.e. (contig length - pos - 1).
        this.pos = other.refDict.getContigIDToLength().get(other.chr) - other.pos - 1;
    } else {
        // Same strand: the coordinate carries over unchanged.
        this.pos = other.pos;
    }
}
/**
 * @return contig name that the position's reference dictionary maps to the position's chromosome ID
 */
@Override
public String getChrName() {
    return this.pos.getRefDict()
            .getContigIDToName()
            .get(this.pos.getChr());
}
/**
 * Copy-construct a genome interval from <code>other</code>, expressed on the given <code>strand</code>.
 *
 * @param other
 *            interval to copy reference dictionary, chromosome and coordinates from
 * @param strand
 *            strand the new interval is expressed on
 */
public GenomeInterval(GenomeInterval other, Strand strand) {
    this.refDict = other.refDict;
    this.strand = strand;
    this.chr = other.chr;

    if (strand != other.strand) {
        // Strand flips: each bound is mirrored as (contig length - bound) and the two bounds swap roles,
        // so the new begin derives from the old end and vice versa.
        this.beginPos = other.refDict.getContigIDToLength().get(other.chr) - other.endPos;
        this.endPos = other.refDict.getContigIDToLength().get(other.chr) - other.beginPos;
    } else {
        // Same strand: both bounds carry over unchanged.
        this.beginPos = other.beginPos;
        this.endPos = other.endPos;
    }
}
/**
 * Parse a chromosomal change string such as <code>"chr1:12345A>G"</code> into a {@link GenomeVariant}.
 *
 * The string must match <code>(chr[0-9MXY]+):([0-9]+)([ACGTN]*)>([ACGTN]*)</code>; the position is interpreted as
 * one-based on the forward strand.
 *
 * @param changeStr
 *            textual description of the change
 * @return {@link GenomeVariant} built from the contig, position, reference and alternative parts of
 *         <code>changeStr</code>
 * @throws JannovarException
 *             if <code>changeStr</code> does not match the expected pattern or names a contig that is not in the
 *             reference dictionary
 */
private GenomeVariant parseGenomeChange(String changeStr) throws JannovarException {
    Pattern pat = Pattern.compile("(chr[0-9MXY]+):([0-9]+)([ACGTN]*)>([ACGTN]*)");
    Matcher match = pat.matcher(changeStr);

    // Report malformed input to the caller instead of terminating the whole JVM.
    if (!match.matches())
        throw new JannovarException("[ERROR] Input string for the chromosomal change " + changeStr
                + " does not fit the regular expression ... :(");

    // The pattern accepts names such as "chr99" that need not exist in the dictionary; check explicitly so that
    // unboxing a missing entry cannot raise a NullPointerException.
    final String contigName = match.group(1);
    final Integer contigID = refDict.getContigNameToID().get(contigName);
    if (contigID == null)
        throw new JannovarException("[ERROR] Unknown contig name " + contigName + " in chromosomal change "
                + changeStr);

    int chr = contigID;
    int pos = Integer.parseInt(match.group(2));
    String ref = match.group(3);
    String alt = match.group(4);

    return new GenomeVariant(new GenomePosition(refDict, Strand.FWD, chr, pos, PositionType.ONE_BASED), ref, alt);
}
/** * This function constructs a HashMap<Byte,Chromosome> map of Chromosome objects in which the {@link TranscriptModel} * objects are entered into an {@link IntervalArray} for the appropriate Chromosome. * * @param refDict * the {@link ReferenceDictionary} to use for the construction * @param transcriptModels * list of {@link TranscriptModel} objects with the transcripts of all chromosomes * @return a mapping from numeric chromsome ID to {@link Chromosome} object */ private static ImmutableMap<Integer, Chromosome> makeChromsomes(ReferenceDictionary refDict, ImmutableList<TranscriptModel> transcriptModels) { ImmutableMap.Builder<Integer, Chromosome> builder = new ImmutableMap.Builder<Integer, Chromosome>(); // First, factorize the TranscriptModel objects by chromosome ID. // create hash map for this HashMap<Integer, ArrayList<TranscriptModel>> transcripts = new HashMap<Integer, ArrayList<TranscriptModel>>(); for (Integer chrID : refDict.getContigIDToName().keySet()) transcripts.put(chrID, new ArrayList<TranscriptModel>()); // distribute TranscriptModel lists for (TranscriptModel transcript : transcriptModels) transcripts.get(transcript.getChr()).add(transcript); // Then, construct an interval tree for each chromosome and add the lists of intervals. for (Integer chrID : transcripts.keySet()) { IntervalArray<TranscriptModel> iTree = new IntervalArray<TranscriptModel>(transcripts.get(chrID), new TranscriptIntervalEndExtractor()); builder.put(chrID, new Chromosome(refDict, chrID, iTree)); } return builder.build(); }
/**
 * Parse a chromosomal change string such as <code>"chr1:12345A>G"</code> into a {@link GenomeVariant}.
 *
 * The string must match <code>(chr[0-9MXY]+):([0-9]+)([ACGTN]*)>([ACGTN]*)</code>; the position is interpreted as
 * one-based on the forward strand.
 *
 * @param changeStr
 *            textual description of the change
 * @return {@link GenomeVariant} built from the contig, position, reference and alternative parts of
 *         <code>changeStr</code>
 * @throws JannovarException
 *             if <code>changeStr</code> does not match the expected pattern or names a contig that is not in the
 *             reference dictionary
 */
private GenomeVariant parseGenomeChange(String changeStr) throws JannovarException {
    Pattern pat = Pattern.compile("(chr[0-9MXY]+):([0-9]+)([ACGTN]*)>([ACGTN]*)");
    Matcher match = pat.matcher(changeStr);
    if (!match.matches())
        throw new JannovarException("[ERROR] Input string for the chromosomal change " + changeStr
                + " does not fit the regular expression ... :(");

    // The pattern accepts names such as "chr99" that need not exist in the dictionary; check explicitly so that
    // unboxing a missing entry cannot raise a NullPointerException.
    final String contigName = match.group(1);
    final Integer contigID = refDict.getContigNameToID().get(contigName);
    if (contigID == null)
        throw new JannovarException("[ERROR] Unknown contig name " + contigName + " in chromosomal change "
                + changeStr);

    int chr = contigID;
    int pos = Integer.parseInt(match.group(2));
    String ref = match.group(3);
    String alt = match.group(4);

    return new GenomeVariant(new GenomePosition(refDict, Strand.FWD, chr, pos, PositionType.ONE_BASED), ref, alt);
}
/**
 * @return textual form <code>contig:g.begin_end</code>, always rendered from the forward-strand view; the begin
 *         coordinate is printed incremented by one, the end coordinate as stored
 */
@Override
public String toString() {
    if (!strand.isReverse()) {
        final String contigName = refDict.getContigIDToName().get(chr);
        return StringUtil.concatenate(contigName, ":g.", beginPos + 1, "_", endPos);
    }
    // Reverse-strand intervals are converted to the forward strand before printing.
    return withStrand(Strand.FWD).toString();
}
/**
 * Returns a new GeneticInterval from the parsed string. Strings are to be of the format
 * <code>chrN:start-end</code>, for example <code>chr1:123-456</code>, <code>chrY:1234-1220</code> or
 * <code>chr19:345-567</code>, where <code>N</code> is one of 1-22, X, Y or M.
 *
 * @param refDict
 *            reference dictionary used to translate the contig name (e.g. <code>chr1</code>) into its numeric ID
 * @param interval
 *            interval string to parse
 * @return the {@link GeneticInterval} with the contig ID, start and end taken from <code>interval</code>
 * @throws IllegalArgumentException
 *             if <code>interval</code> does not match the expected pattern or its contig name is not present in
 *             <code>refDict</code>
 */
public static GeneticInterval parseString(ReferenceDictionary refDict, String interval) {
    String intervalPattern = "chr(1|2|3|4|5|6|7|8|9|10|11|12|13|14|15|16|17|18|19|20|21|22|X|Y|M):[0-9]+-[0-9]+";
    if (!Pattern.matches(intervalPattern, interval)) {
        throw new IllegalArgumentException(String.format("Genetic interval %s does not match expected pattern %s",
                interval, intervalPattern));
    }

    String[] intervalSections = interval.split(":");
    String contigName = intervalSections[0];

    // A syntactically valid name may still be absent from the dictionary; fail with a descriptive error rather
    // than a NullPointerException from unboxing the missing entry.
    Integer contigId = refDict.getContigNameToID().get(contigName);
    if (contigId == null) {
        throw new IllegalArgumentException(String.format(
                "Genetic interval %s refers to contig %s which is not in the reference dictionary", interval,
                contigName));
    }
    int localChr = contigId;

    String positions = intervalSections[1];
    String[] startEnd = positions.split("-");
    int localStart = Integer.parseInt(startEnd[0]);
    int localEnd = Integer.parseInt(startEnd[1]);

    return new GeneticInterval(localChr, localStart, localEnd);
}
/**
 * @return textual form <code>contig:g.pos</code>, always rendered from the forward-strand view; the coordinate is
 *         printed incremented by one
 */
@Override
public String toString() {
    if (!strand.isReverse()) {
        final String contigName = refDict.getContigIDToName().get(chr);
        return StringUtil.concatenate(contigName, ":g.", pos + 1);
    }
    // Reverse-strand positions are converted to the forward strand before printing.
    return withStrand(Strand.FWD).toString();
}
private static Function<String, ChromosomalRegion> toChromosomalRegion() { return line -> { String[] tokens = line.split("\t"); if (tokens.length < 3) { throw new BedFileParseException("BED file requires at least 3 columns invalid line: '" + line + "'"); } if (tokens.length > 3) { logger.warn("Line contains more than 3 columns - ignoring optional columns 4+. Therefore STRAND will all be +"); } int chr = referenceDictionary.getContigNameToID().get(tokens[0]); //BED format is 0-based - we use 1-based in the exomiser. int start = Integer.parseInt(tokens[1]) + 1; int end = Integer.parseInt(tokens[2]); return new GeneticInterval(chr, start, end); }; }
/**
 * Fetch the bases covered by <code>region</code> from {@link #indexedFile}.
 *
 * The region is first converted to the forward strand and its contig name is translated to the name used in the
 * FASTA file before the subsequence is requested.
 *
 * @param region
 *            {@link GenomeInterval} to load sequence for
 * @return String built from the bases of the selected subsequence of {@link #indexedFile}
 */
public String load(GenomeInterval region) {
    final GenomeInterval fwdRegion = region.withStrand(Strand.FWD);
    final String fastaContig = mapContigToFasta(
            fwdRegion.getRefDict().getContigIDToName().get(fwdRegion.getChr()));
    // The begin coordinate is passed incremented by one, the end coordinate as stored.
    final ReferenceSequence subsequence = indexedFile.getSubsequenceAt(fastaContig, fwdRegion.getBeginPos() + 1,
            fwdRegion.getEndPos());
    return new String(subsequence.getBases());
}
/** Map contig name (from genome variant) to contig name in FASTA */ private String mapContigToFasta(String contigName) { // Map genome variant's contig to unique ID Integer contigID = jannovarData.getRefDict().getContigNameToID().get(contigName); if (contigID == null) throw new UncheckedJannovarException("Unknown contig name " + contigName); // Try to find matching contig in fasta String nameInFasta = null; for (SAMSequenceRecord record : indexedFile.getSequenceDictionary().getSequences()) { if (jannovarData.getRefDict().getContigNameToID().containsKey(record.getSequenceName())) { String contigInFasta = record.getSequenceName(); if (jannovarData.getRefDict().getContigNameToID().get(contigInFasta) == contigID) { nameInFasta = contigInFasta; break; } } } if (nameInFasta == null) throw new UncheckedJannovarException("Could not find corresponding contig in FASTA for " + contigName); return nameInFasta; }
final long startTime = System.nanoTime(); if (data == null || data.getRefDict().getContigNameToID().isEmpty()) throw new SerializationException("Attempting to serialize empty data set");
int chr = refDict.getContigNameToID().get(arr[0]); int beginPos = Integer.parseInt(arr[1]); int endPos = Integer.parseInt(arr[2]);
/** Map contig name (from genome variant) to contig name in FASTA */ private String mapContigToFasta(String contigName) { // Map genome variant's contig to unique ID Integer contigID = jannovarData.getRefDict().getContigNameToID().get(contigName); if (contigID == null) throw new UncheckedJannovarException("Unknown contig name " + contigName); // Try to find matching contig in fasta String nameInFasta = null; for (SAMSequenceRecord record : fasta.getSequenceDictionary().getSequences()) { if (jannovarData.getRefDict().getContigNameToID().containsKey(record.getSequenceName())) { nameInFasta = record.getSequenceName(); break; } } if (nameInFasta == null) throw new UncheckedJannovarException("Could not find corresponding contig in FASTA for " + contigName); return nameInFasta; }
boolean wrongContig = false; for (FeatureRecord record : featureRecords) { ImmutableMap<String, Integer> dict = refDict.getContigNameToID(); final String seqID = record.getSeqID(); if (!dict.containsKey(seqID)) { } else if (record.getType().equals("CDS")) { GenomeInterval cds = new GenomeInterval(refDict, Strand.FWD, refDict.getContigNameToID().get(record.getSeqID()), record.getBegin(), record.getEnd()); cds = cds.withStrand(strand); if (cdsRegion == null)