static public LinkedHashMap<String, GeneSequence> getGeneSequences(Collection<ChromosomeSequence> chromosomeSequences) throws Exception { LinkedHashMap<String, GeneSequence> geneSequenceHashMap = new LinkedHashMap<String, GeneSequence>(); for (ChromosomeSequence chromosomeSequence : chromosomeSequences) { for (GeneSequence geneSequence : chromosomeSequence.getGeneSequences().values()) { geneSequenceHashMap.put(geneSequence.getAccession().getID(), geneSequence); } } return geneSequenceHashMap; }
@Override public int getLength() { return Math.abs(this.getBioEnd() - this.getBioBegin()) + 1; }
/** * @return the strand */ public Strand getStrand() { return parentGeneSequence.getStrand(); }
/** * A class that keeps track of the details of a GeneSequence which is difficult to properly model. Two important concepts that is difficult * to make everything flexible but still work. You can have GFF features that only describe Exons or Exons/Introns or CDS regions and one * or more Transcriptions. You can have exon sequences but that does not imply transcription to the actual protein. * * The GeneSequence will keep track of Exons and Introns but to get a Protein sequence you need to start with a * TranscriptSequence and then add CDS sequences. * * This is also a key class in the biojava-3-genome module for reading and writing GFF3 files * * @param parentDNASequence * @param begin * @param end inclusive of end * @param strand force a gene to have strand and transcription sequence will inherit */ public GeneSequence(ChromosomeSequence parentSequence, int begin, int end, Strand strand) { chromosomeSequence = parentSequence; setParentSequence(parentSequence); setBioBegin(begin); setBioEnd(end); setStrand(strand); this.setCompoundSet(DNACompoundSet.getDNACompoundSet()); }
/** * Add a gene to the chromosome sequence using bioIndexing starts at 1 instead of 0. The * GeneSequence that is returned will have a reference to parent chromosome sequence * which actually contains the sequence data. Strand is important for positive and negative * direction where negative strand means we need reverse complement. If negative strand then * bioBegin will be greater than bioEnd * * * @param accession * @param begin * @param end * @param strand * @return */ public GeneSequence addGene(AccessionID accession, int bioBegin, int bioEnd, Strand strand) { GeneSequence geneSequence = new GeneSequence(this, bioBegin, bioEnd, strand); geneSequence.setAccession(accession); geneSequenceHashMap.put(accession.toString(), geneSequence); return geneSequence; }
/** * Try to give method clarity where you want a DNASequence coding in the 5' to 3' direction * Returns the DNASequence representative of the 5' and 3' reading based on strand * @return dna sequence */ public DNASequence getSequence5PrimeTo3Prime() { String sequence = getSequenceAsString(this.getBioBegin(), this.getBioEnd(), this.getStrand()); if (getStrand() == Strand.NEGATIVE) { //need to take complement of sequence because it is negative and we are returning the gene sequence from the opposite strand StringBuilder b = new StringBuilder(getLength()); CompoundSet<NucleotideCompound> compoundSet = this.getCompoundSet(); for (int i = 0; i < sequence.length(); i++) { String nucleotide = String.valueOf(sequence.charAt(i)); NucleotideCompound nucleotideCompound = compoundSet.getCompoundForString(nucleotide); b.append(nucleotideCompound.getComplement().getShortName()); } sequence = b.toString(); } DNASequence dnaSequence = null; try { dnaSequence = new DNASequence(sequence.toUpperCase()); } catch (CompoundNotFoundException e) { // this should not happen, the sequence is DNA originally, if it does, there's a bug somewhere logger.error("Could not create new DNA sequence in getSequence5PrimeTo3Prime(). Error: {}",e.getMessage()); } dnaSequence.setAccession(new AccessionID(this.getAccession().getID())); return dnaSequence; } }
seq = sequence.getSequence5PrimeTo3Prime().getSequenceAsString(); if (showExonUppercase) { StringBuilder sb = new StringBuilder(seq.toLowerCase()); int geneBioBegin = sequence.getBioBegin(); int geneBioEnd = sequence.getBioEnd(); for (ExonSequence exonSequence : sequence.getExonSequences()) { int featureBioBegin = 0; int featureBioEnd = 0; if (sequence.getStrand() != Strand.NEGATIVE) { featureBioBegin = exonSequence.getBioBegin() - geneBioBegin; featureBioEnd = exonSequence.getBioEnd() - geneBioBegin; logger.warn("Bad Feature, Accession: {}, Sequence Strand: {}, Gene Begin: {}, Gene End: {}, Exon Begin: {}, Exon End: {}", sequence.getAccession().toString(), sequence.getStrand(), geneBioBegin, geneBioEnd, exonSequence.getBioBegin(), exonSequence.getBioEnd()); } else { for (int i = featureBioBegin; i <= featureBioEnd; i++) { if ((sequence.getLength() % getLineLength()) != 0) { os.write(lineSep);
gff3line = key + "\t" + geneSequence.getSource() + "\t" + "gene" + "\t" + geneSequence.getBioBegin() + "\t" + geneSequence.getBioEnd() + "\t"; Double score = geneSequence.getSequenceScore(); if (score == null) { gff3line = gff3line + ".\t"; gff3line = gff3line + score + "\t"; gff3line = gff3line + geneSequence.getStrand().getStringRepresentation() + "\t"; gff3line = gff3line + ".\t"; gff3line = gff3line + "ID=" + geneSequence.getAccession().getID() + ";Name=" + geneSequence.getAccession().getID(); gff3line = gff3line + getGFF3Note(geneSequence.getNotesList()); gff3line = gff3line + "\n"; outputStream.write(gff3line.getBytes()); for (TranscriptSequence transcriptSequence : geneSequence.getTranscripts().values()) { transcriptIndex++; String id = geneSequence.getAccession().getID() + "." + transcriptIndex; gff3line = gff3line + "ID=" + id + ";Parent=" + geneSequence.getAccession().getID() + ";Name=" + id; gff3line = gff3line + getGFF3Note(transcriptSequence.getNotesList()); outputStream.write(gff3line.getBytes()); String transcriptParentName = geneSequence.getAccession().getID() + "." + transcriptIndex; ArrayList<CDSSequence> cdsSequenceList = new ArrayList<CDSSequence>(transcriptSequence.getCDSSequences().values()); Collections.sort(cdsSequenceList, new SequenceComparator());
if (geneSequence == null) { geneSequence = seq.addGene(geneAccessionID, startCodonBegin, stopCodonEnd, strand); geneSequence.setSource(geneSource); if (geneNote != null && geneNote.length() > 0) { geneSequence.addNote(geneNote); if (startCodonBegin < geneSequence.getBioBegin()) { geneSequence.setBioBegin(startCodonBegin); if (stopCodonEnd > geneSequence.getBioBegin()) { geneSequence.setBioEnd(stopCodonEnd); TranscriptSequence transcriptSequence = geneSequence.addTranscript(transcriptAccessionID, startCodonBegin, stopCodonEnd); transcriptSequence.setSource(mRNAsource); if (mRNANote != null && mRNANote.length() > 0) { ExonSequence exonSequence = geneSequence.addExon(cdsAccessionID, cdsFeature.location().bioStart(), cdsFeature.location().bioEnd()); exonSequence.setSource(cdsSource); if (cdsNote != null && cdsNote.length() > 0) { geneSequence.addIntronsUsingExons();
if (geneSequence == null) { geneSequence = seq.addGene(geneAccessionID, startCodonBegin, stopCodonEnd, strand); geneSequence.setSource(((Feature) feature).source()); } else { if (startCodonBegin < geneSequence.getBioBegin()) { geneSequence.setBioBegin(startCodonBegin); if (stopCodonEnd > geneSequence.getBioBegin()) { geneSequence.setBioEnd(stopCodonEnd); TranscriptSequence transcriptSequence = geneSequence.addTranscript(transcriptAccessionID, startCodonBegin, stopCodonEnd);
if (getStrand() == Strand.NEGATIVE) { shift = 1; ExonSequence exon1 = exonSequenceList.get(i); ExonSequence exon2 = exonSequenceList.get(i + 1); this.addIntron(new AccessionID(this.getAccession().getID() + "-" + "intron" + intronIndex), exon1.getBioEnd() - shift, exon2.getBioBegin() + shift); intronIndex++;
static public LinkedHashMap<String, ProteinSequence> getProteinSequences(Collection<ChromosomeSequence> chromosomeSequences) throws Exception { LinkedHashMap<String, ProteinSequence> proteinSequenceHashMap = new LinkedHashMap<String, ProteinSequence>(); for (ChromosomeSequence dnaSequence : chromosomeSequences) { for (GeneSequence geneSequence : dnaSequence.getGeneSequences().values()) { for (TranscriptSequence transcriptSequence : geneSequence.getTranscripts().values()) { //TODO remove? // DNASequence dnaCodingSequence = transcriptSequence.getDNACodingSequence(); // logger.info("CDS={}", dnaCodingSequence.getSequenceAsString()); try { ProteinSequence proteinSequence = transcriptSequence.getProteinSequence(); // logger.info("{} {}", proteinSequence.getAccession().getID(), proteinSequence); if (proteinSequenceHashMap.containsKey(proteinSequence.getAccession().getID())) { throw new Exception("Duplicate protein sequence id=" + proteinSequence.getAccession().getID() + " found at Gene id=" + geneSequence.getAccession().getID()); } else { proteinSequenceHashMap.put(proteinSequence.getAccession().getID(), proteinSequence); } } catch (Exception e) { logger.error("Exception: ", e); } } } } return proteinSequenceHashMap; }
public static void main(String[] args) { try { ArrayList<GeneSequence> sequences = new ArrayList<GeneSequence>(); ChromosomeSequence seq1 = new ChromosomeSequence("ATATATATATATATATATATATATATATATATACGCGCGCGCGCGCGCGCGCGCGCGCGCGCGCATATATATATATATATATATATACGCGCGCGCGCGCGCGCATATATATATATATATATATATATATATATATACGCGCGCGCGCGCGCGCGCGCGCGCGCGCGCATATATATATATATATATATATACGCGCGCGCGCGCGCGC"); GeneSequence gene1 = seq1.addGene(new AccessionID("gene1"), 1, 20, Strand.POSITIVE); gene1.addExon(new AccessionID("t1_1_10"), 1, 10); gene1.addExon(new AccessionID("t1_12_15"), 12, 15); GeneSequence gene2 = seq1.addGene(new AccessionID("gene2"), 1, 20, Strand.NEGATIVE); gene2.addExon(new AccessionID("t2_1_10"), 1, 10); gene2.addExon(new AccessionID("t2_12_15"), 12, 15); sequences.add(gene1); sequences.add(gene2); FastaGeneWriter fastaWriter = new FastaGeneWriter(System.out, sequences, new GenericFastaHeaderFormat<GeneSequence, NucleotideCompound>(), true); fastaWriter.process(); } catch (Exception e) { logger.warn("Exception: ", e); } } }
/** * Remove the exon sequence * @param accession * @return exon sequence */ public ExonSequence removeExon(String accession) { for (ExonSequence exonSequence : exonSequenceList) { if (exonSequence.getAccession().getID().equals(accession)) { exonSequenceList.remove(exonSequence); exonSequenceHashMap.remove(accession); // we now have a new gap which creates an intron intronSequenceList.clear(); intronSequenceHashMap.clear(); intronAdded = false; try{ addIntronsUsingExons(); } catch(Exception e){ logger.error("Remove Exon validate() error " + e.getMessage()); } return exonSequence; } } return null; }
/** * Try to give method clarity where you want a DNASequence coding in the 5' to 3' direction * Returns the DNASequence representative of the 5' and 3' reading based on strand * @return dna sequence */ public DNASequence getSequence5PrimeTo3Prime() { String sequence = getSequenceAsString(this.getBioBegin(), this.getBioEnd(), this.getStrand()); if (getStrand() == Strand.NEGATIVE) { //need to take complement of sequence because it is negative and we are returning the gene sequence from the opposite strand StringBuilder b = new StringBuilder(getLength()); CompoundSet<NucleotideCompound> compoundSet = this.getCompoundSet(); for (int i = 0; i < sequence.length(); i++) { String nucleotide = String.valueOf(sequence.charAt(i)); NucleotideCompound nucleotideCompound = compoundSet.getCompoundForString(nucleotide); b.append(nucleotideCompound.getComplement().getShortName()); } sequence = b.toString(); } DNASequence dnaSequence = null; try { dnaSequence = new DNASequence(sequence.toUpperCase()); } catch (CompoundNotFoundException e) { // this should not happen, the sequence is DNA originally, if it does, there's a bug somewhere logger.error("Could not create new DNA sequence in getSequence5PrimeTo3Prime(). Error: {}",e.getMessage()); } dnaSequence.setAccession(new AccessionID(this.getAccession().getID())); return dnaSequence; } }
seq = sequence.getSequence5PrimeTo3Prime().getSequenceAsString(); if (showExonUppercase) { StringBuilder sb = new StringBuilder(seq.toLowerCase()); int geneBioBegin = sequence.getBioBegin(); int geneBioEnd = sequence.getBioEnd(); for (ExonSequence exonSequence : sequence.getExonSequences()) { int featureBioBegin = 0; int featureBioEnd = 0; if (sequence.getStrand() != Strand.NEGATIVE) { featureBioBegin = exonSequence.getBioBegin() - geneBioBegin; featureBioEnd = exonSequence.getBioEnd() - geneBioBegin; logger.warn("Bad Feature, Accession: {}, Sequence Strand: {}, Gene Begin: {}, Gene End: {}, Exon Begin: {}, Exon End: {}", sequence.getAccession().toString(), sequence.getStrand(), geneBioBegin, geneBioEnd, exonSequence.getBioBegin(), exonSequence.getBioEnd()); } else { for (int i = featureBioBegin; i <= featureBioEnd; i++) { if ((sequence.getLength() % getLineLength()) != 0) { os.write(lineSep);
if (geneSequence == null) { geneSequence = seq.addGene(geneAccessionID, startCodonBegin, stopCodonEnd, strand); geneSequence.setSource(((Feature) feature).source()); } else { if (startCodonBegin < geneSequence.getBioBegin()) { geneSequence.setBioBegin(startCodonBegin); if (stopCodonEnd > geneSequence.getBioBegin()) { geneSequence.setBioEnd(stopCodonEnd); TranscriptSequence transcriptSequence = geneSequence.addTranscript(transcriptAccessionID, startCodonBegin, stopCodonEnd); if (startCodon != null) { if (startCodonName == null || startCodonName.length() == 0) {
/** * A class that keeps track of the details of a GeneSequence which is difficult to properly model. Two important concepts that is difficult * to make everything flexible but still work. You can have GFF features that only describe Exons or Exons/Introns or CDS regions and one * or more Transcriptions. You can have exon sequences but that does not imply transcription to the actual protein. * * The GeneSequence will keep track of Exons and Introns but to get a Protein sequence you need to start with a * TranscriptSequence and then add CDS sequences. * * This is also a key class in the biojava-3-genome module for reading and writing GFF3 files * * @param parentDNASequence * @param begin * @param end inclusive of end * @param strand force a gene to have strand and transcription sequence will inherit */ public GeneSequence(ChromosomeSequence parentSequence, int begin, int end, Strand strand) { chromosomeSequence = parentSequence; setParentSequence(parentSequence); setBioBegin(begin); setBioEnd(end); setStrand(strand); this.setCompoundSet(DNACompoundSet.getDNACompoundSet()); }
if (getStrand() == Strand.NEGATIVE) { shift = 1; ExonSequence exon1 = exonSequenceList.get(i); ExonSequence exon2 = exonSequenceList.get(i + 1); this.addIntron(new AccessionID(this.getAccession().getID() + "-" + "intron" + intronIndex), exon1.getBioEnd() - shift, exon2.getBioBegin() + shift); intronIndex++;
public static void main(String[] args) { try { ArrayList<GeneSequence> sequences = new ArrayList<GeneSequence>(); ChromosomeSequence seq1 = new ChromosomeSequence("ATATATATATATATATATATATATATATATATACGCGCGCGCGCGCGCGCGCGCGCGCGCGCGCATATATATATATATATATATATACGCGCGCGCGCGCGCGCATATATATATATATATATATATATATATATATACGCGCGCGCGCGCGCGCGCGCGCGCGCGCGCATATATATATATATATATATATACGCGCGCGCGCGCGCGC"); GeneSequence gene1 = seq1.addGene(new AccessionID("gene1"), 1, 20, Strand.POSITIVE); gene1.addExon(new AccessionID("t1_1_10"), 1, 10); gene1.addExon(new AccessionID("t1_12_15"), 12, 15); GeneSequence gene2 = seq1.addGene(new AccessionID("gene2"), 1, 20, Strand.NEGATIVE); gene2.addExon(new AccessionID("t2_1_10"), 1, 10); gene2.addExon(new AccessionID("t2_12_15"), 12, 15); sequences.add(gene1); sequences.add(gene2); FastaGeneWriter fastaWriter = new FastaGeneWriter(System.out, sequences, new GenericFastaHeaderFormat<GeneSequence, NucleotideCompound>(), true); fastaWriter.process(); } catch (Exception e) { logger.warn("Exception: ", e); } } }