LOGGER.trace("Matching protein count: " + matchingProteins.size()); for (Protein protein : matchingProteins) { LOGGER.trace("Protein ID: " + protein.getId() + " MD5: " + protein.getMd5()); LOGGER.trace("Has " + protein.getMatches().size() + " matches"); for (ProteinXref xref : protein.getCrossReferences()) { LOGGER.trace("Xref: " + xref.getIdentifier());
protected void addToMoleculeCollection(String sequence, final String currentId, final Set<Protein> parsedMolecules) { sequence = WHITE_SPACE_PATTERN.matcher(sequence).replaceAll(""); Protein thisProtein = new Protein(sequence); // Check if this sequence is already in the Set. If it is, retrieve it. boolean isMoleculeAdded = parsedMolecules.add(thisProtein); if (!isMoleculeAdded) { for (Protein existing : parsedMolecules) { if (existing.getMd5().equals(thisProtein.getMd5())) { thisProtein = existing; break; } } } // Add the Xref to the Protein object. (Being added to a Set, so no risk of duplicates) thisProtein.addCrossReference(XrefParser.getProteinXref(currentId)); } }
/**
 * Supplies the value of the "sequence" XML element.
 *
 * @return a new Sequence built from the values of getSequence() and getMd5()
 */
@XmlElement(name = "sequence")
private Sequence getSequenceObject() {
    final String residues = getSequence();
    final String checksum = getMd5();
    return new Sequence(residues, checksum);
}
/**
 * Creates a Protein from this builder's sequence and copies the collected
 * matches and cross references onto it.
 *
 * @return the newly constructed Protein
 */
public Protein build() {
    final Protein built = new Protein(sequence);

    // Iterating an empty collection does nothing, so no emptiness checks are needed.
    for (Match match : matches) {
        built.addMatch(match);
    }
    for (ProteinXref xref : crossReferences) {
        built.addCrossReference(xref);
    }

    return built;
}
/** * Writes out all protein matches for the specified protein (GFF formatted). * * @param protein containing matches to be written out * @return the number of rows printed (i.e. the number of Locations on Matches). * @throws java.io.IOException in the event of I/O problem writing out the file. */ public int write(Protein protein) throws IOException { List<String> proteinIdsForGFF = getProteinAccessions(protein); int sequenceLength = protein.getSequenceLength(); String md5 = protein.getMd5(); String date = dmyFormat.format(new Date()); Set<Match> matches = protein.getMatches(); //Write sequence region information for (String proteinIdForGFF: proteinIdsForGFF) { if (matches.size() > 0) { //Check if protein accessions are GFF3 valid proteinIdForGFF = ProteinMatchesGFFResultWriter.getValidGFF3SeqId(proteinIdForGFF); //Write sequence-region super.gffWriter.write("##sequence-region " + proteinIdForGFF + " 1 " + sequenceLength); if (writeFullGFF) { writeReferenceLine(proteinIdForGFF, sequenceLength, md5); addFASTASeqToMap(proteinIdForGFF, protein.getSequence()); } processMatches(matches, proteinIdForGFF, date, protein, proteinIdForGFF, writeFullGFF); }//end match size check } return 0; }
/**
 * Writes out a Protein object to a GFF version 3 file.
 * <p/>
 * For every accession returned by getProteinAccessions, and only when the protein has
 * at least one match, the accession is passed through getValidGFF3SeqId, then
 * writeSequenceRegionPart and processMatches are called.
 *
 * @param protein containing matches to be written out
 * @return always 0 in this implementation
 * @throws java.io.IOException in the event of I/O problem writing out the file.
 */
public int write(Protein protein) throws IOException {
    final int length = protein.getSequenceLength();
    final String checksum = protein.getMd5();
    final String runDate = dmyFormat.format(new Date());
    final Set<Match> matches = protein.getMatches();

    // NOTE(review): this value is never assigned, so processMatches always receives
    // null as its second argument - confirm whether that is intended.
    final String proteinIdForGFF = null;

    final List<String> orfAccessions = getProteinAccessions(protein);
    for (String orfAccession : orfAccessions) {
        // Nothing is written for a protein without matches.
        if (matches.isEmpty()) {
            continue;
        }

        final String seqId = getValidGFF3SeqId(orfAccession);
        writeSequenceRegionPart(protein, length, checksum, seqId);
        processMatches(matches, proteinIdForGFF, runDate, protein, getNucleotideId());
    }
    return 0;
}
final String proteinIdFromGetorf) throws IOException { for (OpenReadingFrame orf : protein.getOpenReadingFrames()) { String nucleotideSequenceXrefId = nucleotideSequenceXref.getIdentifier(); for (ProteinXref proteinXref : protein.getCrossReferences()) { addFASTASeqToMap(proteinIdForGFF, protein.getSequence()); throw new IllegalStateException("Cannot find the ORF object that maps to protein with PK / MD5: " + protein.getId() + " / " + protein.getMd5());
Protein protein = proteinIdToProteinMap.get(rp.getProteinIdentifier()); if (LOGGER.isDebugEnabled()) { LOGGER.debug("persist protein: " + protein.getId() + " md5:" + protein.getMd5()); for (T rawMatch: rp.getMatches()){ if (! isLocationWithinRange(protein, rawMatch)){ LOGGER.error("Location coordinates Error - sequenceLength: " + protein.getSequenceLength() + " Location : " + rawMatch.getLocationStart() + "-" + rawMatch.getLocationEnd()); throw new IllegalStateException("Attempting to persist a match location outside sequence range " + rawMatch.toString() + "\n" + protein.toString()); protein.addMatch(match); // Adds protein to match (yes, I know it doesn't look that way!) entityManager.persist(match); matchLocationCount += match.getLocations().size();
List<String> proteinAcs = getProteinAccessions(protein); final int length = protein.getSequenceLength(); final String sequence = protein.getSequence(); final String crc64 = getCrc64(sequence); String date = dmyFormat.format(new Date()); Set<Match> matches = protein.getMatches(); for (String proteinAc: proteinAcs) { for (Match match : matches) {
newMd5s.add(newProtein.getMd5()); if (LOGGER.isDebugEnabled()) { LOGGER.debug("MD5 of new protein: " + newProtein.getMd5()); for (Protein existingProtein : (List<Protein>) query.getResultList()) { if (LOGGER.isDebugEnabled()) { LOGGER.debug("Found 1 existing protein with MD5: " + existingProtein.getMd5()); md5ToExistingProtein.put(existingProtein.getMd5(), existingProtein); if (md5ToExistingProtein.keySet().contains(candidate.getMd5())) { Protein existingProtein = md5ToExistingProtein.get(candidate.getMd5()); boolean updateRequired = false; if (candidate.getCrossReferences() != null) { if (LOGGER.isTraceEnabled()) { LOGGER.trace("Protein TO BE STORED has xrefs:"); for (ProteinXref xref : candidate.getCrossReferences()) { if (LOGGER.isTraceEnabled()) { LOGGER.trace(xref.getIdentifier()); if (!existingProtein.getCrossReferences().contains(xref)) { if (LOGGER.isTraceEnabled()) { LOGGER.trace("Adding " + xref.getIdentifier() + " and setting updateRequired = true"); existingProtein.addCrossReference(xref); updateRequired = true;
proteinCount ++; Long startPersistProtein = System.currentTimeMillis(); Set<ProteinXref> xrefs = newProtein.getCrossReferences(); if (LOGGER.isDebugEnabled()) { LOGGER.debug("Protein with ID " + newProtein.getId() + " has " + xrefs.size() + " cross references."); toDebugPrint(newProteins.size(), proteinCount, "SetProtein in ORF: " + (startAddOpenReadingFrame - startSetProtein ) + " millis "); newProtein.addOpenReadingFrame(newOrf); Long startOrfAwaitingPersistence = System.currentTimeMillis(); toDebugPrint(newProteins.size(), proteinCount,
/**
 * Joins the identifiers of all cross references of a protein into one string.
 * VALUE_SEPARATOR is inserted before an identifier whenever the text built so far
 * is non-empty.
 *
 * @param protein the protein whose cross references are read
 * @return the joined identifiers; an empty string if the protein has no cross references
 */
protected String getProteinAccession(Protein protein) {
    final StringBuilder joined = new StringBuilder();
    for (ProteinXref xref : protein.getCrossReferences()) {
        if (joined.length() != 0) {
            joined.append(VALUE_SEPARATOR);
        }
        joined.append(xref.getIdentifier());
    }
    return joined.toString();
}
/**
 * Build up a map of protein sequence lengths, keyed by protein Id, for the proteins
 * that proteinDAO.getProteinsBetweenIds returns for the given Id range.
 *
 * @param bottomProtein Protein Id to start from
 * @param topProtein    Protein Id to end on
 * @return Map of protein Id to sequence length
 */
private Map<Long, Integer> getProteinSequenceLengths(long bottomProtein, long topProtein) {
    final Map<Long, Integer> lengthsById = new HashMap<Long, Integer>();
    for (Protein current : proteinDAO.getProteinsBetweenIds(bottomProtein, topProtein)) {
        lengthsById.put(current.getId(), current.getSequenceLength());
    }
    return lengthsById;
}
}
String seq = protein.getSequence(); if (protein.getId() == null) { throw new FastaFileWritingException("The FastaFileWriter class can only write out Protein objects that have already been persisted to the database as it uses the database primary key as the protein ID in the fasta file.", filePath); FastaEntryWriter.writeFastaFileEntry(writer, protein.getId().toString(), seq, sequenceLineLength);
/**
 * Passes every element of the given set to addMatch, one at a time.
 *
 * @param matches the matches to add
 */
private void setMatches(Set<Match> matches) {
    for (final Match match : matches) {
        this.addMatch(match);
    }
}
/**
 * Get the length of the sequence.
 * The value is computed on first use and kept in the sequenceLength field. A stored
 * value of 0 is treated as "not yet computed", so the length is looked up again on
 * each call for as long as the field is 0.
 *
 * @return The length
 * @throws IllegalStateException if the length must be computed and getSequence() returns null
 */
public int getSequenceLength() {
    // Already computed: hand back the stored value.
    if (sequenceLength != 0) {
        return sequenceLength;
    }

    final String residues = getSequence();
    if (residues == null) {
        throw new IllegalStateException("Protein sequence was NULL");
    }
    sequenceLength = residues.length();
    return sequenceLength;
}
final Map<String, Protein> md5ToPrecalcProtein = new HashMap<>(localPrecalculatedProteins.size()); for (Protein precalc : localPrecalculatedProteins) { md5ToPrecalcProtein.put(precalc.getMd5(), precalc); if (md5ToPrecalcProtein.keySet().contains(protein.getMd5())) { precalculatedProteins.add(md5ToPrecalcProtein.get(protein.getMd5())); } else { addProteinToBatch(protein);
/**
 * This method queues a sequence, with (optionally) cross references, for storing.
 * <p/>
 * A Protein is created from the sequence, each cross reference is parsed with
 * XrefParser.getProteinXref and attached to it, and the Protein is added to
 * proteinsAwaitingPrecalcLookup. Once that collection holds more than
 * proteinPrecalcLookupBatchSize entries, lookupProteins is called.
 * A null or empty sequence is ignored.
 *
 * @param sequence        being the protein sequence to store
 * @param analysisJobMap  passed through unchanged to lookupProteins
 * @param crossReferences being a set of Cross references (may be null).
 */
public void store(String sequence, Map<String, SignatureLibraryRelease> analysisJobMap, String... crossReferences) {
    // Nothing to do without a sequence.
    if (sequence == null || sequence.length() == 0) {
        return;
    }

    final Protein protein = new Protein(sequence);
    if (crossReferences != null) {
        for (String reference : crossReferences) {
            protein.addCrossReference(XrefParser.getProteinXref(reference));
        }
    }

    proteinsAwaitingPrecalcLookup.add(protein);

    final boolean batchIsFull = proteinsAwaitingPrecalcLookup.size() > proteinPrecalcLookupBatchSize;
    if (batchIsFull) {
        lookupProteins(analysisJobMap);
    }
}
lastRawMatch.getScore() ); protein.addMatch(match); LOGGER.debug(" protein length = " + protein.getSequenceLength() + " start location of raw match : " + rawMatch.getLocationStart() + " end location of raw match : " + rawMatch.getLocationEnd()); + " protein length = " + protein.getSequenceLength() + " raw match : " + rawMatch.toString()); throw new IllegalStateException("PANTHER match location is out of range " + currentSignatureAc + " protein length = " + protein.getSequenceLength() + " raw match : " + rawMatch.toString()); lastRawMatch.getScore() ); protein.addMatch(match);