/** * Calculates the predefined NM tag from the SAM spec: (# of mismatches + # of indels) * For the purposes for calculating mismatches, we do not yet support IUPAC ambiguous codes * (see <code>readBaseMatchesRefBaseWithAmbiguity</code> method). * * @param referenceOffset 0-based offset of the first element of referenceBases relative to the start * of that reference sequence. */ public static int calculateSamNmTag(final SAMRecord read, final byte[] referenceBases, final int referenceOffset) { return calculateSamNmTag(read, referenceBases, referenceOffset, false); }
/** * Throws an exception if both parameters are non-null and unequal. * * @param s1 a list of sequence headers * @param s2 a second list of sequence headers */ public static void assertSequenceDictionariesEqual(final SAMSequenceDictionary s1, final SAMSequenceDictionary s2) { assertSequenceDictionariesEqual(s1, s2, false); }
/** * Helper method to handle the various use cases of base comparison. * * @param readBase the read base to match * @param refBase the reference base to match * @param negativeStrand set to true if the base to test is on the negative strand and should be reverse complemented (only applies if bisulfiteSequence is true) * @param bisulfiteSequence set to true if the base to match is a bisulfite sequence and needs to be converted * @param matchAmbiguousRef causes the match to return true when the read base is a subset of the possible IUPAC reference bases, but not the other way around * @return true if the bases match, false otherwise */ private static boolean basesMatch(final byte readBase, final byte refBase, final boolean negativeStrand, final boolean bisulfiteSequence, final boolean matchAmbiguousRef) { if (bisulfiteSequence) { if (matchAmbiguousRef) return bisulfiteBasesMatchWithAmbiguity(negativeStrand, readBase, refBase); else return bisulfiteBasesEqual(negativeStrand, readBase, refBase); } else { if (matchAmbiguousRef) return readBaseMatchesRefBaseWithAmbiguity(readBase, refBase); else return basesEqual(readBase, refBase); } }
/** * True if there's a C in the reference as well as read (possibly bisulfite converted) */ private boolean isC(final byte refBase, final byte readBase) { return (SequenceUtil.basesEqual(refBase, SequenceUtil.C) && SequenceUtil.bisulfiteBasesEqual(readBase, refBase)); }
/** Returns true if the bases are equal OR if the mismatch cannot be accounted for by * bisfulite treatment. C->T on the positive strand and G->A on the negative strand * do not count as mismatches */ public static boolean bisulfiteBasesEqual(final boolean negativeStrand, final byte read, final byte reference) { return (basesEqual(read, reference)) || (isBisulfiteConverted(read, reference, negativeStrand)); }
/** * Same as above, but use <code>readBaseMatchesRefBaseWithAmbiguity</code> instead of <code>basesEqual</code>. * Note that <code>isBisulfiteConverted</code> is not affected because it only applies when the * reference base is non-ambiguous. */ public static boolean bisulfiteBasesMatchWithAmbiguity(final boolean negativeStrand, final byte read, final byte reference) { return (readBaseMatchesRefBaseWithAmbiguity(read, reference)) || (isBisulfiteConverted(read, reference, negativeStrand)); }
smallReadCount++; return; } else if (SequenceUtil.countMismatches(samRecord, refBases, true) > Math.round(samRecord.getReadLength() * maxMismatchRate)) { mismatchCount++; return; SequenceUtil.reverseComplement(refFragment); SequenceUtil.reverseComplement(readFragment); SequenceUtil.reverseQualities(readQualityFragment); if ((SequenceUtil.basesEqual(refFragment[i], SequenceUtil.C)) && (SequenceUtil.basesEqual(refFragment[i+1], SequenceUtil.G))) { final CpgLocation curLocation = new CpgLocation(samRecord.getReferenceName(), curRefIndex); cpgTotal.increment(curLocation); if (SequenceUtil.isBisulfiteConverted(readFragment[i], refFragment[i])) { cpgConverted.increment(curLocation); SequenceUtil.bisulfiteBasesEqual(false, readFragment[i+1], refFragment[i+1])) { if (SequenceUtil.isBisulfiteConverted(readFragment[i], refFragment[i])) { nCytoConverted++;
@Test public void testReverseComplement() { Assert.assertEquals(SequenceUtil.reverseComplement("ABCDEFGHIJKLMNOPQRSTUVWXYZ"),"ZYXWVUASRQPONMLKJIHCFEDGBT"); Assert.assertEquals(SequenceUtil.reverseComplement("abcdefghijklmnopqrstuvwxy"),"yxwvuasrqponmlkjihcfedgbt"); //missing "z" on purpose so that we test both even-lengthed and odd-lengthed strings }
private void addRead(final GcObject gcObj, final SAMRecord rec, final String group, final byte[] gc, final byte[] refBases) { if (!rec.getReadPairedFlag() || rec.getFirstOfPairFlag()) ++gcObj.totalClusters; final int pos = rec.getReadNegativeStrandFlag() ? rec.getAlignmentEnd() - scanWindowSize : rec.getAlignmentStart(); ++gcObj.totalAlignedReads; if (pos > 0) { final int windowGc = gc[pos]; if (windowGc >= 0) { ++gcObj.readsByGc[windowGc]; gcObj.basesByGc[windowGc] += rec.getReadLength(); gcObj.errorsByGc[windowGc] += SequenceUtil.countMismatches(rec, refBases, bisulfite) + SequenceUtil.countInsertedBases(rec) + SequenceUtil.countDeletedBases(rec); } } if (gcObj.group == null) { gcObj.group = group; } } }
/** Calculates and sets the NM, MD, and and UQ tags from the record and the reference * * @param record the record to be fixed * @param refSeqWalker a ReferenceSequenceWalker that will be used to traverse the reference * @param isBisulfiteSequence a flag indicating whether the sequence came from bisulfite-sequencing which would imply a different * calculation of the NM tag. * * No return value, modifies the provided record. */ public static void fixNmMdAndUq(final SAMRecord record, final ReferenceSequenceFileWalker refSeqWalker, final boolean isBisulfiteSequence) { final byte[] referenceBases = refSeqWalker.get(record.getReferenceIndex()).getBases(); // only recalculate NM if it isn't bisulfite, since it needs to be treated specially below SequenceUtil.calculateMdAndNmTags(record, referenceBases, true, !isBisulfiteSequence); if (isBisulfiteSequence) { // recalculate the NM tag for bisulfite data record.setAttribute(SAMTag.NM.name(), SequenceUtil.calculateSamNmTag(record, referenceBases, 0, isBisulfiteSequence)); } fixUq(record, refSeqWalker, isBisulfiteSequence); }
/** * Calculates the number of mismatches between the read and the reference sequence provided. * * @param referenceBases Array of ASCII bytes that covers at least the the portion of the reference sequence * to which read is aligned from getReferenceStart to getReferenceEnd. * @param bisulfiteSequence If this is true, it is assumed that the reads were bisulfite treated * and C->T on the positive strand and G->A on the negative strand will not be counted * as mismatches. */ public static int countMismatches(final SAMRecord read, final byte[] referenceBases, final boolean bisulfiteSequence) { return countMismatches(read, referenceBases, 0, bisulfiteSequence); }
public static void reverseComplement(final byte[] bases, final int offset, final int len) { final int lastIndex = len - 1; int i, j; for (i = offset, j = offset + lastIndex; i < j; ++i, --j) { final byte tmp = complement(bases[i]); bases[i] = complement(bases[j]); bases[j] = tmp; } if (len % 2 == 1) { bases[i] = complement(bases[i]); } }
@Test(dataProvider = "countInsertedAndDeletedBasesTestCases") public void testCountInsertedAndDeletedBases(final String cigarString, final int insertedBases, final int deletedBases) { final Cigar cigar = TextCigarCodec.decode(cigarString); Assert.assertEquals(SequenceUtil.countInsertedBases(cigar), insertedBases); Assert.assertEquals(SequenceUtil.countDeletedBases(cigar), deletedBases); }
/** * Calculates the sum of qualities for mismatched bases in the read. * @param referenceBases Array of ASCII bytes in which the 0th position in the array corresponds * to the first element of the reference sequence to which read is aligned. */ public static int sumQualitiesOfMismatches(final SAMRecord read, final byte[] referenceBases) { return sumQualitiesOfMismatches(read, referenceBases, 0, false); }
private static char stratifySequenceBase(final byte input, final Boolean getComplement) { return (char) SequenceUtil.upperCase(getComplement ? SequenceUtil.complement(input) : input); }
@Test(dataProvider = "mismatchCountsDataProvider") public void testCountMismatches(final String readString, final String cigar, final String reference, final int expectedMismatchesExact, final int expectedMismatchesAmbiguous) { final SAMRecord rec = new SAMRecord(null); rec.setReadName("test"); rec.setReadString(readString); final byte[] byteArray = new byte[readString.length()]; Arrays.fill(byteArray, (byte)33); rec.setBaseQualities(byteArray); rec.setCigarString(cigar); final byte[] refBases = StringUtil.stringToBytes(reference); final int nExact = SequenceUtil.countMismatches(rec, refBases, -1, false, false); Assert.assertEquals(nExact, expectedMismatchesExact); final int sumMismatchesQualityExact = SequenceUtil.sumQualitiesOfMismatches(rec, refBases, -1, false); Assert.assertEquals(sumMismatchesQualityExact, expectedMismatchesExact * 33); final int nAmbiguous = SequenceUtil.countMismatches(rec, refBases, -1, false, true); Assert.assertEquals(nAmbiguous, expectedMismatchesAmbiguous); }
/** * Checks for bisulfite conversion, C->T on the positive strand and G->A on the negative strand. */ public static boolean isBisulfiteConverted(final byte read, final byte reference, final boolean negativeStrand) { if (negativeStrand) { if (basesEqual(reference, G) && basesEqual(read, A)) { return true; } } else { if (basesEqual(reference, C) && basesEqual(read, T)) { return true; } } return false; }
private static int homologyLength(ReferenceLookup lookup, int referenceIndex, int referencePosition, int referenceStep, byte[] seq, int seqPosition, int seqStep) { SAMSequenceRecord refSeq = lookup.getSequenceDictionary().getSequence(referenceIndex); int homlen = 0; boolean complement = referenceStep != seqStep; while (seqPosition >= 0 && seqPosition < seq.length && // next step must still be on the contig referencePosition >= 1 && referencePosition <= refSeq.getSequenceLength()) { byte base = seq[seqPosition]; if (complement) { base = SequenceUtil.complement(base); } if (SequenceUtil.basesEqual(base, lookup.getBase(referenceIndex, referencePosition)) && !SequenceUtil.basesEqual(base, SequenceUtil.N)) { referencePosition += referenceStep; seqPosition += seqStep; homlen++; } else { break; } } return homlen; }
private static boolean matchesAdapterSequence(String adapter, byte[] read, int readStartOffset, int readDirection, boolean complementAdapter) { boolean nonzeroSize = false; for (int i = 0; readStartOffset + i * readDirection < read.length && readStartOffset + i * readDirection >= 0 && i < adapter.length(); i++) { byte readBase = read[readStartOffset + i * readDirection]; byte adapterBase = (byte)adapter.charAt(i); if (complementAdapter) adapterBase = SequenceUtil.complement(adapterBase); if (SequenceUtil.isValidBase(readBase) && readBase != adapterBase) { return false; } nonzeroSize = true; } return nonzeroSize; // no bases compared = no adapter match } }
read.setAttribute(SAMTag.NM.name(), SequenceUtil.calculateSamNmTag(read, reference, leftmostIndex - 1)); if ( read.getAttribute(SAMTag.UQ.name()) != null ) read.setAttribute(SAMTag.UQ.name(), SequenceUtil.sumQualitiesOfMismatches(read, reference, leftmostIndex-1)); } catch (Exception e) {