/** * Calculates the number of mismatches between the read and the reference sequence provided. * * @param referenceBases Array of ASCII bytes that covers at least the the portion of the reference sequence * to which read is aligned from getReferenceStart to getReferenceEnd. * @param bisulfiteSequence If this is true, it is assumed that the reads were bisulfite treated * and C->T on the positive strand and G->A on the negative strand will not be counted * as mismatches. */ public static int countMismatches(final SAMRecord read, final byte[] referenceBases, final boolean bisulfiteSequence) { return countMismatches(read, referenceBases, 0, bisulfiteSequence); }
/** * Calculates the number of mismatches between the read and the reference sequence provided. * * @param referenceBases Array of ASCII bytes that covers at least the the portion of the reference sequence * to which read is aligned from getReferenceStart to getReferenceEnd. * @param bisulfiteSequence If this is true, it is assumed that the reads were bisulfite treated * and C->T on the positive strand and G->A on the negative strand will not be counted * as mismatches. */ public static int countMismatches(final SAMRecord read, final byte[] referenceBases, final boolean bisulfiteSequence) { return countMismatches(read, referenceBases, 0, bisulfiteSequence); }
/** * Calculates the number of mismatches between the read and the reference sequence provided. * * @param referenceBases Array of ASCII bytes that covers at least the the portion of the reference sequence * to which read is aligned from getReferenceStart to getReferenceEnd. * @param bisulfiteSequence If this is true, it is assumed that the reads were bisulfite treated * and C->T on the positive strand and G->A on the negative strand will not be counted * as mismatches. */ public static int countMismatches(final SAMRecord read, final byte[] referenceBases, final boolean bisulfiteSequence) { return countMismatches(read, referenceBases, 0, bisulfiteSequence); }
/** Calculates the number of mismatches between the read and the reference sequence provided. */ public static int countMismatches(final SAMRecord read, final byte[] referenceBases) { return countMismatches(read, referenceBases, 0, false); }
/** Calculates the number of mismatches between the read and the reference sequence provided. */ public static int countMismatches(final SAMRecord read, final byte[] referenceBases, final int referenceOffset) { return countMismatches(read, referenceBases, referenceOffset, false); }
/** Calculates the number of mismatches between the read and the reference sequence provided. */ public static int countMismatches(final SAMRecord read, final byte[] referenceBases, final int referenceOffset) { return countMismatches(read, referenceBases, referenceOffset, false); }
/** * Calculates the number of mismatches between the read and the reference sequence provided. * * @param referenceBases Array of ASCII bytes that covers at least the the portion of the reference sequence * to which read is aligned from getReferenceStart to getReferenceEnd. * @param referenceOffset 0-based offset of the first element of referenceBases relative to the start * of that reference sequence. * @param bisulfiteSequence If this is true, it is assumed that the reads were bisulfite treated * and C->T on the positive strand and G->A on the negative strand will not be counted * as mismatches. */ public static int countMismatches(final SAMRecord read, final byte[] referenceBases, final int referenceOffset, final boolean bisulfiteSequence) { return countMismatches(read, referenceBases, referenceOffset, bisulfiteSequence, false); }
/** Calculates the number of mismatches between the read and the reference sequence provided. */ public static int countMismatches(final SAMRecord read, final byte[] referenceBases) { return countMismatches(read, referenceBases, 0, false); }
/** Calculates the number of mismatches between the read and the reference sequence provided. */ public static int countMismatches(final SAMRecord read, final byte[] referenceBases, final int referenceOffset) { return countMismatches(read, referenceBases, referenceOffset, false); }
/** Calculates the number of mismatches between the read and the reference sequence provided. */ public static int countMismatches(final SAMRecord read, final byte[] referenceBases) { return countMismatches(read, referenceBases, 0, false); }
/** * Calculates the number of mismatches between the read and the reference sequence provided. * * @param referenceBases Array of ASCII bytes that covers at least the the portion of the reference sequence * to which read is aligned from getReferenceStart to getReferenceEnd. * @param referenceOffset 0-based offset of the first element of referenceBases relative to the start * of that reference sequence. * @param bisulfiteSequence If this is true, it is assumed that the reads were bisulfite treated * and C->T on the positive strand and G->A on the negative strand will not be counted * as mismatches. */ public static int countMismatches(final SAMRecord read, final byte[] referenceBases, final int referenceOffset, final boolean bisulfiteSequence) { return countMismatches(read, referenceBases, referenceOffset, bisulfiteSequence, false); }
@Test(dataProvider = "mismatchCountsDataProvider") public void testCountMismatches(final String readString, final String cigar, final String reference, final int expectedMismatchesExact, final int expectedMismatchesAmbiguous) { final SAMRecord rec = new SAMRecord(null); rec.setReadName("test"); rec.setReadString(readString); final byte[] byteArray = new byte[readString.length()]; Arrays.fill(byteArray, (byte)33); rec.setBaseQualities(byteArray); rec.setCigarString(cigar); final byte[] refBases = StringUtil.stringToBytes(reference); final int nExact = SequenceUtil.countMismatches(rec, refBases, -1, false, false); Assert.assertEquals(nExact, expectedMismatchesExact); final int sumMismatchesQualityExact = SequenceUtil.sumQualitiesOfMismatches(rec, refBases, -1, false); Assert.assertEquals(sumMismatchesQualityExact, expectedMismatchesExact * 33); final int nAmbiguous = SequenceUtil.countMismatches(rec, refBases, -1, false, true); Assert.assertEquals(nAmbiguous, expectedMismatchesAmbiguous); }
/** * Sadly, this is a duplicate of the method above, except that it takes char[] for referenceBases rather * than byte[]. This is because GATK needs it this way. * * TODO: Remove this method when GATK map method is changed to take refseq as byte[]. */ public static int calculateSamNmTag(final SAMRecord read, final char[] referenceBases, final int referenceOffset) { int samNm = countMismatches(read, referenceBases, referenceOffset); for (final CigarElement el : read.getCigar().getCigarElements()) { if (el.getOperator() == CigarOperator.INSERTION || el.getOperator() == CigarOperator.DELETION) { samNm += el.getLength(); } } return samNm; }
/** * Calculates the predefined NM tag from the SAM spec: (# of mismatches + # of indels) * For the purposes for calculating mismatches, we do not yet support IUPAC ambiguous codes * (see <code>readBaseMatchesRefBaseWithAmbiguity</code> method). * * @param referenceOffset 0-based offset of the first element of referenceBases relative to the start * of that reference sequence. * @param bisulfiteSequence If this is true, it is assumed that the reads were bisulfite treated * and C->T on the positive strand and G->A on the negative strand will not be counted * as mismatches. */ public static int calculateSamNmTag(final SAMRecord read, final byte[] referenceBases, final int referenceOffset, final boolean bisulfiteSequence) { int samNm = countMismatches(read, referenceBases, referenceOffset, bisulfiteSequence, false); for (final CigarElement el : read.getCigar().getCigarElements()) { if (el.getOperator() == CigarOperator.INSERTION || el.getOperator() == CigarOperator.DELETION) { samNm += el.getLength(); } } return samNm; }
/** * Calculates the for the predefined NM tag from the SAM spec. To the result of * countMismatches() it adds 1 for each indel. * @param referenceOffset 0-based offset of the first element of referenceBases relative to the start * of that reference sequence. * @param bisulfiteSequence If this is true, it is assumed that the reads were bisulfite treated * and C->T on the positive strand and G->A on the negative strand will not be counted * as mismatches. */ public static int calculateSamNmTag(final SAMRecord read, final byte[] referenceBases, final int referenceOffset, final boolean bisulfiteSequence) { int samNm = countMismatches(read, referenceBases, referenceOffset, bisulfiteSequence); for (final CigarElement el : read.getCigar().getCigarElements()) { if (el.getOperator() == CigarOperator.INSERTION || el.getOperator() == CigarOperator.DELETION) { samNm += el.getLength(); } } return samNm; }
/** * Calculates the predefined NM tag from the SAM spec: (# of mismatches + # of indels) * For the purposes for calculating mismatches, we do not yet support IUPAC ambiguous codes * (see <code>readBaseMatchesRefBaseWithAmbiguity</code> method). * * @param referenceOffset 0-based offset of the first element of referenceBases relative to the start * of that reference sequence. * @param bisulfiteSequence If this is true, it is assumed that the reads were bisulfite treated * and C->T on the positive strand and G->A on the negative strand will not be counted * as mismatches. */ public static int calculateSamNmTag(final SAMRecord read, final byte[] referenceBases, final int referenceOffset, final boolean bisulfiteSequence) { int samNm = countMismatches(read, referenceBases, referenceOffset, bisulfiteSequence, false); for (final CigarElement el : read.getCigar().getCigarElements()) { if (el.getOperator() == CigarOperator.INSERTION || el.getOperator() == CigarOperator.DELETION) { samNm += el.getLength(); } } return samNm; }
@Test(dataProvider = "mismatchBisulfiteCountsDataProvider") public void testMismatchBisulfiteCounts(final String readString, final String cigar, final String reference, final boolean positiveStrand, final int expectedMismatches) { final byte baseQuality = 30; final SAMRecord rec = new SAMRecord(null); rec.setReadName("test"); rec.setReadString(readString); rec.setReadNegativeStrandFlag(!positiveStrand); final byte[] byteArray = new byte[readString.length()]; Arrays.fill(byteArray,baseQuality); rec.setBaseQualities(byteArray); rec.setCigarString(cigar); final byte[] refBases = StringUtil.stringToBytes(reference); final int nExact = SequenceUtil.countMismatches(rec, refBases, -1, true, false); Assert.assertEquals(nExact, expectedMismatches); final int sumMismatchesQualityExact = SequenceUtil.sumQualitiesOfMismatches(rec, refBases, -1, true); Assert.assertEquals(sumMismatchesQualityExact, expectedMismatches * baseQuality); }
private void addRead(final GcObject gcObj, final SAMRecord rec, final String group, final byte[] gc, final byte[] refBases) { if (!rec.getReadPairedFlag() || rec.getFirstOfPairFlag()) ++gcObj.totalClusters; final int pos = rec.getReadNegativeStrandFlag() ? rec.getAlignmentEnd() - scanWindowSize : rec.getAlignmentStart(); ++gcObj.totalAlignedReads; if (pos > 0) { final int windowGc = gc[pos]; if (windowGc >= 0) { ++gcObj.readsByGc[windowGc]; gcObj.basesByGc[windowGc] += rec.getReadLength(); gcObj.errorsByGc[windowGc] += SequenceUtil.countMismatches(rec, refBases, bisulfite) + SequenceUtil.countInsertedBases(rec) + SequenceUtil.countDeletedBases(rec); } } if (gcObj.group == null) { gcObj.group = group; } } }
private void addRead(final GcObject gcObj, final SAMRecord rec, final String group, final byte[] gc, final byte[] refBases) { if (!rec.getReadPairedFlag() || rec.getFirstOfPairFlag()) ++gcObj.totalClusters; final int pos = rec.getReadNegativeStrandFlag() ? rec.getAlignmentEnd() - scanWindowSize : rec.getAlignmentStart(); ++gcObj.totalAlignedReads; if (pos > 0) { final int windowGc = gc[pos]; if (windowGc >= 0) { ++gcObj.readsByGc[windowGc]; gcObj.basesByGc[windowGc] += rec.getReadLength(); gcObj.errorsByGc[windowGc] += SequenceUtil.countMismatches(rec, refBases, bisulfite) + SequenceUtil.countInsertedBases(rec) + SequenceUtil.countDeletedBases(rec); } } if (gcObj.group == null) { gcObj.group = group; } } }
smallReadCount++; return; } else if (SequenceUtil.countMismatches(samRecord, refBases, true) > Math.round(samRecord.getReadLength() * maxMismatchRate)) { mismatchCount++; return;