/** * Checks for bisulfite conversion, C->T on the positive strand and G->A on the negative strand. */ public static boolean isBisulfiteConverted(final byte read, final byte reference, final boolean negativeStrand) { if (negativeStrand) { if (basesEqual(reference, G) && basesEqual(read, A)) { return true; } } else { if (basesEqual(reference, C) && basesEqual(read, T)) { return true; } } return false; }
/** * Checks for bisulfite conversion, C->T on the positive strand and G->A on the negative strand. */ public static boolean isBisulfiteConverted(final byte read, final byte reference, final boolean negativeStrand) { if (negativeStrand) { if (basesEqual(reference, G) && basesEqual(read, A)) { return true; } } else { if (basesEqual(reference, C) && basesEqual(read, T)) { return true; } } return false; }
/** * Checks for bisulfite conversion, C->T on the positive strand and G-> on the negative strand. */ public static boolean isBisulfiteConverted(final byte read, final byte reference, final boolean negativeStrand) { if (negativeStrand) { if (basesEqual(reference, G) && basesEqual(read, A)) { return true; } } else { if (basesEqual(reference, C) && basesEqual(read, T)) { return true; } } return false; }
public static int calculateGc(final byte[] bases, final int startIndex, final int endIndex, final CalculateGcState state) { if (state.init) { state.init = false; state.gcCount = 0; state.nCount = 0; for (int i = startIndex; i < endIndex; ++i) { final byte base = bases[i]; if (SequenceUtil.basesEqual(base, (byte)'G') || SequenceUtil.basesEqual(base, (byte)'C')) ++state.gcCount; else if (SequenceUtil.basesEqual(base, (byte)'N')) ++state.nCount; } } else { final byte newBase = bases[endIndex - 1]; if (SequenceUtil.basesEqual(newBase, (byte)'G') || SequenceUtil.basesEqual(newBase, (byte)'C')) ++state.gcCount; else if (newBase == 'N') ++state.nCount; if (SequenceUtil.basesEqual(state.priorBase, (byte)'G') || SequenceUtil.basesEqual(state.priorBase, (byte)'C')) --state.gcCount; else if (SequenceUtil.basesEqual(state.priorBase, (byte)'N')) --state.nCount; } state.priorBase = bases[startIndex]; if (state.nCount > 4) return -1; else return (state.gcCount * 100) / (endIndex - startIndex); }
public static int calculateGc(final byte[] bases, final int startIndex, final int endIndex, final CalculateGcState state) { if (state.init) { state.init = false; state.gcCount = 0; state.nCount = 0; for (int i = startIndex; i < endIndex; ++i) { final byte base = bases[i]; if (SequenceUtil.basesEqual(base, (byte)'G') || SequenceUtil.basesEqual(base, (byte)'C')) ++state.gcCount; else if (SequenceUtil.basesEqual(base, (byte)'N')) ++state.nCount; } } else { final byte newBase = bases[endIndex - 1]; if (SequenceUtil.basesEqual(newBase, (byte)'G') || SequenceUtil.basesEqual(newBase, (byte)'C')) ++state.gcCount; else if (newBase == 'N') ++state.nCount; if (SequenceUtil.basesEqual(state.priorBase, (byte)'G') || SequenceUtil.basesEqual(state.priorBase, (byte)'C')) --state.gcCount; else if (SequenceUtil.basesEqual(state.priorBase, (byte)'N')) --state.nCount; } state.priorBase = bases[startIndex]; if (state.nCount > 4) return -1; else return (state.gcCount * 100) / (endIndex - startIndex); }
/** Returns true if the bases are equal OR if the mismatch cannot be accounted for by * bisfulite treatment. C->T on the positive strand and G->A on the negative strand * do not count as mismatches */ public static boolean bisulfiteBasesEqual(final boolean negativeStrand, final byte read, final byte reference) { return (basesEqual(read, reference)) || (isBisulfiteConverted(read, reference, negativeStrand)); }
/** * True if there's a C in the reference as well as read (possibly bisulfite converted) */ private boolean isC(final byte refBase, final byte readBase) { return (SequenceUtil.basesEqual(refBase, SequenceUtil.C) && SequenceUtil.bisulfiteBasesEqual(readBase, refBase)); }
/** * True if there's a C in the reference as well as read (possibly bisulfite converted) */ private boolean isC(final byte refBase, final byte readBase) { return (SequenceUtil.basesEqual(refBase, SequenceUtil.C) && SequenceUtil.bisulfiteBasesEqual(readBase, refBase)); }
/** * Returns true if the bases are equal OR if the mismatch can be accounted for by * bisulfite treatment. C->T on the positive strand and G->A on the negative strand * do not count as mismatches. */ public static boolean bisulfiteBasesEqual(final boolean negativeStrand, final byte read, final byte reference) { return (basesEqual(read, reference)) || (isBisulfiteConverted(read, reference, negativeStrand)); }
/** * Returns true if the bases are equal OR if the mismatch can be accounted for by * bisulfite treatment. C->T on the positive strand and G->A on the negative strand * do not count as mismatches. */ public static boolean bisulfiteBasesEqual(final boolean negativeStrand, final byte read, final byte reference) { return (basesEqual(read, reference)) || (isBisulfiteConverted(read, reference, negativeStrand)); }
/** * Compare barcode sequence to bases from read * * @return how many bases did not match */ private static int countMismatches(final byte[][] barcodeBytes, final byte[][] readSubsequence, final byte[][] qualities, final int minimumBaseQuality) { int numMismatches = 0; for (int j = 0; j < barcodeBytes.length; j++) { for (int i = 0; (i < barcodeBytes[j].length && readSubsequence[j].length > i); ++i) { if (SequenceUtil.isNoCall(readSubsequence[j][i])) { continue; } if (!SequenceUtil.basesEqual(barcodeBytes[j][i], readSubsequence[j][i])) { ++numMismatches; continue; } if (qualities != null && qualities[j][i] < minimumBaseQuality) { ++numMismatches; } } } return numMismatches; } }
/** * Compare barcode sequence to bases from read * * @return how many bases did not match */ private static int countMismatches(final byte[][] barcodeBytes, final byte[][] readSubsequence, final byte[][] qualities, final int minimumBaseQuality) { int numMismatches = 0; for (int j = 0; j < barcodeBytes.length; j++) { for (int i = 0; (i < barcodeBytes[j].length && readSubsequence[j].length > i); ++i) { if (SequenceUtil.isNoCall(readSubsequence[j][i])) { continue; } if (!SequenceUtil.basesEqual(barcodeBytes[j][i], readSubsequence[j][i])) { ++numMismatches; continue; } if (qualities != null && qualities[j][i] < minimumBaseQuality) { ++numMismatches; } } } return numMismatches; } }
/** * Checks a pair of bases for CpG status & quality thresholds */ private boolean isValidCpg(final byte[] refBases, final byte[] readBases, final byte[] readQualities, final int index) { return isC(refBases[index], readBases[index]) && SequenceUtil.basesEqual(refBases[index+1], readBases[index+1]) && isAboveCytoQcThreshold(readQualities, index); }
/** * Checks a pair of bases for CpG status & quality thresholds */ private boolean isValidCpg(final byte[] refBases, final byte[] readBases, final byte[] readQualities, final int index) { return isC(refBases[index], readBases[index]) && SequenceUtil.basesEqual(refBases[index+1], readBases[index+1]) && isAboveCytoQcThreshold(readQualities, index); }
private static int homologyLength(ReferenceLookup lookup, int referenceIndex, int referencePosition, int referenceStep, byte[] seq, int seqPosition, int seqStep) { SAMSequenceRecord refSeq = lookup.getSequenceDictionary().getSequence(referenceIndex); int homlen = 0; boolean complement = referenceStep != seqStep; while (seqPosition >= 0 && seqPosition < seq.length && // next step must still be on the contig referencePosition >= 1 && referencePosition <= refSeq.getSequenceLength()) { byte base = seq[seqPosition]; if (complement) { base = SequenceUtil.complement(base); } if (SequenceUtil.basesEqual(base, lookup.getBase(referenceIndex, referencePosition)) && !SequenceUtil.basesEqual(base, SequenceUtil.N)) { referencePosition += referenceStep; seqPosition += seqStep; homlen++; } else { break; } } return homlen; }
/** * Helper method to handle the various use cases of base comparison. * * @param readBase the read base to match * @param refBase the reference base to match * @param negativeStrand set to true if the base to test is on the negative strand and should be reverse complemented (only applies if bisulfiteSequence is true) * @param bisulfiteSequence set to true if the base to match is a bisulfite sequence and needs to be converted * @param matchAmbiguousRef causes the match to return true when the read base is a subset of the possible IUPAC reference bases, but not the other way around * @return true if the bases match, false otherwise */ private static boolean basesMatch(final byte readBase, final byte refBase, final boolean negativeStrand, final boolean bisulfiteSequence, final boolean matchAmbiguousRef) { if (bisulfiteSequence) { if (matchAmbiguousRef) return bisulfiteBasesMatchWithAmbiguity(negativeStrand, readBase, refBase); else return bisulfiteBasesEqual(negativeStrand, readBase, refBase); } else { if (matchAmbiguousRef) return readBaseMatchesRefBaseWithAmbiguity(readBase, refBase); else return basesEqual(readBase, refBase); } }
/** * Helper method to handle the various use cases of base comparison. * * @param readBase the read base to match * @param refBase the reference base to match * @param negativeStrand set to true if the base to test is on the negative strand and should be reverse complemented (only applies if bisulfiteSequence is true) * @param bisulfiteSequence set to true if the base to match is a bisulfite sequence and needs to be converted * @param matchAmbiguousRef causes the match to return true when the read base is a subset of the possible IUPAC reference bases, but not the other way around * @return true if the bases match, false otherwise */ private static boolean basesMatch(final byte readBase, final byte refBase, final boolean negativeStrand, final boolean bisulfiteSequence, final boolean matchAmbiguousRef) { if (bisulfiteSequence) { if (matchAmbiguousRef) return bisulfiteBasesMatchWithAmbiguity(negativeStrand, readBase, refBase); else return bisulfiteBasesEqual(negativeStrand, readBase, refBase); } else { if (matchAmbiguousRef) return readBaseMatchesRefBaseWithAmbiguity(readBase, refBase); else return basesEqual(readBase, refBase); } }
@Test(dataProvider = "basesEqualDataProvider") public void testBasesEqual(final char base1, final char base2, final boolean expectedB1EqualsB2, final boolean expectedB1ReadMatchesB2Ref, final boolean expectedB2ReadMatchesB1Ref) { final char[] base1UcLc = new char[] { toUpperCase(base1), toLowerCase(base1) }; final char[] base2UcLc = new char[] { toUpperCase(base2), toLowerCase(base2) }; // Test over all permutations - uc vs uc, uc vs lc, lc vs uc, lc vs lc for (char theBase1 : base1UcLc) { for (char theBase2 : base2UcLc) { // for equality, order should not matter final boolean b1EqualsB2 = SequenceUtil.basesEqual((byte) theBase1, (byte) theBase2); Assert.assertEquals(b1EqualsB2, expectedB1EqualsB2, "basesEqual test failed for '" + theBase1 + "' vs. '" + theBase2 + "'"); final boolean b2EqualsB1 = SequenceUtil.basesEqual((byte) theBase2, (byte) theBase1); Assert.assertEquals(b2EqualsB1, expectedB1EqualsB2, "basesEqual test failed for '" + theBase1 + "' vs. '" + theBase2 + "'"); // for ambiguous read/ref matching, the order does matter final boolean b1ReadMatchesB2Ref = SequenceUtil.readBaseMatchesRefBaseWithAmbiguity((byte) theBase1, (byte) theBase2); Assert.assertEquals(b1ReadMatchesB2Ref, expectedB1ReadMatchesB2Ref, "readBaseMatchesRefBaseWithAmbiguity test failed for '" + theBase1 + "' vs. '" + theBase2 + "'"); final boolean b2ReadMatchesB1Ref = SequenceUtil.readBaseMatchesRefBaseWithAmbiguity((byte) theBase2, (byte) theBase1); Assert.assertEquals(b2ReadMatchesB1Ref, expectedB2ReadMatchesB1Ref, "readBaseMatchesRefBaseWithAmbiguity test failed for '" + theBase1 + "' vs. '" + theBase2 + "'"); } } }
/** * Sadly, this is a duplicate of the method above, except that it takes char[] for referenceBases rather * than byte[]. This is because GATK needs it this way. * * TODO: Remove this method when GATK map method is changed to take refseq as byte[]. */ private static int countMismatches(final SAMRecord read, final char[] referenceBases, final int referenceOffset) { int mismatches = 0; final byte[] readBases = read.getReadBases(); for (final AlignmentBlock block : read.getAlignmentBlocks()) { final int readBlockStart = block.getReadStart() - 1; final int referenceBlockStart = block.getReferenceStart() - 1 - referenceOffset; final int length = block.getLength(); for (int i=0; i<length; ++i) { if (!basesEqual(readBases[readBlockStart+i], StringUtil.charToByte(referenceBases[referenceBlockStart+i]))) { ++mismatches; } } } return mismatches; }
@Test public void testSamLocusAndReferenceIterator() { final File reference = new File(TEST_DATA_DIR, "Homo_sapiens_assembly18.trimmed.fasta"); final File samFile = new File(TEST_DATA_DIR, "simpleSmallFile.sam"); final ReferenceSequenceFile referenceSequenceFile = new FastaSequenceFile(reference, false); final ReferenceSequenceFileWalker referenceSequenceFileWalker = new ReferenceSequenceFileWalker(referenceSequenceFile); final SamReader samReader = SamReaderFactory.makeDefault().open(samFile); final SamLocusIterator samLocusIterator = new SamLocusIterator(samReader); final SamLocusAndReferenceIterator samLocusAndReferences = new SamLocusAndReferenceIterator(referenceSequenceFileWalker, samLocusIterator); IntervalList intervalList = new IntervalList(samReader.getFileHeader()); intervalList.add(new Interval("chrM", 1, 36)); intervalList.add(new Interval("chr20", 8401, 8460)); OverlapDetector<Interval> overlapDetector = new OverlapDetector<>(0, 0); overlapDetector.addAll(intervalList.getIntervals(), intervalList.getIntervals()); for (final SamLocusAndReferenceIterator.SAMLocusAndReference samLocusAndReference : samLocusAndReferences) { // The sam file only has coverage in the intervals that are within 'intervalList', and there the coverage should // be exactly 2 since there are two overlapping, paired reads. This is what this test is testing: Assert.assertEquals(samLocusAndReference.getRecordAndOffsets().size(), overlapDetector.overlapsAny(samLocusAndReference.getLocus()) ? 2 : 0, "Position:" + samLocusAndReference.getLocus().toString()); // all the reads are equal to the reference...this is what this test is testing. for (final SamLocusIterator.RecordAndOffset recordAndOffset : samLocusAndReference.getRecordAndOffsets()) Assert.assertTrue(SequenceUtil.basesEqual(samLocusAndReference.getReferenceBase(), recordAndOffset.getReadBase()), "Record: " + recordAndOffset.getRecord() + " Position:" + samLocusAndReference.getLocus().toString()); } }