public PackedReferenceSequence(ReferenceSequence seq) { super(seq.getBases(), false, false); this.name = seq.getName(); this.contigIndex = seq.getContigIndex(); this.length = seq.length(); this.ambiguous = new BitSet(seq.length()); byte[] seqBases = seq.getBases(); for (int i = 0; i < length; i++) { if (KmerEncodingHelper.isAmbiguous(seqBases[i])) { ambiguous.set(i); } } } public ReferenceSequence getSequence() {
@Override public ReferenceSequence getSubsequenceAt(String contig, long start, long stop) { int length = (int)(stop - start + 1); ReferenceSequence fullContig = getSequence(contig); if (length > fullContig.length()) { throw new IllegalArgumentException("subsequence out of contig bounds"); } if (start > stop + 1) { throw new IllegalArgumentException("start after stop"); } byte[] target = new byte[length]; System.arraycopy(fullContig.getBases(), (int) (start - 1), target, 0, target.length); return new ReferenceSequence(fullContig.getName(), fullContig.getContigIndex(), target); } @Override
@Override public List<Entry> call() throws Exception { List<Entry> list = new ArrayList<RefRepo.Entry>(); ReferenceSequenceFile rsFile = ReferenceSequenceFileFactory.getReferenceSequenceFile(file); ReferenceSequence sequence = null; while ((sequence = rsFile.nextSequence()) != null) { sequence.getBases(); Entry e = new Entry(); e.md5 = Utils.calculateMD5String(sequence.getBases()); e.file = "file://" + file.getAbsolutePath(); e.name = sequence.getName(); e.length = sequence.length(); log.info(String.format("New entry: %s", e.toString())); list.add(e); } return list; } };
@Override List<Bait> design(final BaitDesigner designer, final Interval target, final ReferenceSequence reference) { final List<Bait> baits = new LinkedList<Bait>(); final int baitSize = designer.BAIT_SIZE; final int baitOffset = designer.BAIT_OFFSET; final int lastPossibleBaitStart = Math.min(target.getEnd(), reference.length() - baitSize); final int baitCount = 1 + (int) Math.floor((lastPossibleBaitStart - target.getStart()) / (double) baitOffset); int i = 0; for (int start = target.getStart(); start < lastPossibleBaitStart; start += baitOffset) { final Bait bait = new Bait(target.getContig(), start, CoordMath.getEnd(start, baitSize), target.isNegativeStrand(), designer.makeBaitName(target.getName(), ++i, baitCount)); bait.addBases(reference, designer.DESIGN_ON_TARGET_STRAND); baits.add(bait); } return baits; } };
@Test(dataProvider="fastaTestParameters") public void testSingleShortSequence(int chroms, int basesPerChrom) throws Exception { File f = makeRandomReference(chroms, basesPerChrom); ReferenceSequenceFile ref = ReferenceSequenceFileFactory.getReferenceSequenceFile(f); for (int i=1; i<=chroms; ++i) { ReferenceSequence seq = ref.nextSequence(); Assert.assertNotNull(seq); Assert.assertEquals(seq.length(), basesPerChrom); Assert.assertEquals(seq.getName(), "chr" + i); Assert.assertEquals(seq.getContigIndex(), i-1); } Assert.assertNull(ref.nextSequence()); }
public Integer map(RefMetaDataTracker tracker, ReferenceContext ref, AlignmentContext context) { final String locusContigName = ref.getLocus().getContig(); if ( ! locusContigName.equals(contigName) ) { contigName = locusContigName; ReferenceSequence refSeq = uncachedRef.getSequence(contigName); contigStart = 1; contigEnd = contigStart + refSeq.length() - 1; uncachedBases = uncachedRef.getSubsequenceAt(contigName, contigStart, contigEnd).getBases(); logger.info(String.format("Loading contig %s (%d-%d)", contigName, contigStart, contigEnd)); } final byte refBase = ref.getBase(); if (! ( BaseUtils.isRegularBase(refBase) || isExtendFastaBase(refBase) ) ) throwError(ref, String.format("Refbase isn't a regular base (%d %c)", refBase, (char)refBase)); // check bases are equal final int pos = (int)context.getPosition() - contigStart; if ( pos > contigEnd ) throwError(ref, String.format("off contig (len=%d)", contigEnd)); final byte uncachedBase = uncachedBases[pos]; if ( uncachedBase != refBase ) throwError(ref, String.format("Provided refBase (%d %c) not equal to uncached one (%d %c)", refBase, (char)refBase, uncachedBase, (char)uncachedBase)); return 1; }
@Override List<Bait> design(final BaitDesigner designer, final Interval target, final ReferenceSequence reference) { final List<Bait> baits = new LinkedList<Bait>(); final int baitSize = designer.BAIT_SIZE; final int baitOffset = designer.BAIT_OFFSET; final int lastPossibleBaitStart = Math.min(target.getEnd(), reference.length() - baitSize); final int baitCount = 1 + (int) Math.floor((lastPossibleBaitStart - target.getStart()) / (double) baitOffset); int i = 0; for (int start = target.getStart(); start < lastPossibleBaitStart; start += baitOffset) { final Bait bait = new Bait(target.getContig(), start, CoordMath.getEnd(start, baitSize), target.isNegativeStrand(), designer.makeBaitName(target.getName(), ++i, baitCount)); bait.addBases(reference, designer.DESIGN_ON_TARGET_STRAND); baits.add(bait); } return baits; } };
@Test public void testOverlappingReferenceBases() { Shard shard = new MockLocusShard(genomeLocParser,Collections.singletonList(genomeLocParser.createGenomeLoc(sequenceFile.getSequenceDictionary().getSequence(0).getSequenceName(), sequenceFile.getSequence("chrM").length() - 10, sequenceFile.getSequence("chrM").length()))); LocusShardDataProvider dataProvider = new LocusShardDataProvider(shard, null, genomeLocParser, shard.getGenomeLocs().get(0), null, sequenceFile, null); LocusReferenceView view = new LocusReferenceView(dataProvider); byte[] results = view.getReferenceBases(genomeLocParser.createGenomeLoc(sequenceFile.getSequenceDictionary().getSequence(0).getSequenceName(), sequenceFile.getSequence("chrM").length() - 10, sequenceFile.getSequence("chrM").length() + 9)); System.out.printf("results are %s%n", new String(results)); Assert.assertEquals(results.length, 20); for (int x = 0; x < results.length; x++) { if (x <= 10) Assert.assertTrue(results[x] != 'X'); else Assert.assertTrue(results[x] == 'X'); } }
@Test(dataProvider="comparative") public void testNextElementOfIterator(ReferenceSequenceFile originalSequenceFile, AbstractIndexedFastaSequenceFile sequenceFile) { // Skip past the first one and load the second one. originalSequenceFile.nextSequence(); ReferenceSequence expectedSequence = originalSequenceFile.nextSequence(); long startTime = System.currentTimeMillis(); sequenceFile.nextSequence(); ReferenceSequence sequence = sequenceFile.nextSequence(); long endTime = System.currentTimeMillis(); Assert.assertEquals(sequence.getName(),"chr20","Sequence contig is not correct"); Assert.assertEquals(sequence.getContigIndex(),1,"Sequence contig index is not correct"); Assert.assertEquals(sequence.length(),expectedSequence.length(),"Sequence size is not correct"); Assert.assertEquals(StringUtil.bytesToString(sequence.getBases()),StringUtil.bytesToString(expectedSequence.getBases()),"chr1 is incorrect"); CloserUtil.close(originalSequenceFile); CloserUtil.close(sequenceFile); System.err.printf("testNextElementOfIterator runtime: %dms%n", (endTime - startTime)) ; }
@Test @Transactional(propagation = Propagation.REQUIRED) public void getSequence() throws Exception { ChromosomeReferenceSequence referenceSequence = new ChromosomeReferenceSequence(chromosomes, referenceId, referenceManager); Assert.assertTrue(referenceSequence.getSequenceDictionary() != null); Assert.assertEquals(EXPECTED_CHROMOSOMES.size(), referenceSequence.getSequenceDictionary().size()); for (Map.Entry<String, Integer> chrSize : EXPECTED_CHROMOSOMES.entrySet()) { ReferenceSequence sequence = referenceSequence.getSequence(chrSize.getKey()); Assert.assertEquals(chrSize.getValue().intValue(), sequence.length()); Assert.assertEquals(EXPECTED_SEQUENCES.get(chrSize.getKey()), sequence.getBaseString()); } }
t2 = new Interval(target.getContig(), Math.max(target.getStart() - left, 1), Math.min(target.getEnd() + right, reference.length()), target.isNegativeStrand(), target.getName()); int end = CoordMath.getEnd(start, baitSize); if (end > reference.length()) break; if (end + move <= reference.length() && getMaskedBaseCount(bases, start + move - 1, end + move) <= MAX_MASKED) { start = start + move; end = end + move;
t2 = new Interval(target.getContig(), Math.max(target.getStart() - left, 1), Math.min(target.getEnd() + right, reference.length()), target.isNegativeStrand(), target.getName()); int end = CoordMath.getEnd(start, baitSize); if (end > reference.length()) break; if (end + move <= reference.length() && getMaskedBaseCount(bases, start + move - 1, end + move) <= MAX_MASKED) { start = start + move; end = end + move;
@Test(dataProvider="comparative") public void testReset(ReferenceSequenceFile originalSequenceFile, AbstractIndexedFastaSequenceFile sequenceFile) { // Skip past the first one and load the second one. ReferenceSequence expectedSequence = originalSequenceFile.nextSequence(); long startTime = System.currentTimeMillis(); sequenceFile.nextSequence(); sequenceFile.nextSequence(); sequenceFile.reset(); ReferenceSequence sequence = sequenceFile.nextSequence(); long endTime = System.currentTimeMillis(); Assert.assertEquals(sequence.getName(),"chrM","Sequence contig is not correct"); Assert.assertEquals(sequence.getContigIndex(),0,"Sequence contig index is not correct"); Assert.assertEquals(sequence.length(),expectedSequence.length(), "Sequence size is not correct"); Assert.assertEquals(StringUtil.bytesToString(sequence.getBases()),StringUtil.bytesToString(expectedSequence.getBases()),"chrM is incorrect"); CloserUtil.close(originalSequenceFile); CloserUtil.close(sequenceFile); System.err.printf("testReset runtime: %dms%n", (endTime - startTime)) ; }
/** * Create one SAMSequenceRecord from a single fasta sequence */ private SAMSequenceRecord makeSequenceRecord(final ReferenceSequence refSeq) { final SAMSequenceRecord ret = new SAMSequenceRecord(refSeq.getName(), refSeq.length()); // Compute MD5 of upcased bases final byte[] bases = refSeq.getBases(); for (int i = 0; i < bases.length; ++i) { bases[i] = StringUtil.toUpperCase(bases[i]); } ret.setAttribute(SAMSequenceRecord.MD5_TAG, md5Hash(bases)); if (GENOME_ASSEMBLY != null) { ret.setAttribute(SAMSequenceRecord.ASSEMBLY_TAG, GENOME_ASSEMBLY); } ret.setAttribute(SAMSequenceRecord.URI_TAG, URI); if (SPECIES != null) { ret.setAttribute(SAMSequenceRecord.SPECIES_TAG, SPECIES); } return ret; }
/** * Create one SAMSequenceRecord from a single fasta sequence */ private SAMSequenceRecord makeSequenceRecord(final ReferenceSequence refSeq) { final SAMSequenceRecord ret = new SAMSequenceRecord(refSeq.getName(), refSeq.length()); // Compute MD5 of upcased bases final byte[] bases = refSeq.getBases(); for (int i = 0; i < bases.length; ++i) { bases[i] = StringUtil.toUpperCase(bases[i]); } ret.setAttribute(SAMSequenceRecord.MD5_TAG, md5Hash(bases)); if (GENOME_ASSEMBLY != null) { ret.setAttribute(SAMSequenceRecord.ASSEMBLY_TAG, GENOME_ASSEMBLY); } ret.setAttribute(SAMSequenceRecord.URI_TAG, URI); if (SPECIES != null) { ret.setAttribute(SAMSequenceRecord.SPECIES_TAG, SPECIES); } return ret; }
if (contextStartIndex < 0 || contextStartIndex + contextFullLength > ref.length()) continue;
if (contextStartIndex < 0 || contextStartIndex + contextFullLength > ref.length()) continue;