/**
 * Builds the textual form of this object: the fixed prefix
 * {@code "ReferenceSequence "} followed by whatever {@code getName()} returns.
 *
 * @return the prefix concatenated with the name
 */
public String toString() {
    final String description = "ReferenceSequence " + getName();
    return description;
}
} // closes the enclosing declaration; its header lies outside this excerpt
/**
 * Renders this object as {@code "ReferenceSequence "} immediately followed by
 * the value of {@code getName()}.
 *
 * @return the rendered text
 */
public String toString() {
    final StringBuilder text = new StringBuilder("ReferenceSequence ");
    text.append(getName());
    return text.toString();
}
} // closes the enclosing declaration; its header lies outside this excerpt
/**
 * Returns a short label for this object, made of the literal
 * {@code "ReferenceSequence "} and the result of {@code getName()}.
 *
 * @return the label text
 */
public String toString() {
    final String prefix = "ReferenceSequence ";
    return prefix + getName();
}
} // closes the enclosing declaration; its header lies outside this excerpt
/**
 * Opens the reference file at {@code referenceFilePath}, extracts a range of
 * bases from the sequence called {@code seqName}, and closes the file again.
 *
 * <p>The range handed to {@code getSubsequenceAt} starts at {@code from} and
 * stops at {@code from + length}.
 *
 * @param referenceFilePath path of the reference file to open
 * @param seqName           name of the sequence to read from
 * @param from              start coordinate passed to {@code getSubsequenceAt}
 * @param length            added to {@code from} to form the stop coordinate
 * @return the bases of the requested subsequence
 * @throws java.io.UncheckedIOException if closing the reference file fails
 */
public static byte[] getBasesFromReferenceFile(String referenceFilePath, String seqName, int from, int length) {
    final File referenceFile = new File(referenceFilePath);
    // try-with-resources so the underlying file handle is released even when the lookup throws.
    try (ReferenceSequenceFile referenceSequenceFile =
            ReferenceSequenceFileFactory.getReferenceSequenceFile(referenceFile)) {
        // The sequence name is passed straight through: fetching the complete sequence first,
        // only to read its name back, would pull the entire contig into memory for nothing.
        // NOTE(review): if getSubsequenceAt treats both ends as inclusive, this range yields
        // length + 1 bases. Left unchanged because callers may rely on it - confirm.
        return referenceSequenceFile.getSubsequenceAt(seqName, from, from + length).getBases();
    } catch (java.io.IOException e) {
        throw new java.io.UncheckedIOException(
                "Failed to close reference file " + referenceFilePath, e);
    }
}
/**
 * Creates a {@code ReferenceSequence} from the given name and bases and stores
 * it in {@code map} under the name reported by the new sequence.
 *
 * <p>The contig index given to the new sequence is the size of {@code map}
 * at the moment of the call, i.e. before the new entry is inserted.
 *
 * @param name  name for the new sequence
 * @param bases bases for the new sequence
 */
public void add(final String name, final byte[] bases) {
    final int nextIndex = map.size();
    final ReferenceSequence created = new ReferenceSequence(name, nextIndex, bases);
    final String key = created.getName();
    map.put(key, created);
}
/**
 * Copies the bases between {@code start} and {@code stop} out of the named
 * contig into a new {@code ReferenceSequence}.
 *
 * <p>The copy begins at array offset {@code start - 1} of the contig's bases
 * and is {@code stop - start + 1} bases long. The result carries the name and
 * contig index of the full contig.
 *
 * @param contig name of the contig, resolved through {@code getSequence}
 * @param start  first position to copy
 * @param stop   last position to copy
 * @return a new sequence holding only the requested bases
 * @throws IllegalArgumentException if the requested span is longer than the
 *         contig, or if {@code start} is greater than {@code stop + 1}
 */
@Override
public ReferenceSequence getSubsequenceAt(String contig, long start, long stop) {
    final int span = (int) (stop - start + 1);
    final ReferenceSequence whole = getSequence(contig);

    // Order of the two checks is significant: it decides which message a caller sees.
    final boolean longerThanContig = span > whole.length();
    if (longerThanContig) {
        throw new IllegalArgumentException("subsequence out of contig bounds");
    }
    final boolean startBeyondStop = start > stop + 1;
    if (startBeyondStop) {
        throw new IllegalArgumentException("start after stop");
    }

    final byte[] slice = new byte[span];
    final int sourceOffset = (int) (start - 1);
    System.arraycopy(whole.getBases(), sourceOffset, slice, 0, slice.length);
    return new ReferenceSequence(whole.getName(), whole.getContigIndex(), slice);
}
@Override
/**
 * Builds a packed sequence from {@code seq}.
 *
 * <p>The bases are handed to the superclass constructor together with two
 * {@code false} flags. Name, contig index and length are copied from
 * {@code seq}, and {@code ambiguous} gets one bit set for every position whose
 * base {@code KmerEncodingHelper.isAmbiguous} reports as ambiguous.
 *
 * @param seq the sequence to pack
 */
public PackedReferenceSequence(ReferenceSequence seq) {
    super(seq.getBases(), false, false);
    this.name = seq.getName();
    this.contigIndex = seq.getContigIndex();
    this.length = seq.length();
    this.ambiguous = new BitSet(seq.length());

    final byte[] rawBases = seq.getBases();
    int position = 0;
    while (position < length) {
        final boolean isAmbiguousBase = KmerEncodingHelper.isAmbiguous(rawBases[position]);
        if (isAmbiguousBase) {
            ambiguous.set(position);
        }
        position++;
    }
}
public ReferenceSequence getSequence() {
/**
 * Fetches positions 1 through {@code extendedBasesOfChrM.length()} of "chrM"
 * and checks the returned name, contig index and bases against
 * {@code extendedBasesOfChrM}. The duration of the fetch is printed to stderr.
 *
 * @param sequenceFile the indexed FASTA under test; closed once the assertions pass
 */
@Test(dataProvider = "homosapiens")
public void testFirstSequenceExtended(AbstractIndexedFastaSequenceFile sequenceFile) {
    // Only the lookup itself is timed.
    final long began = System.currentTimeMillis();
    final ReferenceSequence fetched =
            sequenceFile.getSubsequenceAt("chrM", 1, extendedBasesOfChrM.length());
    final long elapsedMs = System.currentTimeMillis() - began;

    Assert.assertEquals(fetched.getName(), "chrM", "Sequence contig is not correct");
    Assert.assertEquals(fetched.getContigIndex(), 0, "Sequence contig index is not correct");
    final String fetchedBases = StringUtil.bytesToString(fetched.getBases());
    Assert.assertEquals(fetchedBases, extendedBasesOfChrM, "First n bases of chrM are incorrect");

    CloserUtil.close(sequenceFile);
    System.err.printf("testFirstSequenceExtended runtime: %dms%n", elapsedMs);
}
/**
 * Writes a temporary FASTA whose last sequence line ends in a space and a tab,
 * reads it back through {@code FastaSequenceFile}, and checks that the
 * returned bases are the two sequence lines joined without that trailing
 * whitespace.
 *
 * @throws Exception if the temporary file cannot be created, written or read
 */
@Test
public void testTrailingWhitespace() throws Exception {
    final File fasta = File.createTempFile("test", ".fasta");
    fasta.deleteOnExit();

    final String chr1 = "chr1";
    final String sequence = "ACGTACGT";

    // try-with-resources: the writer is flushed and closed even if a write fails.
    try (PrintWriter writer = new PrintWriter(fasta)) {
        writer.println(">" + chr1);
        writer.println(sequence);
        writer.println(sequence + " \t");
    }

    // The reader holds the file open, so release it whether or not the assertions pass.
    try (FastaSequenceFile fastaReader = new FastaSequenceFile(fasta, true)) {
        final ReferenceSequence referenceSequence = fastaReader.nextSequence();
        Assert.assertEquals(referenceSequence.getName(), chr1);
        Assert.assertEquals(StringUtil.bytesToString(referenceSequence.getBases()), sequence + sequence);
    }
}
/**
 * Reads positions 1 through {@code firstBasesOfChrM.length()} of "chrM" and
 * verifies name, contig index and bases against {@code firstBasesOfChrM}.
 * The time taken by the read is reported on stderr.
 *
 * @param sequenceFile the indexed FASTA under test; closed once the assertions pass
 */
@Test(dataProvider = "homosapiens")
public void testFirstSequence(AbstractIndexedFastaSequenceFile sequenceFile) {
    final long timerStart = System.currentTimeMillis();
    final ReferenceSequence result =
            sequenceFile.getSubsequenceAt("chrM", 1, firstBasesOfChrM.length());
    final long timerStop = System.currentTimeMillis();

    final String contigName = result.getName();
    Assert.assertEquals(contigName, "chrM", "Sequence contig is not correct");

    final int contigIndex = result.getContigIndex();
    Assert.assertEquals(contigIndex, 0, "Sequence contig index is not correct");

    final String basesRead = StringUtil.bytesToString(result.getBases());
    Assert.assertEquals(basesRead, firstBasesOfChrM, "First n bases of chrM are incorrect");

    CloserUtil.close(sequenceFile);
    System.err.printf("testFirstSequence runtime: %dms%n", timerStop - timerStart);
}
/**
 * Reads the checked-in fixture {@code reference_with_trailing_whitespace.fasta}
 * and checks that its first two sequences come back as "chr1"/"ACGTACGT" and
 * "chr2"/"TCGATCGA".
 *
 * @throws Exception if the fixture cannot be opened or read
 */
@Test
public void testTrailingWhitespaceWithPreexistingSequenceDictionary() throws Exception {
    final File fasta = new File("src/test/resources/htsjdk/samtools/reference/reference_with_trailing_whitespace.fasta");

    // Close the reader on every path so a failed assertion does not leave the fixture open.
    try (FastaSequenceFile fastaReader = new FastaSequenceFile(fasta, true)) {
        ReferenceSequence referenceSequence = fastaReader.nextSequence();
        Assert.assertEquals(referenceSequence.getName(), "chr1");
        Assert.assertEquals(StringUtil.bytesToString(referenceSequence.getBases()), "ACGTACGT");

        referenceSequence = fastaReader.nextSequence();
        Assert.assertEquals(referenceSequence.getName(), "chr2");
        Assert.assertEquals(StringUtil.bytesToString(referenceSequence.getBases()), "TCGATCGA");
    }
}
/**
 * Skips the first 120 characters of {@code extendedBasesOfChrM}, requests the
 * matching range of "chrM" from the indexed file, and checks that the bases
 * returned equal the remaining text. The lookup time is printed to stderr.
 *
 * @param sequenceFile the indexed FASTA under test; closed once the assertions pass
 */
@Test(dataProvider = "homosapiens")
public void testReadStartingInCenterOfMiddleLine(AbstractIndexedFastaSequenceFile sequenceFile) {
    final int bytesToChopOff = 120;
    final String truncated = extendedBasesOfChrM.substring(bytesToChopOff);

    // The request starts one past the skipped prefix and spans the rest of the expected text.
    final int firstPosition = bytesToChopOff + 1;
    final int lastPosition = bytesToChopOff + truncated.length();

    final long began = System.currentTimeMillis();
    final ReferenceSequence fetched = sequenceFile.getSubsequenceAt("chrM", firstPosition, lastPosition);
    final long finished = System.currentTimeMillis();

    Assert.assertEquals(fetched.getName(), "chrM", "Sequence contig is not correct");
    Assert.assertEquals(fetched.getContigIndex(), 0, "Sequence contig index is not correct");
    final String fetchedBases = StringUtil.bytesToString(fetched.getBases());
    Assert.assertEquals(fetchedBases, truncated, "First n bases of chrM are incorrect");

    CloserUtil.close(sequenceFile);
    System.err.printf("testReadStartingInCenterOfMiddleLine runtime: %dms%n", finished - began);
}
/**
 * Requests the final {@code lastBasesOfChr20.length()} positions of "chr20"
 * (ending at {@code CHR20_LENGTH}) and checks name, contig index and bases
 * against {@code lastBasesOfChr20}. The lookup time is printed to stderr.
 *
 * @param sequenceFile the indexed FASTA under test; always closed before returning
 */
@Test(dataProvider="homosapiens")
public void testLastOfChr20(AbstractIndexedFastaSequenceFile sequenceFile) {
    try {
        long startTime = System.currentTimeMillis();
        ReferenceSequence sequence = sequenceFile.getSubsequenceAt("chr20",
                CHR20_LENGTH - lastBasesOfChr20.length()+1,
                CHR20_LENGTH);
        long endTime = System.currentTimeMillis();

        Assert.assertEquals(sequence.getName(),"chr20","Sequence contig is not correct");
        Assert.assertEquals(sequence.getContigIndex(),1,"Sequence contig index is not correct");
        // Message and timing label previously named chr1 / testFirstOfChr1; they now match this test.
        Assert.assertEquals(StringUtil.bytesToString(sequence.getBases()),lastBasesOfChr20,"Last n bases of chr20 are incorrect");

        System.err.printf("testLastOfChr20 runtime: %dms%n", (endTime - startTime));
    } finally {
        // Release the file even when an assertion above fails.
        CloserUtil.close(sequenceFile);
    }
}
/**
 * Drops the first 5 characters of {@code extendedBasesOfChrM}, asks the
 * indexed file for the corresponding range of "chrM", and verifies that the
 * result equals the remaining text. The lookup time is printed to stderr.
 *
 * @param sequenceFile the indexed FASTA under test; closed once the assertions pass
 */
@Test(dataProvider = "homosapiens")
public void testReadStartingInCenterOfFirstLine(AbstractIndexedFastaSequenceFile sequenceFile) {
    final int bytesToChopOff = 5;
    final String truncated = extendedBasesOfChrM.substring(bytesToChopOff);
    final int rangeStart = bytesToChopOff + 1;
    final int rangeEnd = bytesToChopOff + truncated.length();

    final long timerStart = System.currentTimeMillis();
    final ReferenceSequence result = sequenceFile.getSubsequenceAt("chrM", rangeStart, rangeEnd);
    final long elapsedMs = System.currentTimeMillis() - timerStart;

    Assert.assertEquals(result.getName(), "chrM", "Sequence contig is not correct");
    Assert.assertEquals(result.getContigIndex(), 0, "Sequence contig index is not correct");
    Assert.assertEquals(
            StringUtil.bytesToString(result.getBases()),
            truncated,
            "First n bases of chrM are incorrect");

    CloserUtil.close(sequenceFile);
    System.err.printf("testReadStartingInCenterOfFirstLine runtime: %dms%n", elapsedMs);
}
/**
 * Generates a reference via {@code makeRandomReference} and iterates over it,
 * checking that each of the {@code chroms} sequences is non-null, has
 * {@code basesPerChrom} bases, is named "chr" plus its 1-based ordinal, and
 * carries a 0-based contig index. A further read must return {@code null}.
 *
 * @param chroms        number of sequences expected in the generated reference
 * @param basesPerChrom expected length of every sequence
 * @throws Exception if the reference cannot be generated, opened or closed
 */
@Test(dataProvider="fastaTestParameters")
public void testSingleShortSequence(int chroms, int basesPerChrom) throws Exception {
    File f = makeRandomReference(chroms, basesPerChrom);

    // Close the reference on every path so a failed assertion does not leak the file handle.
    try (ReferenceSequenceFile ref = ReferenceSequenceFileFactory.getReferenceSequenceFile(f)) {
        for (int i=1; i<=chroms; ++i) {
            ReferenceSequence seq = ref.nextSequence();
            Assert.assertNotNull(seq);
            Assert.assertEquals(seq.length(), basesPerChrom);
            Assert.assertEquals(seq.getName(), "chr" + i);
            Assert.assertEquals(seq.getContigIndex(), i-1);
        }

        Assert.assertNull(ref.nextSequence());
    }
}
/**
 * Advances {@code originalSequenceFile} until it yields "chr20", then loads
 * "chr20" from the indexed file with {@code getSequence} and checks that name,
 * contig index and bases match. The lookup time is printed to stderr.
 *
 * @param originalSequenceFile sequential reader that supplies the expected sequence
 * @param sequenceFile         the indexed FASTA under test
 */
@Test(dataProvider="comparative")
public void testLastCompleteContigRead(ReferenceSequenceFile originalSequenceFile, AbstractIndexedFastaSequenceFile sequenceFile) {
    try {
        ReferenceSequence expectedSequence = originalSequenceFile.nextSequence();
        // Fail with a readable message, instead of a NullPointerException, if the
        // reader runs out of sequences before reaching chr20.
        Assert.assertNotNull(expectedSequence, "chr20 not found in the original sequence file");
        while( !"chr20".equals(expectedSequence.getName()) ) {
            expectedSequence = originalSequenceFile.nextSequence();
            Assert.assertNotNull(expectedSequence, "chr20 not found in the original sequence file");
        }

        long startTime = System.currentTimeMillis();
        ReferenceSequence sequence = sequenceFile.getSequence("chr20");
        long endTime = System.currentTimeMillis();

        Assert.assertEquals(sequence.getName(),"chr20","Sequence contig is not correct");
        Assert.assertEquals(sequence.getContigIndex(),1,"Sequence contig index is not correct");
        // The message previously named chrX_random; it now names the contig actually compared.
        Assert.assertEquals(StringUtil.bytesToString(sequence.getBases()),StringUtil.bytesToString(expectedSequence.getBases()),"chr20 is incorrect");

        System.err.printf("testLastCompleteContigRead runtime: %dms%n", (endTime - startTime));
    } finally {
        // Release both files even when an assertion above fails.
        CloserUtil.close(originalSequenceFile);
        CloserUtil.close(sequenceFile);
    }
}
/**
 * Takes the first sequence from {@code originalSequenceFile} as the expected
 * value, loads "chrM" from the indexed file with {@code getSequence}, and
 * checks name, contig index and bases. The lookup time is printed to stderr.
 *
 * @param originalSequenceFile sequential reader that supplies the expected sequence
 * @param sequenceFile         the indexed FASTA under test
 */
@Test(dataProvider = "comparative")
public void testFirstCompleteContigRead(ReferenceSequenceFile originalSequenceFile,
                                        AbstractIndexedFastaSequenceFile sequenceFile) {
    final ReferenceSequence reference = originalSequenceFile.nextSequence();

    final long began = System.currentTimeMillis();
    final ReferenceSequence loaded = sequenceFile.getSequence("chrM");
    final long elapsedMs = System.currentTimeMillis() - began;

    Assert.assertEquals(loaded.getName(), "chrM", "Sequence contig is not correct");
    Assert.assertEquals(loaded.getContigIndex(), 0, "Sequence contig index is not correct");

    final String loadedBases = StringUtil.bytesToString(loaded.getBases());
    final String referenceBases = StringUtil.bytesToString(reference.getBases());
    Assert.assertEquals(loadedBases, referenceBases, "chrM is incorrect");

    CloserUtil.close(originalSequenceFile);
    CloserUtil.close(sequenceFile);
    System.err.printf("testFirstCompleteContigRead runtime: %dms%n", elapsedMs);
}
/**
 * Calls {@code nextSequence()} once on each file and checks that the indexed
 * file's first sequence is "chrM" at contig index 0 with the same bases as the
 * first sequence of {@code originalSequenceFile}. The time spent in the
 * indexed file's call is printed to stderr.
 *
 * @param originalSequenceFile sequential reader that supplies the expected sequence
 * @param sequenceFile         the indexed FASTA under test
 */
@Test(dataProvider = "comparative")
public void testFirstElementOfIterator(ReferenceSequenceFile originalSequenceFile,
                                       AbstractIndexedFastaSequenceFile sequenceFile) {
    final ReferenceSequence wanted = originalSequenceFile.nextSequence();

    final long timerStart = System.currentTimeMillis();
    final ReferenceSequence first = sequenceFile.nextSequence();
    final long timerStop = System.currentTimeMillis();

    Assert.assertEquals(first.getName(), "chrM", "Sequence contig is not correct");
    Assert.assertEquals(first.getContigIndex(), 0, "Sequence contig index is not correct");
    Assert.assertEquals(
            StringUtil.bytesToString(first.getBases()),
            StringUtil.bytesToString(wanted.getBases()),
            "chrM is incorrect");

    CloserUtil.close(originalSequenceFile);
    CloserUtil.close(sequenceFile);
    System.err.printf("testFirstElementOfIterator runtime: %dms%n", timerStop - timerStart);
}
@Test(dataProvider="comparative") public void testNextElementOfIterator(ReferenceSequenceFile originalSequenceFile, AbstractIndexedFastaSequenceFile sequenceFile) { // Skip past the first one and load the second one. originalSequenceFile.nextSequence(); ReferenceSequence expectedSequence = originalSequenceFile.nextSequence(); long startTime = System.currentTimeMillis(); sequenceFile.nextSequence(); ReferenceSequence sequence = sequenceFile.nextSequence(); long endTime = System.currentTimeMillis(); Assert.assertEquals(sequence.getName(),"chr20","Sequence contig is not correct"); Assert.assertEquals(sequence.getContigIndex(),1,"Sequence contig index is not correct"); Assert.assertEquals(sequence.length(),expectedSequence.length(),"Sequence size is not correct"); Assert.assertEquals(StringUtil.bytesToString(sequence.getBases()),StringUtil.bytesToString(expectedSequence.getBases()),"chr1 is incorrect"); CloserUtil.close(originalSequenceFile); CloserUtil.close(sequenceFile); System.err.printf("testNextElementOfIterator runtime: %dms%n", (endTime - startTime)) ; }
@Test(dataProvider="comparative") public void testReset(ReferenceSequenceFile originalSequenceFile, AbstractIndexedFastaSequenceFile sequenceFile) { // Skip past the first one and load the second one. ReferenceSequence expectedSequence = originalSequenceFile.nextSequence(); long startTime = System.currentTimeMillis(); sequenceFile.nextSequence(); sequenceFile.nextSequence(); sequenceFile.reset(); ReferenceSequence sequence = sequenceFile.nextSequence(); long endTime = System.currentTimeMillis(); Assert.assertEquals(sequence.getName(),"chrM","Sequence contig is not correct"); Assert.assertEquals(sequence.getContigIndex(),0,"Sequence contig index is not correct"); Assert.assertEquals(sequence.length(),expectedSequence.length(), "Sequence size is not correct"); Assert.assertEquals(StringUtil.bytesToString(sequence.getBases()),StringUtil.bytesToString(expectedSequence.getBases()),"chrM is incorrect"); CloserUtil.close(originalSequenceFile); CloserUtil.close(sequenceFile); System.err.printf("testReset runtime: %dms%n", (endTime - startTime)) ; }