/**
 * Builds the single-file HDFS reader that corresponds to the requested reader type.
 *
 * @param readerType the kind of reader to create; only {@code AVRO} is handled here
 * @param systemStreamPartition the partition handed to the reader's constructor
 * @return a new reader instance for the given type
 * @throws SamzaException if {@code readerType} is not a supported type
 */
public static SingleFileHdfsReader getHdfsReader(ReaderType readerType, SystemStreamPartition systemStreamPartition) {
  final SingleFileHdfsReader reader;
  switch (readerType) {
    case AVRO:
      reader = new AvroFileHdfsReader(systemStreamPartition);
      break;
    default:
      throw new SamzaException("Unsupported reader type: " + readerType);
  }
  return reader;
}
/**
 * Compares two offsets using the comparator that belongs to the given reader type.
 *
 * @param readerType the kind of reader whose offset format is being compared; only {@code AVRO} is handled here
 * @param offset1 the first offset
 * @param offset2 the second offset
 * @return whatever the reader-specific comparator returns for the two offsets
 * @throws SamzaException if {@code readerType} is not a supported type
 */
public static int offsetComparator(ReaderType readerType, String offset1, String offset2) {
  final int comparison;
  switch (readerType) {
    case AVRO:
      comparison = AvroFileHdfsReader.offsetComparator(offset1, offset2);
      break;
    default:
      throw new SamzaException("Unsupported reader type: " + readerType);
  }
  return comparison;
}
/**
 * Opens the Avro file at {@code pathStr} and positions the reader at {@code singleFileOffset}.
 *
 * @param pathStr path of the file to open
 * @param singleFileOffset offset within the file to seek to once the file is open
 * @throws SamzaException wrapping any {@link IOException} raised while opening the file
 */
@Override
public void open(String pathStr, String singleFileOffset) {
  LOG.info(String.format("%s: Open file [%s] with file offset [%s] for read", systemStreamPartition, pathStr, singleFileOffset));
  Path path = new Path(pathStr);
  try {
    AvroFSInput input = new AvroFSInput(FileContext.getFileContext(path.toUri()), path);
    try {
      fileReader = new DataFileReader<>(input, new GenericDatumReader<>());
    } catch (IOException | RuntimeException e) {
      // The two-argument DataFileReader constructor does not close its input when
      // reading the file header fails, so release the underlying stream here
      // instead of leaking it. The original failure is rethrown unchanged.
      try {
        input.close();
      } catch (IOException closeFailure) {
        e.addSuppressed(closeFailure);
      }
      throw e;
    }
    seek(singleFileOffset);
  } catch (IOException e) {
    throw new SamzaException(e);
  }
}
@Override public IncomingMessageEnvelope readNext() { // get checkpoint for THIS record String checkpoint = nextOffset(); GenericRecord record = fileReader.next(); if (fileReader.previousSync() != curBlockStart) { curBlockStart = fileReader.previousSync(); curRecordOffset = 0; } else { curRecordOffset++; } // avro schema doesn't necessarily have key field return new IncomingMessageEnvelope(systemStreamPartition, checkpoint, null, record); }
/**
 * Reads the first half of the file, closes the reader, then resumes from the saved
 * offset with a brand-new reader and checks that every record arrives in order.
 */
@Test
public void testFileReopen() throws Exception {
  SystemStreamPartition ssp = new SystemStreamPartition("hdfs", "testStream", new Partition(0));
  SingleFileHdfsReader reader = new AvroFileHdfsReader(ssp);
  reader.open(AVRO_FILE, "0");

  // First pass: consume the first half of the events.
  int index = 0;
  while (index < NUM_EVENTS / 2) {
    GenericRecord firstHalfRecord = (GenericRecord) reader.readNext().getMessage();
    Assert.assertEquals(index, firstHalfRecord.get(FIELD_1));
    Assert.assertEquals("string_" + index, firstHalfRecord.get(FIELD_2).toString());
    index++;
  }

  // Remember where we stopped, then drop the reader entirely.
  String resumeOffset = reader.nextOffset();
  reader.close();

  // Second pass: a fresh reader continues from the remembered offset.
  reader = new AvroFileHdfsReader(ssp);
  reader.open(AVRO_FILE, resumeOffset);
  while (index < NUM_EVENTS) {
    GenericRecord secondHalfRecord = (GenericRecord) reader.readNext().getMessage();
    Assert.assertEquals(index, secondHalfRecord.get(FIELD_1));
    Assert.assertEquals("string_" + index, secondHalfRecord.get(FIELD_2).toString());
    index++;
  }

  Assert.assertEquals(NUM_EVENTS, index);
  reader.close();
}
/**
 * Expects {@code AvroFileHdfsReader.offsetComparator} to throw when given the offsets
 * "1982,13" and "1930,1". NOTE(review): these presumably count as malformed because they
 * use ',' where other offsets in this test class use '@' - confirm against the comparator.
 * The final brace on the line below closes the enclosing test class.
 */
@Test(expected = Exception.class) public void testOffsetComparator_InvalidInput() { AvroFileHdfsReader.offsetComparator("1982,13", "1930,1"); } }
/**
 * Reads the whole file from offset "0" and checks that every record carries the
 * expected field values and that exactly {@code NUM_EVENTS} records are returned.
 */
@Test
public void testSequentialRead() throws Exception {
  SystemStreamPartition ssp = new SystemStreamPartition("hdfs", "testStream", new Partition(0));
  SingleFileHdfsReader reader = new AvroFileHdfsReader(ssp);
  reader.open(AVRO_FILE, "0");

  int index = 0;
  for (; reader.hasNext(); index++) {
    GenericRecord current = (GenericRecord) reader.readNext().getMessage();
    Assert.assertEquals(index, current.get(FIELD_1));
    Assert.assertEquals("string_" + index, current.get(FIELD_2).toString());
  }

  Assert.assertEquals(NUM_EVENTS, index);
  reader.close();
}
/**
 * Checks {@code AvroFileHdfsReader.offsetComparator} against pairs of offsets that are
 * expected to compare as less-than (-1), greater-than (1) and equal (0). Offsets appear
 * both as a bare number and in the {@code number@number} form.
 */
@Test
public void testOffsetComparator() {
  // Pairs where the first offset is expected to sort before the second.
  final String[][] ascendingPairs = {
    {"0", "1452"},
    {"2001@3", "2001@4"},
    {"2001@4", "2010@1"},
    {"2001@3", "2011@3"},
    {"2001", "2001@4"},
    {"2001", "2010@1"},
    {"2001@3", "2010"},
  };
  for (String[] pair : ascendingPairs) {
    Assert.assertEquals(-1, AvroFileHdfsReader.offsetComparator(pair[0], pair[1]));
  }

  // Pairs where the first offset is expected to sort after the second.
  final String[][] descendingPairs = {
    {"1984", "0"},
    {"1984@2", "1984@1"},
    {"14341@2", "1984@2"},
    {"14341@1", "1984@10"},
    {"14341", "1984@10"},
    {"14341@1", "1984"},
  };
  for (String[] pair : descendingPairs) {
    Assert.assertEquals(1, AvroFileHdfsReader.offsetComparator(pair[0], pair[1]));
  }

  // Pairs that are expected to compare as equal.
  final String[][] equalPairs = {
    {"1989", "1989"},
    {"1989@0", "1989"},
    {"1989", "1989@0"},
    {"0", "0"},
    {"1989@1", "1989@1"},
  };
  for (String[] pair : equalPairs) {
    Assert.assertEquals(0, AvroFileHdfsReader.offsetComparator(pair[0], pair[1]));
  }
}
@Test public void testRandomRead() throws Exception { SystemStreamPartition ssp = new SystemStreamPartition("hdfs", "testStream", new Partition(0)); SingleFileHdfsReader reader = new AvroFileHdfsReader(ssp); reader.open(AVRO_FILE, "0"); for (int i = 0;i < NUM_EVENTS / 2; i++) { reader.readNext(); } String offset = reader.nextOffset(); IncomingMessageEnvelope envelope = reader.readNext(); Assert.assertEquals(offset, envelope.getOffset()); GenericRecord record1 = (GenericRecord) envelope.getMessage(); for (int i = 0; i < 5; i++) reader.readNext(); // seek to the offset within the same reader reader.seek(offset); Assert.assertEquals(offset, reader.nextOffset()); envelope = reader.readNext(); Assert.assertEquals(offset, envelope.getOffset()); GenericRecord record2 = (GenericRecord) envelope.getMessage(); Assert.assertEquals(record1, record2); reader.close(); // open a new reader and initialize it with the offset reader = new AvroFileHdfsReader(ssp); reader.open(AVRO_FILE, offset); envelope = reader.readNext(); Assert.assertEquals(offset, envelope.getOffset()); GenericRecord record3 = (GenericRecord) envelope.getMessage(); Assert.assertEquals(record1, record3); reader.close(); }