/**
 * Serializes a {@link JobData} as a JSON object of the shape
 * {@code {"rowCount": N, "schema": [...], "rows": [...]}}.
 *
 * <p>Rows are streamed batch-by-batch; only the [start, end) slice of each
 * {@code RecordBatchHolder} is written, so a truncated/ranged view serializes
 * exactly its visible rows.
 *
 * @param jobData job results to serialize
 * @param generator Jackson generator to write into (not closed here)
 * @param serializerProvider unused by this serializer
 * @throws IOException if the generator fails to write
 */
@Override
public void serialize(JobData jobData, JsonGenerator generator, SerializerProvider serializerProvider)
    throws IOException, JsonProcessingException {
  generator.writeStartObject();
  generator.writeFieldName("rowCount");
  generator.writeNumber(jobData.getRowCount());
  // Schema is emitted as an array of field descriptors.
  generator.writeFieldName("schema");
  generator.writeStartArray();
  for (Field field : jobData.getJobDataFragment().getSchema().getFields()) {
    writeField(field, generator, false);
  }
  generator.writeEndArray();
  // Rows: one JSON element per record, produced via the event-based writer.
  generator.writeFieldName("rows");
  generator.writeStartArray();
  final APIJobResultsSerializer jsonWriter = new APIJobResultsSerializer(generator);
  jsonWriter.setup();
  for (RecordBatchHolder batchHolder : jobData.delegate.getRecordBatches()) {
    final EventBasedRecordWriter recordWriter =
        new EventBasedRecordWriter(batchHolder.getData().getContainer(), jsonWriter);
    // Only the holder's [start, end) window is part of this result set.
    for (int i = batchHolder.getStart(); i < batchHolder.getEnd(); i++) {
      recordWriter.writeOneRecord(i);
    }
  }
  generator.writeEndArray();
  generator.writeEndObject();
}
}
/**
 * Wraps every buffered batch in a holder spanning its full record range
 * and returns them as a single {@link RecordBatches}.
 */
public RecordBatches toRecordBatches() {
  final List<RecordBatchHolder> holders = batches.stream()
      .map(batch -> RecordBatchHolder.newRecordBatchHolder(batch, 0, batch.getRecordCount()))
      .collect(Collectors.toList());
  return new RecordBatches(holders);
}
}
private Pair<RecordBatchData, Integer> find(int index) { if (index >= recordBatches.getSize()) { throw new IllegalArgumentException(String.format("Invalid index %s", index)); } // Add the offset in the first batch int indexWorkspace = index; for(RecordBatchHolder batchHolder : recordBatches.getBatches()) { if (indexWorkspace < batchHolder.size()) { return new Pair<>(batchHolder.getData(), batchHolder.getStart() + indexWorkspace); } indexWorkspace -= batchHolder.size(); } throw new IllegalArgumentException(String.format("Invalid index %s", index)); }
/**
 * Creates an immutable view over the given batch holders.
 *
 * <p>The schema is taken from the first holder, which is why at least one
 * holder is required. Total size is the sum of each holder's [start, end) span.
 *
 * @param batches non-null, non-empty list of holders; defensively copied
 * @throws IllegalArgumentException if {@code batches} is null or empty
 */
public RecordBatches(final List<RecordBatchHolder> batches) {
  Preconditions.checkArgument(batches != null && batches.size() >= 1);
  this.batches = ImmutableList.copyOf(batches);
  // Note: the precondition above guarantees non-null, so no further null
  // check is needed before iterating (the old redundant guard is removed).
  int size = 0;
  for (RecordBatchHolder batch : batches) {
    size += batch.size();
  }
  this.size = size;
  this.schema = batches.get(0).getData().getContainer().getSchema();
}
/**
 * Closes the data of every holder in the list.
 *
 * <p>Previously an exception from one {@code close()} aborted the loop and
 * leaked every remaining holder's buffers. Now all holders are closed; the
 * first failure is rethrown with any later failures attached as suppressed.
 *
 * @param holders holders whose underlying data should be released
 * @throws Exception the first close failure, if any
 */
private static void releaseBatches(List<RecordBatchHolder> holders) throws Exception {
  Exception first = null;
  for (RecordBatchHolder holder : holders) {
    try {
      holder.getData().close();
    } catch (Exception e) {
      if (first == null) {
        first = e;
      } else {
        first.addSuppressed(e);
      }
    }
  }
  if (first != null) {
    throw first;
  }
}
final int batchEnd = (int) Math.min(currentBatchCount, batchStart + remaining); final RecordBatchHolder batchHolder = newRecordBatchHolder( new RecordBatchData(vectorContainer, allocator), batchStart, remaining -= batchHolder.size();
/**
 * Create a new {@link RecordBatchHolder} referencing a sub-range of a batch.
 *
 * @param data underlying {@link RecordBatchData}
 * @param start starting index (inclusive) where the interested data starts.
 * @param end ending index (exclusive) where interested data ends.
 * @return holder exposing rows [start, end) of {@code data}
 * @throws IllegalArgumentException if the range is invalid for the batch
 */
public static RecordBatchHolder newRecordBatchHolder(final RecordBatchData data, final int start, final int end) {
  // Guava's checkArgument only substitutes %s placeholders; the previous %d
  // specifiers were emitted literally. The end-index check was also missing
  // the `end` argument, producing a garbled message on failure.
  checkArgument(start >= 0,
      "Invalid start index (%s) in RecordBatchData (of size (%s))", start, data.getRecordCount());
  checkArgument(end <= data.getRecordCount(),
      "Invalid end index (%s) in RecordBatchData (of size (%s))", end, data.getRecordCount());
  checkArgument(start <= end,
      "Invalid range indices. Start (%s), End (%s), Batch size (%s)", start, end, data.getRecordCount());
  return new RecordBatchHolder(data, start, end);
}
VectorContainer batchContainer = batchHolders.get(i).getData().getContainer(); assertEquals(TEST_BIT_VALUES, getBitValues(batchContainer, 0, 5)); assertEquals(TEST_VARCHAR_VALUES, getVarCharValues(batchContainer, 0, 5)); VectorContainer batchContainer = batchHolders.get(0).getData().getContainer(); assertEquals(TEST_BIT_VALUES, getBitValues(batchContainer, 0, 5)); assertEquals(TEST_VARCHAR_VALUES, getVarCharValues(batchContainer, 0, 5)); batchContainer = batchHolders.get(1).getData().getContainer(); assertEquals(TEST_BIT_VALUES.subList(0, 2), getBitValues(batchContainer, 0, 2)); assertEquals(TEST_VARCHAR_VALUES.subList(0, 2), getVarCharValues(batchContainer, 0, 2)); VectorContainer batchContainer = batchHolders.get(0).getData().getContainer(); assertEquals(TEST_BIT_VALUES.subList(2, 5), getBitValues(batchContainer, 2, 5)); assertEquals(TEST_VARCHAR_VALUES.subList(2, 5), getVarCharValues(batchContainer, 2, 5)); batchContainer = batchHolders.get(1).getData().getContainer(); assertEquals(TEST_BIT_VALUES, getBitValues(batchContainer, 0, 5)); assertEquals(TEST_VARCHAR_VALUES, getVarCharValues(batchContainer, 0, 5)); batchContainer = batchHolders.get(2).getData().getContainer(); assertEquals(TEST_BIT_VALUES.subList(0, 3), getBitValues(batchContainer, 0, 3)); assertEquals(TEST_VARCHAR_VALUES.subList(0, 3), getVarCharValues(batchContainer, 0, 3)); VectorContainer batchContainer = batchHolders.get(0).getData().getContainer(); assertEquals(TEST_BIT_VALUES.subList(2, 5), getBitValues(batchContainer, 2, 5)); assertEquals(TEST_VARCHAR_VALUES.subList(2, 5), getVarCharValues(batchContainer, 2, 5)); batchContainer = batchHolders.get(1).getData().getContainer();
for(RecordBatchHolder batchHolder : data.delegate.getRecordBatches()) { final EventBasedRecordWriter recordWriter = new EventBasedRecordWriter(batchHolder.getData().getContainer(), jsonWriter); for (int i = batchHolder.getStart(); i < batchHolder.getEnd(); i++) { context.setRowNum(currentRowInWriting++); recordWriter.writeOneRecord(i);
private Pair<RecordBatchData, Integer> find(int index) { if (index >= getReturnedRowCount()) { throw new IllegalArgumentException(String.format("Invalid index %s", index)); } // Add the offset in the first batch int indexWorkspace = index; for(RecordBatchHolder batchHolder : delegate.getRecordBatches()) { if (indexWorkspace < batchHolder.size()) { return new Pair<>(batchHolder.getData(), batchHolder.getStart() + indexWorkspace); } indexWorkspace -= batchHolder.size(); } throw new IllegalArgumentException(String.format("Invalid index %s", index)); }
/**
 * Helper method that creates an empty batch from schema in Arrow footer.
 *
 * <p>Reads the footer offset from the file tail (the long immediately before
 * the trailing magic string), parses the footer's field list into a
 * {@link BatchSchema}, and builds a zero-row {@link VectorContainer} with one
 * vector per field. On any failure while populating the container, the
 * rollback closes the container so no allocator memory leaks.
 *
 * @return holder over an empty (0-row) batch carrying the file's schema
 * @throws IOException on read/parse failure or vector-allocation failure
 */
private RecordBatchHolder getEmptyBatch() throws IOException {
  final FileStatus fileStatus = dfs.getFileStatus(path);
  final long len = fileStatus.getLen();
  // Footer offset is stored just before the trailing magic bytes.
  inputStream.seek(len - (MAGIC_STRING_LENGTH + FOOTER_OFFSET_SIZE));
  final long footerOffset = inputStream.readLong();
  // Read the footer
  inputStream.seek(footerOffset);
  ArrowFileFormat.ArrowFileFooter footer = ArrowFileFormat.ArrowFileFooter.parseDelimitedFrom(inputStream);
  BatchSchema footerSchema = BatchSchema.newBuilder().addSerializedFields(footer.getFieldList()).build();
  final VectorContainer vectorContainer = new VectorContainer();
  try (RollbackCloseable rollback = new RollbackCloseable()) {
    // Register the container first so a mid-loop allocation failure closes it.
    rollback.add(vectorContainer);
    for (Field field : footerSchema) {
      vectorContainer.add(TypeHelper.getNewVector(field, allocator));
    }
    rollback.commit();
  } catch (Exception e) {
    throw new IOException(e);
  }
  vectorContainer.setRecordCount(0);
  vectorContainer.buildSchema();
  // Range [0, 0): schema-only, no rows.
  return newRecordBatchHolder(new RecordBatchData(vectorContainer, allocator), 0, 0);
}
VectorContainer batchContainer = batchHolders.get(0).getData().getContainer(); assertEquals(TEST_BIT_VALUES, getBitValues(batchContainer, 0, 5)); assertEquals(TEST_VARCHAR_VALUES, getVarCharValues(batchContainer, 0, 5)); VectorContainer batchContainer = batchHolders.get(0).getData().getContainer(); assertEquals(TEST_BIT_VALUES.subList(0, 2), getBitValues(batchContainer, 0, 2)); assertEquals(TEST_VARCHAR_VALUES.subList(0, 2), getVarCharValues(batchContainer, 0, 2)); VectorContainer batchContainer = batchHolders.get(0).getData().getContainer(); assertEquals(TEST_BIT_VALUES.subList(2, 4), getBitValues(batchContainer, 2, 4)); assertEquals(TEST_VARCHAR_VALUES.subList(2, 4), getVarCharValues(batchContainer, 2, 4));
/**
 * Round-trips a single-row batch containing an empty ListVector through the
 * Arrow file writer/reader and verifies the schema and vector types survive.
 */
@Test
public void writeAndReadEmptyListVectors() throws Exception {
  try (final VectorContainer batchData = createBatch(1, testEmptyListVector())) {
    final Path basePath = new Path(dateGenFolder.getRoot().getPath());
    final ArrowFileMetadata metadata = writeArrowFile(batchData);
    try (final ArrowFileReader reader = new ArrowFileReader(FileSystem.get(FS_CONF), basePath, metadata, ALLOCATOR)) {
      // Read back the single row written above.
      final List<RecordBatchHolder> batchHolders = reader.read(0, 1);
      assertEquals(1, batchHolders.size());
      assertNotNull(batchHolders.get(0).getData());
      assertEquals(0, batchHolders.get(0).getStart());
      assertEquals(1, batchHolders.get(0).getEnd());
      // Schema: exactly one LIST column named "emptyListVector".
      final BatchSchema schema = batchHolders.get(0).getData().getContainer().getSchema();
      assertEquals(1, schema.getFieldCount());
      assertEquals("emptyListVector", schema.getColumn(0).getName());
      assertEquals(MinorType.LIST, Types.getMinorTypeForArrowType(schema.getColumn(0).getType()));
      final VectorContainer batchContainer = batchHolders.get(0).getData().getContainer();
      assertTrue(Iterators.size(batchContainer.iterator()) == 1);
      // An empty list vector should round-trip with a ZeroVector data vector.
      for (final VectorWrapper<?> wrapper : batchContainer) {
        assertTrue(wrapper.getValueVector() instanceof ListVector);
        assertTrue(((ListVector) (wrapper.getValueVector())).getDataVector() instanceof ZeroVector);
      }
      // Release buffers explicitly; holders are not auto-closed by the reader.
      releaseBatches(batchHolders);
    }
  }
}
/**
 * Verifies that truncating 15 loaded rows to 10 serializes exactly the first
 * two batches' values; the third batch's rows must be absent from the output.
 */
@Test
public void testDataTrunc() throws Exception {
  final Pair<? extends ValueVector, ResultVerifier> varCharBatchA = testVarCharVector(0, 0);
  final Pair<? extends ValueVector, ResultVerifier> varCharBatchB = testVarCharVector(5, 5);
  final Pair<? extends ValueVector, ResultVerifier> varCharBatchC = testVarCharVector(10, 10);
  final Pair<? extends ValueVector, ResultVerifier> dateBatchA = testDateMilliVector(0, 0);
  final Pair<? extends ValueVector, ResultVerifier> dateBatchB = testDateMilliVector(5, 5);
  final Pair<? extends ValueVector, ResultVerifier> dateBatchC = testDateMilliVector(10, 10);

  final RecordBatchData firstBatch = createRecordBatch(varCharBatchA.getKey(), dateBatchA.getKey());
  final RecordBatchData secondBatch = createRecordBatch(varCharBatchB.getKey(), dateBatchB.getKey());
  final RecordBatchData thirdBatch = createRecordBatch(varCharBatchC.getKey(), dateBatchC.getKey());

  final JobLoader jobLoader = mock(JobLoader.class);
  when(jobLoader.load(anyInt(), anyInt())).thenReturn(
      new RecordBatches(asList(
          newRecordBatchHolder(firstBatch, 0, 5),
          newRecordBatchHolder(secondBatch, 0, 5),
          newRecordBatchHolder(thirdBatch, 0, 5)
      ))
  );

  try (JobData dataInput = new JobDataWrapper(new JobDataImpl(jobLoader, TEST_JOB_ID))) {
    final JobDataFragment truncDataInput = dataInput.truncate(10);
    final DataPOJO dataOutput =
        OBJECT_MAPPER.readValue(OBJECT_MAPPER.writeValueAsString(truncDataInput), DataPOJO.class);
    assertEquals(truncDataInput.getColumns().toString(), dataOutput.getColumns().toString());
    assertEquals(truncDataInput.getReturnedRowCount(), dataOutput.getReturnedRowCount());
    // Only the first two batches survive the truncation to 10 rows.
    varCharBatchA.getValue().verify(dataOutput);
    varCharBatchB.getValue().verify(dataOutput);
    dateBatchA.getValue().verify(dataOutput);
    dateBatchB.getValue().verify(dataOutput);
  }
}
BatchSchema schema = batchHolders.get(0).getData().getContainer().getSchema(); assertEquals(4, schema.getFieldCount());
/** * Helper method to verify that the batch holder contains valid data including the standard two columns * (colBit - BIT, colVarChar - VARCHAR) used in this test class. */ private static void verifyBatchHolder(RecordBatchHolder holder, int expStart, int expEnd) { assertNotNull(holder); assertNotNull(holder.getData()); assertEquals(expStart, holder.getStart()); assertEquals(expEnd, holder.getEnd()); // verify schema BatchSchema schema = holder.getData().getContainer().getSchema(); assertEquals(2, schema.getFieldCount()); assertEquals("colBit", schema.getColumn(0).getName()); assertEquals(MinorType.BIT, Types.getMinorTypeForArrowType(schema.getColumn(0).getType())); assertEquals("colVarChar", schema.getColumn(1).getName()); assertEquals(MinorType.VARCHAR, Types.getMinorTypeForArrowType(schema.getColumn(1).getType())); }
/**
 * Verifies that a [5, 10) range over 15 loaded rows serializes exactly the
 * second and third batches' values; the first batch must not appear.
 */
@Test
public void testDataRange() throws Exception {
  final Pair<? extends ValueVector, ResultVerifier> varCharBatchA = testVarCharVector(0, 0);
  final Pair<? extends ValueVector, ResultVerifier> varCharBatchB = testVarCharVector(0, 5);
  final Pair<? extends ValueVector, ResultVerifier> varCharBatchC = testVarCharVector(5, 10);
  final Pair<? extends ValueVector, ResultVerifier> dateBatchA = testDateMilliVector(0, 0);
  final Pair<? extends ValueVector, ResultVerifier> dateBatchB = testDateMilliVector(0, 5);
  final Pair<? extends ValueVector, ResultVerifier> dateBatchC = testDateMilliVector(5, 10);

  final RecordBatchData firstBatch = createRecordBatch(varCharBatchA.getKey(), dateBatchA.getKey());
  final RecordBatchData secondBatch = createRecordBatch(varCharBatchB.getKey(), dateBatchB.getKey());
  final RecordBatchData thirdBatch = createRecordBatch(varCharBatchC.getKey(), dateBatchC.getKey());

  final JobLoader jobLoader = mock(JobLoader.class);
  when(jobLoader.load(anyInt(), anyInt())).thenReturn(
      new RecordBatches(asList(
          newRecordBatchHolder(firstBatch, 0, 5),
          newRecordBatchHolder(secondBatch, 0, 5),
          newRecordBatchHolder(thirdBatch, 0, 5)
      ))
  );

  try (JobData dataInput = new JobDataWrapper(new JobDataImpl(jobLoader, TEST_JOB_ID))) {
    final JobDataFragment rangeDataInput = dataInput.range(5, 10);
    final DataPOJO dataOutput =
        OBJECT_MAPPER.readValue(OBJECT_MAPPER.writeValueAsString(rangeDataInput), DataPOJO.class);
    assertEquals(rangeDataInput.getColumns().toString(), dataOutput.getColumns().toString());
    assertEquals(rangeDataInput.getReturnedRowCount(), dataOutput.getReturnedRowCount());
    // Only the second and third batches fall inside the requested range.
    varCharBatchB.getValue().verify(dataOutput);
    varCharBatchC.getValue().verify(dataOutput);
    dateBatchB.getValue().verify(dataOutput);
    dateBatchC.getValue().verify(dataOutput);
  }
}
/**
 * Builds a JobDataFragment wrapper around a single record batch composed of
 * the given value vectors, spanning every record in that batch.
 */
private static com.dremio.dac.model.job.JobDataFragment createDataObject(ValueVector... vv) {
  final RecordBatchData recordBatch = createRecordBatch(vv);
  final RecordBatchHolder holder = newRecordBatchHolder(recordBatch, 0, recordBatch.getRecordCount());
  return new JobDataFragmentWrapper(0,
      new JobDataFragmentImpl(new RecordBatches(asList(holder)), 0, TEST_JOB_ID));
}
RecordBatchData data2 = createRecordBatch(varChar2.getKey(), date2.getKey()); RecordBatchData data3 = createRecordBatch(varChar3.getKey(), date3.getKey()); recordBatches.add(newRecordBatchHolder(data1, 0, data1.getRecordCount())); recordBatches.add(newRecordBatchHolder(data2, 0, data2.getRecordCount())); recordBatches.add(newRecordBatchHolder(data3, 0, data3.getRecordCount())); when(jobLoader.load(anyInt(), anyInt())).thenReturn( new RecordBatches(asList( newRecordBatchHolder(data1, 2, 5), newRecordBatchHolder(data2, 1, 3), newRecordBatchHolder(data3, 0, 4) )) );