@Override public boolean next(NullWritable key, ArrowWrapperWritable value) throws IOException { try { // Need a way to know what thread to interrupt, since this is a blocking thread. setReaderThread(Thread.currentThread()); boolean hasInput = arrowStreamReader.loadNextBatch(); if (hasInput) { VectorSchemaRoot vectorSchemaRoot = arrowStreamReader.getVectorSchemaRoot(); //There must be at least one column vector Preconditions.checkState(vectorSchemaRoot.getFieldVectors().size() > 0); if(vectorSchemaRoot.getFieldVectors().get(0).getValueCount() == 0) { //An empty batch will appear at the end of the stream return false; } value.setVectorSchemaRoot(arrowStreamReader.getVectorSchemaRoot()); return true; } else { processReaderEvent(); return false; } } catch (IOException io) { failOnInterruption(io); return false; } }
/**
 * Builds a writable wrapping an empty (zero-row) batch containing one
 * allocated vector per configured field.
 */
public ArrowWrapperWritable emptyBatch() {
  rootVector.setValueCount(0);
  final int fieldCount = fieldTypeInfos.size();
  for (int i = 0; i < fieldCount; i++) {
    final TypeInfo typeInfo = fieldTypeInfos.get(i);
    final String name = fieldNames.get(i);
    final FieldType type = toFieldType(typeInfo);
    // addOrGet creates the child vector on first use, then reuses it.
    final FieldVector vector = rootVector.addOrGet(name, type, FieldVector.class);
    vector.setInitialCapacity(0);
    vector.allocateNew();
  }
  final VectorSchemaRoot schemaRoot = new VectorSchemaRoot(rootVector);
  return new ArrowWrapperWritable(schemaRoot, allocator, rootVector);
}
/**
 * Converts an Arrow batch into row-oriented objects: an Object[rowCount][]
 * where each inner array holds one value per field.
 *
 * @param writable must be an ArrowWrapperWritable
 * @return the (reused) two-dimensional row buffer
 */
public Object deserialize(Writable writable) {
  final ArrowWrapperWritable wrapper = (ArrowWrapperWritable) writable;
  final VectorSchemaRoot schemaRoot = wrapper.getVectorSchemaRoot();
  final List<FieldVector> fieldVectors = schemaRoot.getFieldVectors();
  final int fieldCount = fieldVectors.size();
  final int rowCount = schemaRoot.getRowCount();

  vectorizedRowBatch.ensureSize(rowCount);

  // Reallocate the row buffer only when it is missing or too small; otherwise reuse it.
  if (rows == null || rows.length < rowCount) {
    rows = new Object[rowCount][];
    for (int row = 0; row < rowCount; row++) {
      rows[row] = new Object[fieldCount];
    }
  }

  // Decode each Arrow vector into its projected Hive column vector.
  for (int field = 0; field < fieldCount; field++) {
    final FieldVector fieldVector = fieldVectors.get(field);
    final ColumnVector columnVector =
        vectorizedRowBatch.cols[vectorizedRowBatch.projectedColumns[field]];
    final TypeInfo typeInfo = serDe.rowTypeInfo.getAllStructFieldTypeInfos().get(field);
    read(fieldVector, columnVector, typeInfo);
  }

  // Materialize row objects from the columnar batch, then reset it for reuse.
  for (int row = 0; row < rowCount; row++) {
    vectorExtractRow.extractRow(vectorizedRowBatch, row, rows[row]);
  }
  vectorizedRowBatch.reset();
  return rows;
}
/**
 * Validate two arrow vectorSchemaRoot are equal.
 *
 * @param root1 the 1st schema to compare
 * @param root2 the 2nd schema to compare
 * @throws IllegalArgumentException if they are different.
 */
public static void compareVectorSchemaRoot(VectorSchemaRoot root1, VectorSchemaRoot root2) {
  // Pass arguments in (root1, root2) order, consistent with this method's own
  // parameter convention and with the row/column checks below. The original
  // passed them reversed.
  compareSchemas(root1.getSchema(), root2.getSchema());
  if (root1.getRowCount() != root2.getRowCount()) {
    throw new IllegalArgumentException("Different row count:\n" + root1.getRowCount()
        + " != " + root2.getRowCount());
  }
  List<FieldVector> vectors1 = root1.getFieldVectors();
  List<FieldVector> vectors2 = root2.getFieldVectors();
  if (vectors1.size() != vectors2.size()) {
    throw new IllegalArgumentException("Different column count:\n" + vectors1.toString()
        + "\n!=\n" + vectors2.toString());
  }
  for (int i = 0; i < vectors1.size(); i++) {
    compareFieldVectors(vectors1.get(i), vectors2.get(i));
  }
}
/**
 * Evaluates the projector over {@code recordCount} rows, writing results
 * into the supplied output vectors.
 */
public void execute(int recordCount, List<ValueVector> outVectors) throws Exception {
  root.setRowCount(recordCount);
  // Gandiva consumes the raw field buffers of every input vector, in schema order.
  List<ArrowBuf> inputBuffers = Lists.newArrayList();
  for (FieldVector fieldVector : root.getFieldVectors()) {
    inputBuffers.addAll(fieldVector.getFieldBuffers());
  }
  projector.evaluate(recordCount, inputBuffers, outVectors);
}
/**
 * Serializes one record batch as a JSON object of the form
 * {"count": rowCount, "columns": [...]}.
 */
private void writeBatch(VectorSchemaRoot recordBatch) throws IOException {
  generator.writeStartObject();
  generator.writeObjectField("count", recordBatch.getRowCount());
  generator.writeArrayFieldStart("columns");
  // Emit columns in schema order, looked up by field name.
  for (Field field : recordBatch.getSchema().getFields()) {
    final FieldVector vector = recordBatch.getVector(field.getName());
    writeFromVectorIntoJson(field, vector);
  }
  generator.writeEndArray();
  generator.writeEndObject();
}
/**
 * Reads the next JSON record batch into the supplied root.
 *
 * @param root the root whose vectors receive the batch
 * @return true if a batch was read; false when the enclosing array ended
 * @throws IOException on read failure
 * @throws IllegalArgumentException on an unexpected JSON token
 */
public boolean read(VectorSchemaRoot root) throws IOException {
  final JsonToken token = parser.nextToken();
  if (token == END_ARRAY) {
    // No more batches; leave the root empty.
    root.setRowCount(0);
    return false;
  }
  if (token != START_OBJECT) {
    throw new IllegalArgumentException("Invalid token: " + token);
  }
  final int count = readNextField("count", Integer.class);
  root.setRowCount(count);
  nextFieldIs("columns");
  readToken(START_ARRAY);
  for (Field field : root.getSchema().getFields()) {
    readFromJsonIntoVector(field, root.getVector(field.getName()));
  }
  readToken(END_ARRAY);
  readToken(END_OBJECT);
  return true;
}
this.out = new WriteChannel(out); List<Field> fields = new ArrayList<>(root.getSchema().getFields().size()); Set<Long> dictionaryIdsUsed = new HashSet<>(); for (Field field : root.getSchema().getFields()) { fields.add(DictionaryUtility.toMessageFormat(field, provider, dictionaryIdsUsed)); FieldVector vector = dictionary.getVector(); int count = vector.getValueCount(); VectorSchemaRoot dictRoot = new VectorSchemaRoot( Collections.singletonList(vector.getField()), Collections.singletonList(vector), this.schema = new Schema(fields, root.getSchema().getCustomMetadata());
/**
 * Writes one record batch, rejecting batches whose schema differs from
 * the schema this writer was created with.
 *
 * @throws IllegalArgumentException if the batch schema does not match
 */
public void write(VectorSchemaRoot recordBatch) throws IOException {
  if (recordBatch.getSchema().equals(schema)) {
    writeBatch(recordBatch);
  } else {
    throw new IllegalArgumentException("record batches must have the same schema: " + schema);
  }
}
/**
 * Reads the next record batch from the JSON stream.
 *
 * @return a newly allocated VectorSchemaRoot, or null when the batch array ends
 * @throws IOException on read failure
 * @throws IllegalArgumentException on an unexpected JSON token
 */
public VectorSchemaRoot read() throws IOException {
  final JsonToken token = parser.nextToken();
  if (token == END_ARRAY) {
    return null;
  }
  if (token != START_OBJECT) {
    throw new IllegalArgumentException("Invalid token: " + token);
  }
  final VectorSchemaRoot batch = VectorSchemaRoot.create(schema, allocator);
  final int count = readNextField("count", Integer.class);
  batch.setRowCount(count);
  nextFieldIs("columns");
  readToken(START_ARRAY);
  for (Field field : schema.getFields()) {
    readFromJsonIntoVector(field, batch.getVector(field.getName()));
  }
  readToken(END_ARRAY);
  readToken(END_OBJECT);
  return batch;
}
/**
 * Releases the held record batch and vector loader.
 *
 * Uses try/finally so the vector loader is still closed when closing the
 * record batch throws (the original skipped the second close in that case,
 * leaking the loader's resources).
 */
@Override
public void close() throws IOException {
  try {
    if (arrowRecordBatch != null) {
      arrowRecordBatch.close();
    }
  } finally {
    if (vectorLoader != null) {
      vectorLoader.close();
    }
  }
}
/**
 * Ensure the reader has been initialized and reset the VectorSchemaRoot row count to 0.
 *
 * @throws IOException on error
 */
protected void prepareLoadNextBatch() throws IOException {
  ensureInitialized();
  // Clear any rows left over from the previously loaded batch before reading the next one.
  root.setRowCount(0);
}
private static ArrowWritableRecordBatch asDataVecBatch(ArrowRecordBatch arrowRecordBatch, Schema schema, VectorSchemaRoot vectorLoader) { //iterate column wise over the feature vectors, returning entries List<FieldVector> fieldVectors = new ArrayList<>(); for(int j = 0; j < schema.numColumns(); j++) { String name = schema.getName(j); FieldVector fieldVector = vectorLoader.getVector(name); fieldVectors.add(fieldVector); } ArrowWritableRecordBatch ret = new ArrowWritableRecordBatch(fieldVectors, schema); ret.setArrowRecordBatch(arrowRecordBatch); return ret; }
/**
 * Loads the record batch in the vectors.
 * will not close the record batch
 *
 * @param recordBatch the batch to load
 * @throws IllegalArgumentException if the batch's nodes/buffers do not match the schema
 */
public void load(ArrowRecordBatch recordBatch) {
  final Iterator<ArrowBuf> bufferIterator = recordBatch.getBuffers().iterator();
  final Iterator<ArrowFieldNode> nodeIterator = recordBatch.getNodes().iterator();
  // Each vector consumes its own nodes and buffers from the shared iterators,
  // in schema order.
  for (FieldVector vector : root.getFieldVectors()) {
    loadBuffers(vector, vector.getField(), bufferIterator, nodeIterator);
  }
  root.setRowCount(recordBatch.getLength());
  // Anything left over means the batch does not line up with the schema.
  if (nodeIterator.hasNext() || bufferIterator.hasNext()) {
    throw new IllegalArgumentException("not all nodes and buffers were consumed. nodes: "
        + Collections2.toList(nodeIterator).toString() + " buffers: "
        + Collections2.toList(bufferIterator).toString());
  }
}
/**
 * Builds the Gandiva projector: obtains a schema root from the incoming batch
 * (presumably restricted to the referenced fields — confirm in GandivaUtils),
 * then compiles the column expressions against that schema.
 *
 * @throws GandivaException if expression compilation fails
 */
public void build() throws GandivaException {
  root = GandivaUtils.getSchemaRoot(incoming, referencedFields);
  projector = Projector.make(root.getSchema(), columnExprList);
}
/**
 * Closes the record batch and the vector loader.
 *
 * The loader close is placed in a finally block so it runs even when
 * closing the batch throws; the original would skip it and leak.
 */
@Override
public void close() throws IOException {
  try {
    if (arrowRecordBatch != null) {
      arrowRecordBatch.close();
    }
  } finally {
    if (vectorLoader != null) {
      vectorLoader.close();
    }
  }
}
VectorSchemaRoot vectorSchemaRoot = new VectorSchemaRoot(rootVector); return new ArrowWrapperWritable(vectorSchemaRoot, allocator, rootVector);
List<FieldVector> vectors = batchData.getVectorSchemaRoot().getFieldVectors();
/** * Filter a batch of records against the expression. * @param recordCount - number of records to consume * @return the number of records that passed the filter * @throws GandivaException on evaluation exception. */ public int filterBatch(int recordCount) throws GandivaException { if (recordCount == 0) { return 0; } root.setRowCount(recordCount); List<ArrowBuf> buffers = Lists.newArrayList(); for (FieldVector v : root.getFieldVectors()) { buffers.addAll(v.getFieldBuffers()); } selectionVector.allocateNew(recordCount); // do not take ownership of the buffer. ArrowBuf svBuffer = selectionVector.getBuffer(false); SelectionVector selectionVectorGandiva = new SelectionVectorInt16(svBuffer); filter.evaluate(recordCount, buffers, selectionVectorGandiva); selectionVector.setRecordCount(selectionVectorGandiva.getRecordCount()); return selectionVector.getCount(); }
/**
 * Snapshots the root's current vectors as an ArrowRecordBatch of
 * field nodes and buffers.
 */
public ArrowRecordBatch getRecordBatch() {
  final List<ArrowFieldNode> nodes = new ArrayList<>();
  final List<ArrowBuf> buffers = new ArrayList<>();
  // Collect the node/buffer pairs for every vector, in schema order.
  root.getFieldVectors().forEach(vector -> appendNodes(vector, nodes, buffers));
  return new ArrowRecordBatch(root.getRowCount(), nodes, buffers, alignBuffers);
}