/**
 * Creates a reader that wraps {@code inner} and coerces its output batches to
 * {@code targetSchema}: for every scalar target field a cast expression is built over
 * the correspondingly named input field; union/complex fields are passed through
 * unchanged (see inline comments).
 *
 * @param context      operator context supplying the allocator
 * @param columns      projected columns, forwarded to the superclass
 * @param inner        the underlying reader whose output is coerced
 * @param targetSchema the schema the outgoing batches must conform to
 */
public CoercionReader(OperatorContext context, List<SchemaPath> columns, RecordReader inner, BatchSchema targetSchema) {
  super(context, columns);
  // incoming vectors are owned by the sample mutator; the inner reader writes into them
  this.mutator = new SampleMutator(context.getAllocator());
  this.incoming = mutator.getContainer();
  this.inner = inner;
  this.outgoing = new VectorContainer(context.getAllocator());
  this.targetSchema = targetSchema;
  this.exprs = new ArrayList<>(targetSchema.getFieldCount());
  for (Field field : targetSchema.getFields()) {
    final FieldReference inputRef = FieldReference.getWithQuotedRef(field.getName());
    final CompleteType targetType = CompleteType.fromField(field);
    if (targetType.isUnion() || targetType.isComplex()) {
      // we are assuming that map and list fields won't need coercion but inner reader may rely on sampling
      // a handful of rows to figure out the schema and if the list/map is empty in those rows, the schema will be
      // incomplete
      exprs.add(new NamedExpression(inputRef, inputRef));
      // one way to fix this issue is to add the target field in the incoming container and rely on
      // schema learning to handle any changes we hit when reading from the underlying reader
      mutator.addField(field, TypeHelper.getValueVectorClass(field));
    } else {
      // scalar field: project the input through a cast to the target major type
      final MajorType majorType = MajorTypeHelper.getMajorTypeForField(field);
      LogicalExpression cast = FunctionCallFactory.createCast(majorType, inputRef);
      exprs.add(new NamedExpression(cast, inputRef));
    }
    //TODO check that the expression type is a subset of the targetSchema type
  }
}
/**
 * Allocates memory for this reader's own (outgoing) vectors via the superclass,
 * then delegates allocation of the pre-coercion vectors to the wrapped reader.
 *
 * @param vectorMap output vectors keyed by field name
 * @throws OutOfMemoryException if either allocation fails
 */
@Override
public void allocate(Map<String, ValueVector> vectorMap) throws OutOfMemoryException {
  super.allocate(vectorMap);
  // the inner reader writes into the sample mutator's vectors, not into vectorMap
  inner.allocate(mutator.getFieldVectorMap());
}
/**
 * Reads the next batch from the inner reader, rebuilds the coercion projection if the
 * incoming schema changed, and projects the records into the outgoing container.
 *
 * @return the number of records read from the inner reader
 */
@Override
public int next() {
  int recordCount = inner.next();
  if (mutator.isSchemaChanged()) {
    // regenerate the projector/expressions for the new incoming schema
    newSchema();
  }
  incoming.setAllCount(recordCount);
  if (DEBUG_PRINT) {
    // debug-only diagnostics; writes to stdout rather than the logger by design of this flag
    FragmentHandle h = context.getFragmentHandle();
    outgoing.buildSchema();
    String op = String.format("CoercionReader:%d:%d:%d --> (%d), %s", h.getMajorFragmentId(), h.getMinorFragmentId(), context.getStats().getOperatorId(), recordCount, outgoing.getSchema());
    System.out.println(op);
    BatchPrinter.printBatch(mutator.getContainer());
  }
  // NOTE(review): projector appears to be set by newSchema(); confirm it is non-null before the first data batch
  if (projector != null) {
    projector.projectRecords(recordCount);
    for (final ValueVector v : allocationVectors) {
      v.setValueCount(recordCount);
    }
  }
  return recordCount;
}
// NOTE(review): fragment — the enclosing method and the opening of this try-with-resources
// statement are outside this view; only comments added here.
readDefinition, pluginConfig);
    final SampleMutator mutator = new SampleMutator(sampleAllocator)
) {
  // sample one batch so the mutator can learn the schema from real data
  reader.setup(mutator);
  reader.next();
  // materialize and return the learned schema (no selection vector)
  mutator.getContainer().buildSchema(SelectionVectorMode.NONE);
  return mutator.getContainer().getSchema();
// NOTE(review): fragment — the `try (` opening this resource list is outside this view.
BufferAllocator sampleAllocator = context.getAllocator().newChildAllocator("sample-alloc", 0, Long.MAX_VALUE);
OperatorContextImpl operatorContext = new OperatorContextImpl(context.getConfig(), sampleAllocator, context.getOptionManager(), 1000);
SampleMutator mutator = new SampleMutator(sampleAllocator)
) {
  final Optional<FileStatus> firstFileO = selection.getFirstFile();
  // allocate a 100-row sample batch; presumably the schema here comes from
  // pre-declared fields rather than read data — TODO confirm against the unseen code above
  mutator.allocate(100);
  mutator.getContainer().buildSchema(BatchSchema.SelectionVectorMode.NONE);
  return mutator.getContainer().getSchema();
@Test public void testFileNotFound() { FileSplit split = mock(FileSplit.class); when(split.getPath()).thenReturn(new Path("/notExist/notExitFile")); TextParsingSettings settings = mock(TextParsingSettings.class); when(settings.isHeaderExtractionEnabled()).thenReturn(true); SchemaPath column = mock(SchemaPath.class); List<SchemaPath> columns = new ArrayList<>(1); columns.add(column); SabotContext context = mock(SabotContext.class); BufferAllocator allocator = new RootAllocator(Long.MAX_VALUE); when(context.getAllocator()).thenReturn(allocator); Path path = new Path("/notExist"); try (BufferAllocator sampleAllocator = context.getAllocator().newChildAllocator("sample-alloc", 0, Long.MAX_VALUE); OperatorContextImpl operatorContext = new OperatorContextImpl(context.getConfig(), sampleAllocator, context.getOptionManager(), 1000); FileSystemWrapper dfs = FileSystemWrapper.get(path, new Configuration()); SampleMutator mutator = new SampleMutator(sampleAllocator); CompliantTextRecordReader reader = new CompliantTextRecordReader(split, dfs, operatorContext, settings, columns); ){ reader.setup(mutator); } catch (Exception e) { // java.io.FileNotFoundException is expected, but memory leak is not expected. assertTrue(e.getCause() instanceof FileNotFoundException); } allocator.close(); } }
// NOTE(review): fragment — the `try (` opening this resource list and the brace closing
// the for-loop below are outside this view; only comments added here.
BufferAllocator sampleAllocator = context.getAllocator().newChildAllocator("sample-alloc", 0, Long.MAX_VALUE);
OperatorContextImpl operatorContext = new OperatorContextImpl(context.getConfig(), sampleAllocator, context.getOptionManager(), 1000);
SampleMutator mutator = new SampleMutator(sampleAllocator)
) {
  final ImplicitFilesystemColumnFinder explorer = new ImplicitFilesystemColumnFinder(context.getOptionManager(), dfs, GroupScan.ALL_COLUMNS);
  reader.setup(mutator);
  // collect the vectors created during setup, keyed by field name
  Map<String, ValueVector> fieldVectorMap = new HashMap<>();
  for (VectorWrapper<?> vw : mutator.getContainer()) {
    fieldVectorMap.put(vw.getField().getName(), vw.getValueVector());
  // NOTE(review): the for-loop's closing brace is not visible in this chunk
  mutator.getContainer().buildSchema(BatchSchema.SelectionVectorMode.NONE);
  return mutator.getContainer().getSchema();
/**
 * Samples an HBase table to derive its batch schema. Any previously persisted schema
 * from {@code oldConfig} is materialized first so schema learning merges with it, then
 * the row key and one struct field per column family are pre-declared before reading a
 * small sample batch.
 *
 * @param descriptor HBase table descriptor supplying the column families
 * @param oldConfig  previously stored dataset config whose schema (if any) seeds sampling
 * @return the schema learned from the sample batch
 * @throws Exception if sampling fails; setup errors are rethrown as UserException
 */
private BatchSchema getSampledSchema(HTableDescriptor descriptor, DatasetConfig oldConfig) throws Exception {
  BatchSchema oldSchema = null;
  ByteString bytes = oldConfig != null ? DatasetHelper.getSchemaBytes(oldConfig) : null;
  if (bytes != null) {
    oldSchema = BatchSchema.deserialize(bytes);
  }
  // full-table scan spec with no filters/projections — only a sample is read
  final HBaseSubScanSpec spec = new HBaseSubScanSpec(getNamespace(), getTableName(), null, null, null);
  try (
      BufferAllocator allocator = context.getAllocator().newChildAllocator("hbase-sample", 0, Long.MAX_VALUE);
      SampleMutator mutator = new SampleMutator(allocator);
      HBaseRecordReader reader = new HBaseRecordReader(connect.getConnection(), spec, GroupScan.ALL_COLUMNS, null, true);
  ) {
    // keep the sample small: one 100-row batch is enough to learn the schema
    reader.setNumRowsPerBatch(100);
    if (oldSchema != null) {
      oldSchema.materializeVectors(GroupScan.ALL_COLUMNS, mutator);
    }
    // add row key.
    mutator.addField(CompleteType.VARBINARY.toField(HBaseRecordReader.ROW_KEY), ValueVector.class);
    // add all column families.
    for (HColumnDescriptor col : descriptor.getFamilies()) {
      mutator.addField(CompleteType.struct().toField(col.getNameAsString()), ValueVector.class);
    }
    reader.setup(mutator);
    reader.next();
    mutator.getContainer().buildSchema(SelectionVectorMode.NONE);
    return mutator.getContainer().getSchema();
  } catch (ExecutionSetupException e) {
    // preserve the cause so the setup failure is diagnosable from the user error
    throw UserException.dataReadError(e).message("Unable to sample schema for table %s.", key).build(logger);
  }
}
// NOTE(review): fragment — the enclosing method begins outside this view; looks like a
// debug-print path of CoercionReader (h is presumably a FragmentHandle — verify).
mutator.isSchemaChanged();
String op = String.format("CoercionReader:%d:%d:%d, %s --> %s", h.getMajorFragmentId(), h.getMinorFragmentId(), context.getStats().getOperatorId(), incoming.getSchema(), outgoing.getSchema());
System.out.println(op);
// NOTE(review): hard-coded count of 2 — presumably a fixed-size debug sample; confirm
mutator.getContainer().setAllCount(2);
BatchPrinter.printBatch(mutator.getContainer());