/**
 * Creates a reader that covers the given file from byte 0 through its full length,
 * delegating to the split-based overload with all columns projected.
 */
@Override
public RecordReader getRecordReader(OperatorContext context, FileSystemWrapper dfs, FileStatus status) throws ExecutionSetupException {
  final EasyDatasetSplitXAttr wholeFileSplit = new EasyDatasetSplitXAttr()
      .setPath(status.getPath().toString())
      .setStart(0L)
      .setLength(status.getLen());
  return getRecordReader(context, dfs, wholeFileSplit, GroupScan.ALL_COLUMNS);
}
/**
 * Creates a sequence-file reader for the byte range described by the split attributes.
 * The path is qualified against the target filesystem before the Hadoop split is built.
 */
@Override
public RecordReader getRecordReader(OperatorContext context, FileSystemWrapper dfs, EasyDatasetSplitXAttr splitAttributes, List<SchemaPath> columns) throws ExecutionSetupException {
  final Path qualifiedPath = dfs.makeQualified(new Path(splitAttributes.getPath()));
  // A single empty host string is passed: no locality hosts are supplied here.
  final FileSplit fileSplit =
      new FileSplit(qualifiedPath, splitAttributes.getStart(), splitAttributes.getLength(), new String[]{""});
  return new SequenceFileRecordReader(context, fileSplit, dfs);
}
@Override public int compare(SplitAndExtended o1e, SplitAndExtended o2e) { EasyDatasetSplitXAttr o1 = o1e.getExtended(); EasyDatasetSplitXAttr o2 = o2e.getExtended(); // sort by path, and then by start. The most important point is to ensure that the first line of a file is read first, // as it may contain a header. int cmp = o1.getPath().compareTo(o2.getPath()); if (cmp != 0) { return cmp; } else { return Long.compare(o1.getStart(), o2.getStart()); } } };
/** Creates a new, empty {@link EasyDatasetSplitXAttr} instance. */
public EasyDatasetSplitXAttr newMessage() {
  return new EasyDatasetSplitXAttr();
}
/**
 * Creates a JSON record reader over the split's file, projecting the requested columns.
 */
@Override
public RecordReader getRecordReader(OperatorContext context, FileSystemWrapper dfs, EasyDatasetSplitXAttr splitAttributes, List<SchemaPath> columns) throws ExecutionSetupException {
  final String filePath = splitAttributes.getPath();
  return new JSONRecordReader(context, filePath, dfs, columns);
}
/**
 * Creates an Excel record reader for the split's file using this plugin's format configuration.
 */
@Override
public RecordReader getRecordReader(OperatorContext context, FileSystemWrapper dfs, EasyDatasetSplitXAttr splitAttributes, List<SchemaPath> columns) throws ExecutionSetupException {
  final ExcelFormatPluginConfig excelConfig = (ExcelFormatPluginConfig) formatConfig;
  final Path qualifiedPath = dfs.makeQualified(new Path(splitAttributes.getPath()));
  return new ExcelRecordReader(context, dfs, qualifiedPath, excelConfig, columns);
}
// Build a synthetic split covering the whole file (start 0, length Long.MAX_VALUE) so the
// format plugin reads it end-to-end for sampling.
// NOTE(review): lowercase literal `0l` is easily misread as `01`; prefer `0L` — confirm before changing.
// AdditionalColumnsRecordReader wraps the plugin's reader with the implicit fields reported
// by explorer.getImplicitFieldsForSample(selection).
EasyDatasetSplitXAttr dataset = new EasyDatasetSplitXAttr(); dataset.setStart(0l); dataset.setLength(Long.MAX_VALUE); dataset.setPath(file.getPath().toString()); try(RecordReader reader = new AdditionalColumnsRecordReader(((EasyFormatPlugin)formatPlugin).getRecordReader(operatorContext, dfs, dataset, GroupScan.ALL_COLUMNS), explorer.getImplicitFieldsForSample(selection))) { reader.setup(mutator);
/**
 * Creates an Avro record reader for the byte range described by the split attributes,
 * using the filesystem plugin's Hadoop configuration.
 */
@Override
public RecordReader getRecordReader(OperatorContext context, FileSystemWrapper dfs, EasyDatasetSplitXAttr splitAttributes, List<SchemaPath> columns) throws ExecutionSetupException {
  final String filePath = splitAttributes.getPath();
  final long rangeStart = splitAttributes.getStart();
  final long rangeLength = splitAttributes.getLength();
  return new AvroRecordReader(context, filePath, rangeStart, rangeLength, columns, getFsPlugin().getFsConf());
}
/**
 * Creates an Arrow record reader for the split's file, projecting the requested columns.
 */
@Override
public RecordReader getRecordReader(final OperatorContext context, final FileSystemWrapper dfs, EasyDatasetSplitXAttr splitAttributes, final List<SchemaPath> columns) throws ExecutionSetupException {
  // Qualify the split path against the target filesystem before handing it to the reader.
  final Path qualifiedPath = dfs.makeQualified(new Path(splitAttributes.getPath()));
  return new ArrowRecordReader(context, dfs, qualifiedPath, columns);
}
// Split attribute for one unit of file work: records the file path and the byte range
// [start, start + length), plus an update key pairing the same path with the file's
// last-modification time (presumably used to detect stale cached metadata — confirm with callers).
new EasyDatasetSplitXAttr() .setPath(pathString) .setStart(completeFileWork.getStart()) .setLength(completeFileWork.getLength()) .setUpdateKey(new FileSystemCachedEntity() .setPath(pathString) .setLastModificationTime(completeFileWork.getStatus().getModificationTime()))
/**
 * Creates a delimited-text record reader for the byte range described by the split attributes.
 */
@Override
public RecordReader getRecordReader(OperatorContext context, FileSystemWrapper dfs, EasyDatasetSplitXAttr splitAttributes, List<SchemaPath> columns) throws ExecutionSetupException {
  // Load this plugin's text format configuration into the parser settings.
  TextParsingSettings parseSettings = new TextParsingSettings();
  parseSettings.set((TextFormatConfig) formatConfig);
  // Qualify the path against the target filesystem before building the Hadoop split.
  Path qualifiedPath = dfs.makeQualified(new Path(splitAttributes.getPath()));
  FileSplit fileSplit =
      new FileSplit(qualifiedPath, splitAttributes.getStart(), splitAttributes.getLength(), new String[]{""});
  return new CompliantTextRecordReader(fileSplit, dfs, context, parseSettings, columns);
}
private Collection<FsPermissionTask> getSplitPermissiomTasks(DatasetConfig datasetConfig, FileSystemWrapper userFs, String user) { final SplitsPointer splitsPointer = DatasetSplitsPointer.of(context.getNamespaceService(user), datasetConfig); final boolean isParquet = datasetConfig.getPhysicalDataset().getFormatSettings().getType() == FileType.PARQUET; final List<FsPermissionTask> fsPermissionTasks = Lists.newArrayList(); final List<Path> batch = Lists.newArrayList(); for (DatasetSplit split: splitsPointer.getSplitIterable()) { final Path filePath; if (isParquet) { filePath = new Path(PARQUET_DATASET_SPLIT_XATTR_SERIALIZER.revert(split.getExtendedProperty().toByteArray()).getPath()); } else { filePath = new Path(EASY_DATASET_SPLIT_XATTR_SERIALIZER.revert(split.getExtendedProperty().toByteArray()).getPath()); } batch.add(filePath); if (batch.size() == PERMISSION_CHECK_TASK_BATCH_SIZE) { // make a copy of batch fsPermissionTasks.add(new FsPermissionTask(userFs, new ArrayList<>(batch), FsAction.READ)); batch.clear(); } } if (!batch.isEmpty()) { fsPermissionTasks.add(new FsPermissionTask(userFs, batch, FsAction.READ)); } return fsPermissionTasks; }