@Override
public RecordReader getRecordReader(OperatorContext context, FileSystemWrapper dfs,
    EasyDatasetSplitXAttr splitAttributes, List<SchemaPath> columns) throws ExecutionSetupException {
  return new JSONRecordReader(context, splitAttributes.getPath(), dfs, columns);
}
@Override
public RecordReader getRecordReader(OperatorContext context, FileSystemWrapper dfs,
    EasyDatasetSplitXAttr splitAttributes, List<SchemaPath> columns) throws ExecutionSetupException {
  final Path path = dfs.makeQualified(new Path(splitAttributes.getPath()));
  final ExcelFormatPluginConfig excelFormatConfig = (ExcelFormatPluginConfig) formatConfig;
  return new ExcelRecordReader(context, dfs, path, excelFormatConfig, columns);
}
@Override
public RecordReader getRecordReader(final OperatorContext context, final FileSystemWrapper dfs,
    EasyDatasetSplitXAttr splitAttributes, final List<SchemaPath> columns) throws ExecutionSetupException {
  final Path path = dfs.makeQualified(new Path(splitAttributes.getPath()));
  return new ArrowRecordReader(context, dfs, path, columns);
}
@Override
public int compare(SplitAndExtended o1e, SplitAndExtended o2e) {
  EasyDatasetSplitXAttr o1 = o1e.getExtended();
  EasyDatasetSplitXAttr o2 = o2e.getExtended();
  // Sort by path, and then by start. The most important point is to ensure that the first
  // line of a file is read first, as it may contain a header.
  int cmp = o1.getPath().compareTo(o2.getPath());
  if (cmp != 0) {
    return cmp;
  } else {
    return Long.compare(o1.getStart(), o2.getStart());
  }
}
};
@Override
public RecordReader getRecordReader(OperatorContext context, FileSystemWrapper dfs,
    EasyDatasetSplitXAttr splitAttributes, List<SchemaPath> columns) throws ExecutionSetupException {
  final Path path = dfs.makeQualified(new Path(splitAttributes.getPath()));
  // Build a FileSplit covering this split's byte range; no preferred host locations are supplied.
  final FileSplit split =
      new FileSplit(path, splitAttributes.getStart(), splitAttributes.getLength(), new String[]{""});
  return new SequenceFileRecordReader(context, split, dfs);
}
// Build permission-check tasks for the files backing each dataset split, batching paths so that
// each task checks at most PERMISSION_CHECK_TASK_BATCH_SIZE files.
private Collection<FsPermissionTask> getSplitPermissionTasks(DatasetConfig datasetConfig, FileSystemWrapper userFs, String user) {
  final SplitsPointer splitsPointer = DatasetSplitsPointer.of(context.getNamespaceService(user), datasetConfig);
  final boolean isParquet = datasetConfig.getPhysicalDataset().getFormatSettings().getType() == FileType.PARQUET;
  final List<FsPermissionTask> fsPermissionTasks = Lists.newArrayList();
  final List<Path> batch = Lists.newArrayList();

  for (DatasetSplit split : splitsPointer.getSplitIterable()) {
    final Path filePath;
    if (isParquet) {
      filePath = new Path(PARQUET_DATASET_SPLIT_XATTR_SERIALIZER.revert(split.getExtendedProperty().toByteArray()).getPath());
    } else {
      filePath = new Path(EASY_DATASET_SPLIT_XATTR_SERIALIZER.revert(split.getExtendedProperty().toByteArray()).getPath());
    }
    batch.add(filePath);

    if (batch.size() == PERMISSION_CHECK_TASK_BATCH_SIZE) {
      // Make a copy of the batch so it can be cleared and reused for the next set of paths.
      fsPermissionTasks.add(new FsPermissionTask(userFs, new ArrayList<>(batch), FsAction.READ));
      batch.clear();
    }
  }

  if (!batch.isEmpty()) {
    fsPermissionTasks.add(new FsPermissionTask(userFs, batch, FsAction.READ));
  }

  return fsPermissionTasks;
}
@Override
public RecordReader getRecordReader(OperatorContext context, FileSystemWrapper dfs,
    EasyDatasetSplitXAttr splitAttributes, List<SchemaPath> columns) throws ExecutionSetupException {
  Path path = dfs.makeQualified(new Path(splitAttributes.getPath()));
  FileSplit split = new FileSplit(path, splitAttributes.getStart(), splitAttributes.getLength(), new String[]{""});
  // Copy the text format plugin options into the parser settings before constructing the reader.
  TextParsingSettings settings = new TextParsingSettings();
  settings.set((TextFormatConfig) formatConfig);
  return new CompliantTextRecordReader(split, dfs, context, settings, columns);
}
@Override
public RecordReader getRecordReader(OperatorContext context, FileSystemWrapper dfs,
    EasyDatasetSplitXAttr splitAttributes, List<SchemaPath> columns) throws ExecutionSetupException {
  return new AvroRecordReader(context, splitAttributes.getPath(), splitAttributes.getStart(),
      splitAttributes.getLength(), columns, getFsPlugin().getFsConf());
}