/**
 * Get a stream of {@link Dataset}s found.
 *
 * @param desiredCharacteristics desired {@link java.util.Spliterator} characteristics of this stream. The returned
 *                               stream need not satisfy these characteristics; the argument merely indicates that the
 *                               caller will run optimally when they are present, allowing pushdown of those
 *                               characteristics. For example, {@link java.util.Spliterator#SORTED} can sometimes be
 *                               pushed down at a cost, so the {@link DatasetsFinder} should only push it down when it
 *                               is valuable to the caller.
 * @param suggestedOrder suggested order of the datasets in the stream. An implementation may or may not return the
 *                       entries in that order; if it does, it should ensure the spliterator is annotated as such.
 * @return a stream of {@link Dataset}s found.
 * @throws IOException if the datasets cannot be listed.
 */
default Stream<T> getDatasetsStream(int desiredCharacteristics, Comparator<T> suggestedOrder) throws IOException {
  return StreamSupport.stream(Spliterators.spliteratorUnknownSize(getDatasetsIterator(), 0), false);
}
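// For concreteness, a minimal sketch of an implementation that honors the SORTED pushdown. This is an
// illustrative assumption, not code from the snippet above: "MyDataset" is a hypothetical dataset type,
// and the body relies on Stream.sorted(Comparator) annotating the downstream spliterator as SORTED.
@Override
public Stream<MyDataset> getDatasetsStream(int desiredCharacteristics, Comparator<MyDataset> suggestedOrder)
    throws IOException {
  Stream<MyDataset> datasets =
      StreamSupport.stream(Spliterators.spliteratorUnknownSize(getDatasetsIterator(), 0), false);
  // Pay the sorting cost only when the caller asked for SORTED and supplied an order.
  if ((desiredCharacteristics & Spliterator.SORTED) != 0 && suggestedOrder != null) {
    return datasets.sorted(suggestedOrder);
  }
  return datasets;
}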
private Stream<WorkUnit> createWorkUnitStream(SourceState state) throws IOException {
  IterableDatasetFinder datasetsFinder = createDatasetsFinder(state);
  Stream<Dataset> datasetStream = datasetsFinder.getDatasetsStream(0, null);

  if (this.drilldownIntoPartitions) {
    return datasetStream.flatMap(dataset -> {
      if (dataset instanceof PartitionableDataset) {
        try {
          return (Stream<PartitionableDataset.DatasetPartition>)
              ((PartitionableDataset) dataset).getPartitions(0, null);
        } catch (IOException ioe) {
          log.error("Failed to get partitions for dataset " + dataset.getUrn(), ioe);
          return Stream.empty();
        }
      } else {
        return Stream.of(new DatasetWrapper(dataset));
      }
    }).map(this::workUnitForPartitionInternal).filter(Objects::nonNull);
  } else {
    return datasetStream.map(this::workUnitForDataset).filter(Objects::nonNull);
  }
}
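// Both branches of the flatMap above must yield the same element type, which is why a plain Dataset is
// wrapped. A minimal sketch of such an adapter, assuming (not confirmed by the snippet) that
// PartitionableDataset.DatasetPartition exposes getDataset() and getUrn():
private static final class DatasetWrapper implements PartitionableDataset.DatasetPartition {
  private final Dataset dataset;

  DatasetWrapper(Dataset dataset) {
    this.dataset = dataset;
  }

  @Override
  public Dataset getDataset() {
    return this.dataset;
  }

  @Override
  public String getUrn() {
    return this.dataset.getUrn();
  }
}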
@Override
public List<WorkUnit> getWorkunits(SourceState state) {
  try {
    this.beginGetWorkunitsTime = System.currentTimeMillis();

    initialize(state);

    EventSubmitter.submit(Optional.of(this.eventSubmitter), EventConstants.CONVERSION_FIND_HIVE_TABLES_EVENT);

    Iterator<HiveDataset> iterator = this.datasetFinder.getDatasetsIterator();
    while (iterator.hasNext()) {
      HiveDataset hiveDataset = iterator.next();
      try (AutoReturnableObject<IMetaStoreClient> client = hiveDataset.getClientPool().getClient()) {
        log.debug(String.format("Processing dataset: %s", hiveDataset));

        // Create workunits for partitions
        if (HiveUtils.isPartitioned(hiveDataset.getTable())
            && state.getPropAsBoolean(HIVE_SOURCE_CREATE_WORKUNITS_FOR_PARTITIONS,
                DEFAULT_HIVE_SOURCE_CREATE_WORKUNITS_FOR_PARTITIONS)) {
          createWorkunitsForPartitionedTable(hiveDataset, client);
        } else {
          createWorkunitForNonPartitionedTable(hiveDataset);
        }
      }
    }
  } catch (IOException e) {
    throw new RuntimeException(e);
  }

  int realWorkunits = this.workunits.size();

  this.watermarker.onGetWorkunitsEnd(this.workunits);

  log.info(String.format("Created %s real workunits and %s watermark workunits", realWorkunits,
      this.workunits.size() - realWorkunits));

  return this.workunits;
}
Stream<Dataset> datasetStream =
    datasetsFinder.getDatasetsStream(Spliterator.SORTED, this.lexicographicalComparator);
// The finder is free to ignore the hint, so the caller sorts defensively.
datasetStream = sortStreamLexicographically(datasetStream);
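// A minimal sketch of what the defensive sort helper might look like; the actual private method is not
// shown in the snippet. Per the contract documented on getDatasetsStream, a spliterator annotated as
// SORTED implies the suggested order was honored, so the redundant sort can be skipped.
private Stream<Dataset> sortStreamLexicographically(Stream<Dataset> datasetStream) {
  Spliterator<Dataset> spliterator = datasetStream.spliterator();
  if (spliterator.hasCharacteristics(Spliterator.SORTED)) {
    // The finder pushed the sort down; re-wrap the spliterator without sorting again.
    return StreamSupport.stream(spliterator, false);
  }
  return StreamSupport.stream(spliterator, false).sorted(this.lexicographicalComparator);
}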
// Lazily wrap each dataset from the finder in a CopyableDatasetRequestor (Guava Iterators.transform).
Iterator<CopyableDatasetRequestor> requestorIterator = Iterators
    .transform(iterableDatasetFinder.getDatasetsIterator(),
        new CopyableDatasetRequestor.Factory(targetFs, copyConfiguration, log));
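// Iterators.transform takes a Guava Function, so the Factory presumably adapts each dataset into a
// requestor. A hedged sketch of that shape; the CopyableDatasetRequestor constructor and the exact field
// types are assumptions, not code from the snippet:
public static class Factory implements com.google.common.base.Function<Dataset, CopyableDatasetRequestor> {
  private final FileSystem targetFs;
  private final CopyConfiguration copyConfiguration;
  private final Logger log;

  public Factory(FileSystem targetFs, CopyConfiguration copyConfiguration, Logger log) {
    this.targetFs = targetFs;
    this.copyConfiguration = copyConfiguration;
    this.log = log;
  }

  @Override
  public CopyableDatasetRequestor apply(Dataset dataset) {
    // Defer any expensive work to the requestor itself; this only captures context.
    return new CopyableDatasetRequestor(dataset, this.targetFs, this.copyConfiguration, this.log);
  }
}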