/**
 * Get a stream of {@link Dataset}s found.
 *
 * @param desiredCharacteristics desired {@link java.util.Spliterator} characteristics of this stream. The returned
 *                               stream need not satisfy these characteristics; the argument merely indicates that the
 *                               caller will run optimally when they are present, allowing pushdown of those
 *                               characteristics. For example, {@link java.util.Spliterator#SORTED} can sometimes be
 *                               pushed down at a cost, so the {@link DatasetsFinder} should only push it down when it
 *                               is valuable to the caller.
 * @param suggestedOrder suggested order of the datasets in the stream. An implementation may or may not return the
 *                       entries in that order; if it does, it should ensure the spliterator is annotated as such.
 * @return a stream of {@link Dataset}s found.
 * @throws IOException if the datasets cannot be listed.
 */
default Stream<T> getDatasetsStream(int desiredCharacteristics, Comparator<T> suggestedOrder) throws IOException {
  return StreamSupport.stream(Spliterators.spliteratorUnknownSize(getDatasetsIterator(), 0), false);
}
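// For concreteness, a minimal sketch of an implementation that honors the SORTED pushdown. This is an
// illustrative assumption, not code from the snippet above: "MyDataset" is a hypothetical dataset type,
// and the body relies on Stream.sorted(Comparator) annotating the downstream spliterator as SORTED.
@Override
public Stream<MyDataset> getDatasetsStream(int desiredCharacteristics, Comparator<MyDataset> suggestedOrder)
    throws IOException {
  Stream<MyDataset> datasets =
      StreamSupport.stream(Spliterators.spliteratorUnknownSize(getDatasetsIterator(), 0), false);
  // Pay the sorting cost only when the caller asked for SORTED and supplied an order.
  if ((desiredCharacteristics & Spliterator.SORTED) != 0 && suggestedOrder != null) {
    return datasets.sorted(suggestedOrder);
  }
  return datasets;
}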
private Stream<WorkUnit> createWorkUnitStream(SourceState state) throws IOException {
  IterableDatasetFinder datasetsFinder = createDatasetsFinder(state);
  Stream<Dataset> datasetStream = datasetsFinder.getDatasetsStream(0, null);

  if (this.drilldownIntoPartitions) {
    return datasetStream.flatMap(dataset -> {
      if (dataset instanceof PartitionableDataset) {
        try {
          return (Stream<PartitionableDataset.DatasetPartition>)
              ((PartitionableDataset) dataset).getPartitions(0, null);
        } catch (IOException ioe) {
          log.error("Failed to get partitions for dataset " + dataset.getUrn(), ioe);
          return Stream.empty();
        }
      } else {
        return Stream.of(new DatasetWrapper(dataset));
      }
    }).map(this::workUnitForPartitionInternal).filter(Objects::nonNull);
  } else {
    return datasetStream.map(this::workUnitForDataset).filter(Objects::nonNull);
  }
}
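// Both branches of the flatMap above must yield the same element type, which is why a plain Dataset is
// wrapped. A minimal sketch of such an adapter, assuming (not confirmed by the snippet) that
// PartitionableDataset.DatasetPartition exposes getDataset() and getUrn():
private static final class DatasetWrapper implements PartitionableDataset.DatasetPartition {
  private final Dataset dataset;

  DatasetWrapper(Dataset dataset) {
    this.dataset = dataset;
  }

  @Override
  public Dataset getDataset() {
    return this.dataset;
  }

  @Override
  public String getUrn() {
    return this.dataset.getUrn();
  }
}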
@Override
public List<WorkUnit> getWorkunits(SourceState state) {
  try {
    this.beginGetWorkunitsTime = System.currentTimeMillis();

    initialize(state);

    EventSubmitter.submit(Optional.of(this.eventSubmitter), EventConstants.CONVERSION_FIND_HIVE_TABLES_EVENT);

    Iterator<HiveDataset> iterator = this.datasetFinder.getDatasetsIterator();
    while (iterator.hasNext()) {
      HiveDataset hiveDataset = iterator.next();
      try (AutoReturnableObject<IMetaStoreClient> client = hiveDataset.getClientPool().getClient()) {
        log.debug(String.format("Processing dataset: %s", hiveDataset));

        // Create workunits for partitions
        if (HiveUtils.isPartitioned(hiveDataset.getTable())
            && state.getPropAsBoolean(HIVE_SOURCE_CREATE_WORKUNITS_FOR_PARTITIONS,
                DEFAULT_HIVE_SOURCE_CREATE_WORKUNITS_FOR_PARTITIONS)) {
          createWorkunitsForPartitionedTable(hiveDataset, client);
        } else {
          createWorkunitForNonPartitionedTable(hiveDataset);
        }
      }
    }
  } catch (IOException e) {
    throw new RuntimeException(e);
  }

  int realWorkunits = this.workunits.size();

  this.watermarker.onGetWorkunitsEnd(this.workunits);

  log.info(String.format("Created %s real workunits and %s watermark workunits", realWorkunits,
      this.workunits.size() - realWorkunits));

  return this.workunits;
}
Stream<Dataset> datasetStream =
    datasetsFinder.getDatasetsStream(Spliterator.SORTED, this.lexicographicalComparator);
// The finder is free to ignore the hint, so the caller sorts defensively.
datasetStream = sortStreamLexicographically(datasetStream);
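// A minimal sketch of what the defensive sort helper might look like; the actual private method is not
// shown in the snippet. Per the contract documented on getDatasetsStream, a spliterator annotated as
// SORTED implies the suggested order was honored, so the redundant sort can be skipped.
private Stream<Dataset> sortStreamLexicographically(Stream<Dataset> datasetStream) {
  Spliterator<Dataset> spliterator = datasetStream.spliterator();
  if (spliterator.hasCharacteristics(Spliterator.SORTED)) {
    // The finder pushed the sort down; re-wrap the spliterator without sorting again.
    return StreamSupport.stream(spliterator, false);
  }
  return StreamSupport.stream(spliterator, false).sorted(this.lexicographicalComparator);
}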
// Lazily wrap each dataset from the finder in a CopyableDatasetRequestor (Guava Iterators.transform).
Iterator<CopyableDatasetRequestor> requestorIterator = Iterators
    .transform(iterableDatasetFinder.getDatasetsIterator(),
        new CopyableDatasetRequestor.Factory(targetFs, copyConfiguration, log));
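// Iterators.transform takes a Guava Function, so the Factory presumably adapts each dataset into a
// requestor. A hedged sketch of that shape; the CopyableDatasetRequestor constructor and the exact field
// types are assumptions, not code from the snippet:
public static class Factory implements com.google.common.base.Function<Dataset, CopyableDatasetRequestor> {
  private final FileSystem targetFs;
  private final CopyConfiguration copyConfiguration;
  private final Logger log;

  public Factory(FileSystem targetFs, CopyConfiguration copyConfiguration, Logger log) {
    this.targetFs = targetFs;
    this.copyConfiguration = copyConfiguration;
    this.log = log;
  }

  @Override
  public CopyableDatasetRequestor apply(Dataset dataset) {
    // Defer any expensive work to the requestor itself; this only captures context.
    return new CopyableDatasetRequestor(dataset, this.targetFs, this.copyConfiguration, this.log);
  }
}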