if (!HiveUtils.isPartitioned(hiveDataset.getTable())) {
  throw new IllegalArgumentException("HiveDatasetVersionFinder is only compatible with partitioned hive tables");
}
List<Partition> partitions = HiveUtils.getPartitions(client.get(), hiveDataset.getTable(), Optional.<String> absent());
return Lists.newArrayList(Iterables.filter(Iterables.transform(partitions, new Function<Partition, HiveDatasetVersion>() {
  @Override
  public HiveDatasetVersion apply(Partition partition) {
    // Illustrative completion of the truncated snippet (method name assumed):
    // map each partition to its dataset version; nulls are filtered out below.
    return getDatasetVersion(partition);
  }
}), Predicates.notNull()));
/**
 * Get paths from a Hive location using the provided input format.
 */
public static Set<Path> getPaths(InputFormat<?, ?> inputFormat, Path location) throws IOException {
  JobConf jobConf = new JobConf(getHadoopConfiguration());
  Set<Path> paths = Sets.newHashSet();

  FileInputFormat.addInputPaths(jobConf, location.toString());
  InputSplit[] splits = inputFormat.getSplits(jobConf, 1000);
  for (InputSplit split : splits) {
    if (!(split instanceof FileSplit)) {
      throw new IOException("Not a file split. Found " + split.getClass().getName());
    }
    FileSplit fileSplit = (FileSplit) split;
    paths.add(fileSplit.getPath());
  }

  return paths;
}
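// Illustrative usage sketch (not from the source): list the concrete data files under a single
// partition's location by pairing HiveUtils.getInputFormat with HiveUtils.getPaths. The "partition"
// variable (an org.apache.hadoop.hive.ql.metadata.Partition) is assumed to have been fetched
// already, e.g. via HiveUtils.getPartitions.
InputFormat<?, ?> partitionFormat = HiveUtils.getInputFormat(partition.getTPartition().getSd());
Set<Path> partitionFiles = HiveUtils.getPaths(partitionFormat, partition.getDataLocation());
for (Path file : partitionFiles) {
  System.out.println("Partition file: " + file);
}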
public static HiveLocationDescriptor forPartition(Partition partition, FileSystem fs, Properties properties)
    throws IOException {
  return new HiveLocationDescriptor(partition.getDataLocation(),
      HiveUtils.getInputFormat(partition.getTPartition().getSd()), fs, properties);
}
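// Illustrative usage sketch (not from the source): build a location descriptor for a partition and
// resolve the files its input format would read. "partition", "fs", and "props" are assumed to be in
// scope; the Map<Path, FileStatus> return type of getPaths() is inferred from the snippets further
// down, where getPaths().keySet() yields the partition's paths.
HiveLocationDescriptor descriptor = HiveLocationDescriptor.forPartition(partition, fs, props);
Map<Path, FileStatus> partitionStatuses = descriptor.getPaths();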
checkPartitionedTableCompatibility(this.targetTable, this.existingTargetTable.get());
if (HiveUtils.isPartitioned(this.dataset.table)) {
  this.sourcePartitions = HiveUtils.getPartitionsMap(multiClient.getClient(source_client), this.dataset.table,
      this.partitionFilter, this.hivePartitionExtendedFilter);
  HiveAvroCopyEntityHelper.updatePartitionAttributesIfAvro(this.targetTable, this.sourcePartitions, this);
  // Illustrative completion of the truncated ternary (assignment target assumed): target-side
  // partitions are only looked up when an existing target table is present.
  this.targetPartitions = this.existingTargetTable.isPresent()
      ? HiveUtils.getPartitionsMap(multiClient.getClient(target_client), this.existingTargetTable.get(),
          this.partitionFilter, this.hivePartitionExtendedFilter)
      : Maps.<List<String>, Partition> newHashMap();
}
/**
 * Backward-compatible overload for callers that do not supply a {@link HivePartitionExtendedFilter}.
 *
 * @param client an {@link IMetaStoreClient} for the correct metastore.
 * @param table the {@link Table} for which partitions should be fetched.
 * @param filter an optional Hive partition filter.
 * @return the partitions of the input {@link Table} matching the filter.
 * @throws IOException if the partitions cannot be fetched.
 */
public static List<Partition> getPartitions(IMetaStoreClient client, Table table, Optional<String> filter)
    throws IOException {
  return getPartitions(client, table, filter, Optional.<HivePartitionExtendedFilter>absent());
}
private void checkPartitionedTableCompatibility(Table desiredTargetTable, Table existingTargetTable)
    throws IOException {
  if (!desiredTargetTable.getDataLocation().equals(existingTargetTable.getDataLocation())) {
    throw new HiveTableLocationNotMatchException(desiredTargetTable.getDataLocation(),
        existingTargetTable.getDataLocation());
  }

  if (HiveUtils.isPartitioned(desiredTargetTable) != HiveUtils.isPartitioned(existingTargetTable)) {
    throw new IOException(String.format(
        "%s: Desired target table %s partitioned, existing target table %s partitioned. Tables are incompatible.",
        this.dataset.tableIdentifier, HiveUtils.isPartitioned(desiredTargetTable) ? "is" : "is not",
        HiveUtils.isPartitioned(existingTargetTable) ? "is" : "is not"));
  }
  if (desiredTargetTable.isPartitioned()
      && !desiredTargetTable.getPartitionKeys().equals(existingTargetTable.getPartitionKeys())) {
    throw new IOException(String.format(
        "%s: Desired target table has partition keys %s, existing target table has partition keys %s. "
            + "Tables are incompatible.",
        this.dataset.tableIdentifier, gson.toJson(desiredTargetTable.getPartitionKeys()),
        gson.toJson(existingTargetTable.getPartitionKeys())));
  }
}
for (Path path : HiveUtils.getPaths(this.inputFormat, this.location)) {
  result.put(path, this.fileSystem.getFileStatus(path));
}
/**
 * @param client an {@link IMetaStoreClient} for the correct metastore.
 * @param table the {@link Table} for which we should get partitions.
 * @param filter an optional filter for partitions as would be used in Hive. Can only filter on String columns
 *               (e.g. "part = \"part1\"" or "date > \"2015\"").
 * @param hivePartitionExtendedFilterOptional an optional {@link HivePartitionExtendedFilter} applied in addition to the Hive filter.
 * @return a map from partition values to {@link Partition} for the input {@link Table}.
 */
public static Map<List<String>, Partition> getPartitionsMap(IMetaStoreClient client, Table table,
    Optional<String> filter, Optional<? extends HivePartitionExtendedFilter> hivePartitionExtendedFilterOptional)
    throws IOException {
  return Maps.uniqueIndex(getPartitions(client, table, filter, hivePartitionExtendedFilterOptional),
      new Function<Partition, List<String>>() {
        @Override
        public List<String> apply(@Nullable Partition partition) {
          if (partition == null) {
            return null;
          }
          return partition.getValues();
        }
      });
}
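// Illustrative usage sketch (not from the source): index a table's partitions by their values and
// look one up directly. "metastoreClient" and "table" are assumed to be in scope; the filter column
// name ("datepartition") and the lookup values are hypothetical and must match the table's
// partition keys in order.
Map<List<String>, Partition> partitionsByValues = HiveUtils.getPartitionsMap(metastoreClient, table,
    Optional.of("datepartition > \"2015\""), Optional.<HivePartitionExtendedFilter>absent());
Partition matched = partitionsByValues.get(ImmutableList.of("2015-06-01"));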
if (HiveUtils.isPartitioned(hiveDataset.getTable())) {
  processPartitionedTable(hiveDataset, client);
} else {
  // Non-partitioned handling elided in this excerpt.
}
if (table.isPresent()) {
  org.apache.hadoop.hive.ql.metadata.Table qlTable = new org.apache.hadoop.hive.ql.metadata.Table(table.get());
  if (HiveUtils.isPartitioned(qlTable)) {
    partitions = Optional.of(HiveUtils.getPartitions(client.get(), qlTable, Optional.<String>absent()));
  }
}
@Override
public void run() throws Exception {
  Iterator<HiveDataset> iterator = this.datasetFinder.getDatasetsIterator();
  while (iterator.hasNext()) {
    ConvertibleHiveDataset hiveDataset = (ConvertibleHiveDataset) iterator.next();
    try (AutoReturnableObject<IMetaStoreClient> client = hiveDataset.getClientPool().getClient()) {
      Set<Partition> sourcePartitions =
          new HashSet<>(HiveUtils.getPartitions(client.get(), hiveDataset.getTable(), Optional.<String>absent()));

      sourcePartitions.parallelStream().filter(partition -> isUnixTimeStamp(partition.getDataLocation().getName()))
          .forEach(partition -> {
            Arrays.stream(listFiles(partition.getDataLocation().getParent()))
                .filter(fileStatus -> !fileStatus.getPath().toString()
                    .equalsIgnoreCase(partition.getDataLocation().toString()))
                .forEach(fileStatus -> deletePath(fileStatus, this.graceTimeInMillis, true));
          });
    }
  }
}
@Override
public List<WorkUnit> getWorkunits(SourceState state) {
  try {
    this.beginGetWorkunitsTime = System.currentTimeMillis();

    initialize(state);

    EventSubmitter.submit(Optional.of(this.eventSubmitter), EventConstants.CONVERSION_FIND_HIVE_TABLES_EVENT);
    Iterator<HiveDataset> iterator = this.datasetFinder.getDatasetsIterator();

    while (iterator.hasNext()) {
      HiveDataset hiveDataset = iterator.next();
      try (AutoReturnableObject<IMetaStoreClient> client = hiveDataset.getClientPool().getClient()) {
        log.debug(String.format("Processing dataset: %s", hiveDataset));

        // Create workunits for partitions
        if (HiveUtils.isPartitioned(hiveDataset.getTable()) && state.getPropAsBoolean(
            HIVE_SOURCE_CREATE_WORKUNITS_FOR_PARTITIONS, DEFAULT_HIVE_SOURCE_CREATE_WORKUNITS_FOR_PARTITIONS)) {
          createWorkunitsForPartitionedTable(hiveDataset, client);
        } else {
          createWorkunitForNonPartitionedTable(hiveDataset);
        }
      }
    }
  } catch (IOException e) {
    throw new RuntimeException(e);
  }

  int realWorkunits = this.workunits.size();

  this.watermarker.onGetWorkunitsEnd(this.workunits);

  log.info(String.format("Created %s real workunits and %s watermark workunits", realWorkunits,
      (this.workunits.size() - realWorkunits)));

  return this.workunits;
}
public static HiveLocationDescriptor forTable(Table table, FileSystem fs, Properties properties) throws IOException {
  return new HiveLocationDescriptor(table.getDataLocation(), HiveUtils.getInputFormat(table.getTTable().getSd()),
      fs, properties);
}
/**
 * @return an instance of the {@link InputFormat} in this {@link StorageDescriptor}.
 */
public static InputFormat<?, ?> getInputFormat(StorageDescriptor sd) throws IOException {
  try {
    InputFormat<?, ?> inputFormat =
        ConstructorUtils.invokeConstructor((Class<? extends InputFormat>) Class.forName(sd.getInputFormat()));
    if (inputFormat instanceof JobConfigurable) {
      ((JobConfigurable) inputFormat).configure(new JobConf(getHadoopConfiguration()));
    }
    return inputFormat;
  } catch (ReflectiveOperationException re) {
    throw new IOException("Failed to instantiate input format.", re);
  }
}
/**
 * This method returns a sorted list of partitions.
 */
public List<Partition> getPartitionsFromDataset() throws IOException {
  try (AutoReturnableObject<IMetaStoreClient> client = getClientPool().getClient()) {
    List<Partition> partitions = HiveUtils.getPartitions(client.get(), getTable(), Optional.<String>absent());
    return sortPartitions(partitions);
  }
}
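// Illustrative usage sketch (not from the source): iterate a dataset's partitions in sorted order.
// "hiveDataset" is assumed to be an instance of the class defining getPartitionsFromDataset() above.
for (Partition partition : hiveDataset.getPartitionsFromDataset()) {
  System.out.println("Partition " + partition.getValues() + " at " + partition.getDataLocation());
}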
/**
 * Finds all files read by the table and generates {@link CopyEntity}s for duplicating the table. The semantics are as follows:
 * 1. Find all valid {@link org.apache.hadoop.hive.metastore.api.StorageDescriptor}. If the table is partitioned, the
 *    {@link org.apache.hadoop.hive.metastore.api.StorageDescriptor} of the base table will be ignored, and we will
 *    instead process the {@link org.apache.hadoop.hive.metastore.api.StorageDescriptor} of each partition.
 * 2. For each {@link org.apache.hadoop.hive.metastore.api.StorageDescriptor} find all files referred by it.
 * 3. Generate a {@link CopyableFile} for each file referred by a {@link org.apache.hadoop.hive.metastore.api.StorageDescriptor}.
 * 4. If the table is partitioned, create a file set for each partition.
 * 5. Create work units for registering, deregistering partitions / tables, and deleting unnecessary files in the target.
 *
 * For computation of target locations see {@link HiveTargetPathHelper#getTargetPath}
 */
Iterator<FileSet<CopyEntity>> getCopyEntities(CopyConfiguration configuration,
    Comparator<FileSet<CopyEntity>> prioritizer, PushDownRequestor<FileSet<CopyEntity>> requestor) throws IOException {
  if (HiveUtils.isPartitioned(this.dataset.table)) {
    return new PartitionIterator(this.sourcePartitions, configuration, prioritizer, requestor);
  } else {
    FileSet<CopyEntity> fileSet = new UnpartitionedTableFileSet(this.dataset.table.getCompleteName(), this.dataset, this);
    return Iterators.singletonIterator(fileSet);
  }
}
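// Illustrative usage sketch (not from the source): drain the iterator produced by getCopyEntities.
// "helper", "copyConfiguration", "prioritizer", and "requestor" are assumed to be provided by the
// caller; each FileSet groups the copy entities for one partition (or for the whole unpartitioned
// table) and is prioritized and executed as a unit.
Iterator<FileSet<CopyEntity>> fileSets = helper.getCopyEntities(copyConfiguration, prioritizer, requestor);
while (fileSets.hasNext()) {
  FileSet<CopyEntity> fileSet = fileSets.next();
  // Hand the file set off to the copy job for scheduling here.
}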
int addPartitionDeregisterSteps(List<CopyEntity> copyEntities, String fileSet, int initialPriority, Table table,
    Partition partition) throws IOException {
  int stepPriority = initialPriority;
  Collection<Path> partitionPaths = Lists.newArrayList();

  if (this.deleteMethod == DeregisterFileDeleteMethod.RECURSIVE) {
    partitionPaths = Lists.newArrayList(partition.getDataLocation());
  } else if (this.deleteMethod == DeregisterFileDeleteMethod.INPUT_FORMAT) {
    InputFormat<?, ?> inputFormat = HiveUtils.getInputFormat(partition.getTPartition().getSd());

    HiveLocationDescriptor targetLocation =
        new HiveLocationDescriptor(partition.getDataLocation(), inputFormat, this.targetFs, this.dataset.getProperties());

    partitionPaths = targetLocation.getPaths().keySet();
  } else if (this.deleteMethod == DeregisterFileDeleteMethod.NO_DELETE) {
    partitionPaths = Lists.newArrayList();
  }

  if (!partitionPaths.isEmpty()) {
    DeleteFileCommitStep deletePaths =
        DeleteFileCommitStep.fromPaths(this.targetFs, partitionPaths, this.dataset.getProperties(), table.getDataLocation());
    copyEntities.add(new PostPublishStep(fileSet, Maps.<String, String> newHashMap(), deletePaths, stepPriority++));
  }

  PartitionDeregisterStep deregister =
      new PartitionDeregisterStep(table.getTTable(), partition.getTPartition(), this.targetURI, this.hiveRegProps);
  copyEntities.add(new PostPublishStep(fileSet, Maps.<String, String> newHashMap(), deregister, stepPriority++));

  return stepPriority;
}