if (!HiveUtils.isPartitioned(hiveDataset.getTable())) {
  throw new IllegalArgumentException("HiveDatasetVersionFinder is only compatible with partitioned hive tables");
}
List<Partition> partitions = HiveUtils.getPartitions(client.get(), hiveDataset.getTable(), Optional.<String> absent());
return Lists.newArrayList(Iterables.filter(Iterables.transform(partitions, new Function<Partition, HiveDatasetVersion>() {
  @Override
  public HiveDatasetVersion apply(Partition partition) {
    // Illustrative completion of the truncated snippet (method name assumed):
    // map each partition to its dataset version; nulls are filtered out below.
    return getDatasetVersion(partition);
  }
}), Predicates.notNull()));
/**
 * Get paths from a Hive location using the provided input format.
 */
public static Set<Path> getPaths(InputFormat<?, ?> inputFormat, Path location) throws IOException {
  JobConf jobConf = new JobConf(getHadoopConfiguration());
  Set<Path> paths = Sets.newHashSet();

  FileInputFormat.addInputPaths(jobConf, location.toString());
  InputSplit[] splits = inputFormat.getSplits(jobConf, 1000);
  for (InputSplit split : splits) {
    if (!(split instanceof FileSplit)) {
      throw new IOException("Not a file split. Found " + split.getClass().getName());
    }
    FileSplit fileSplit = (FileSplit) split;
    paths.add(fileSplit.getPath());
  }

  return paths;
}
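// Illustrative usage sketch (not from the source): list the concrete data files under a single
// partition's location by pairing HiveUtils.getInputFormat with HiveUtils.getPaths. The "partition"
// variable (an org.apache.hadoop.hive.ql.metadata.Partition) is assumed to have been fetched
// already, e.g. via HiveUtils.getPartitions.
InputFormat<?, ?> partitionFormat = HiveUtils.getInputFormat(partition.getTPartition().getSd());
Set<Path> partitionFiles = HiveUtils.getPaths(partitionFormat, partition.getDataLocation());
for (Path file : partitionFiles) {
  System.out.println("Partition file: " + file);
}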
public static HiveLocationDescriptor forPartition(Partition partition, FileSystem fs, Properties properties)
    throws IOException {
  return new HiveLocationDescriptor(partition.getDataLocation(),
      HiveUtils.getInputFormat(partition.getTPartition().getSd()), fs, properties);
}
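// Illustrative usage sketch (not from the source): build a location descriptor for a partition and
// resolve the files its input format would read. "partition", "fs", and "props" are assumed to be in
// scope; the Map<Path, FileStatus> return type of getPaths() is inferred from the snippets further
// down, where getPaths().keySet() yields the partition's paths.
HiveLocationDescriptor descriptor = HiveLocationDescriptor.forPartition(partition, fs, props);
Map<Path, FileStatus> partitionStatuses = descriptor.getPaths();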
checkPartitionedTableCompatibility(this.targetTable, this.existingTargetTable.get());
if (HiveUtils.isPartitioned(this.dataset.table)) {
  this.sourcePartitions = HiveUtils.getPartitionsMap(multiClient.getClient(source_client), this.dataset.table,
      this.partitionFilter, this.hivePartitionExtendedFilter);
  HiveAvroCopyEntityHelper.updatePartitionAttributesIfAvro(this.targetTable, this.sourcePartitions, this);
  // Illustrative completion of the truncated ternary (assignment target assumed): target-side
  // partitions are only looked up when an existing target table is present.
  this.targetPartitions = this.existingTargetTable.isPresent()
      ? HiveUtils.getPartitionsMap(multiClient.getClient(target_client), this.existingTargetTable.get(),
          this.partitionFilter, this.hivePartitionExtendedFilter)
      : Maps.<List<String>, Partition> newHashMap();
}
/**
 * Backward-compatible overload for callers that do not supply a {@link HivePartitionExtendedFilter}.
 *
 * @param client an {@link IMetaStoreClient} for the correct metastore.
 * @param table the {@link Table} for which partitions should be fetched.
 * @param filter an optional Hive partition filter.
 * @return the partitions of the input {@link Table} matching the filter.
 * @throws IOException if the partitions cannot be fetched.
 */
public static List<Partition> getPartitions(IMetaStoreClient client, Table table, Optional<String> filter)
    throws IOException {
  return getPartitions(client, table, filter, Optional.<HivePartitionExtendedFilter>absent());
}
private void checkPartitionedTableCompatibility(Table desiredTargetTable, Table existingTargetTable)
    throws IOException {
  if (!desiredTargetTable.getDataLocation().equals(existingTargetTable.getDataLocation())) {
    throw new HiveTableLocationNotMatchException(desiredTargetTable.getDataLocation(),
        existingTargetTable.getDataLocation());
  }

  if (HiveUtils.isPartitioned(desiredTargetTable) != HiveUtils.isPartitioned(existingTargetTable)) {
    throw new IOException(String.format(
        "%s: Desired target table %s partitioned, existing target table %s partitioned. Tables are incompatible.",
        this.dataset.tableIdentifier, HiveUtils.isPartitioned(desiredTargetTable) ? "is" : "is not",
        HiveUtils.isPartitioned(existingTargetTable) ? "is" : "is not"));
  }
  if (desiredTargetTable.isPartitioned()
      && !desiredTargetTable.getPartitionKeys().equals(existingTargetTable.getPartitionKeys())) {
    throw new IOException(String.format(
        "%s: Desired target table has partition keys %s, existing target table has partition keys %s. "
            + "Tables are incompatible.",
        this.dataset.tableIdentifier, gson.toJson(desiredTargetTable.getPartitionKeys()),
        gson.toJson(existingTargetTable.getPartitionKeys())));
  }
}
for (Path path : HiveUtils.getPaths(this.inputFormat, this.location)) {
  result.put(path, this.fileSystem.getFileStatus(path));
}
/**
 * @param client an {@link IMetaStoreClient} for the correct metastore.
 * @param table the {@link Table} for which we should get partitions.
 * @param filter an optional filter for partitions as would be used in Hive. Can only filter on String columns
 *               (e.g. "part = \"part1\"" or "date > \"2015\"").
 * @param hivePartitionExtendedFilterOptional an optional {@link HivePartitionExtendedFilter} applied in addition to the Hive filter.
 * @return a map from partition values to {@link Partition} for the input {@link Table}.
 */
public static Map<List<String>, Partition> getPartitionsMap(IMetaStoreClient client, Table table,
    Optional<String> filter, Optional<? extends HivePartitionExtendedFilter> hivePartitionExtendedFilterOptional)
    throws IOException {
  return Maps.uniqueIndex(getPartitions(client, table, filter, hivePartitionExtendedFilterOptional),
      new Function<Partition, List<String>>() {
        @Override
        public List<String> apply(@Nullable Partition partition) {
          if (partition == null) {
            return null;
          }
          return partition.getValues();
        }
      });
}
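// Illustrative usage sketch (not from the source): index a table's partitions by their values and
// look one up directly. "metastoreClient" and "table" are assumed to be in scope; the filter column
// name ("datepartition") and the lookup values are hypothetical and must match the table's
// partition keys in order.
Map<List<String>, Partition> partitionsByValues = HiveUtils.getPartitionsMap(metastoreClient, table,
    Optional.of("datepartition > \"2015\""), Optional.<HivePartitionExtendedFilter>absent());
Partition matched = partitionsByValues.get(ImmutableList.of("2015-06-01"));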
if (HiveUtils.isPartitioned(hiveDataset.getTable())) {
  processPartitionedTable(hiveDataset, client);
} else {
  // Non-partitioned handling elided in this excerpt.
}
if (table.isPresent()) {
  org.apache.hadoop.hive.ql.metadata.Table qlTable = new org.apache.hadoop.hive.ql.metadata.Table(table.get());
  if (HiveUtils.isPartitioned(qlTable)) {
    partitions = Optional.of(HiveUtils.getPartitions(client.get(), qlTable, Optional.<String>absent()));
  }
}
@Override
public void run() throws Exception {
  Iterator<HiveDataset> iterator = this.datasetFinder.getDatasetsIterator();
  while (iterator.hasNext()) {
    ConvertibleHiveDataset hiveDataset = (ConvertibleHiveDataset) iterator.next();
    try (AutoReturnableObject<IMetaStoreClient> client = hiveDataset.getClientPool().getClient()) {
      Set<Partition> sourcePartitions =
          new HashSet<>(HiveUtils.getPartitions(client.get(), hiveDataset.getTable(), Optional.<String>absent()));

      sourcePartitions.parallelStream().filter(partition -> isUnixTimeStamp(partition.getDataLocation().getName()))
          .forEach(partition -> {
            Arrays.stream(listFiles(partition.getDataLocation().getParent()))
                .filter(fileStatus -> !fileStatus.getPath().toString()
                    .equalsIgnoreCase(partition.getDataLocation().toString()))
                .forEach(fileStatus -> deletePath(fileStatus, this.graceTimeInMillis, true));
          });
    }
  }
}
@Override
public List<WorkUnit> getWorkunits(SourceState state) {
  try {
    this.beginGetWorkunitsTime = System.currentTimeMillis();

    initialize(state);

    EventSubmitter.submit(Optional.of(this.eventSubmitter), EventConstants.CONVERSION_FIND_HIVE_TABLES_EVENT);
    Iterator<HiveDataset> iterator = this.datasetFinder.getDatasetsIterator();

    while (iterator.hasNext()) {
      HiveDataset hiveDataset = iterator.next();
      try (AutoReturnableObject<IMetaStoreClient> client = hiveDataset.getClientPool().getClient()) {
        log.debug(String.format("Processing dataset: %s", hiveDataset));

        // Create workunits for partitions
        if (HiveUtils.isPartitioned(hiveDataset.getTable()) && state.getPropAsBoolean(
            HIVE_SOURCE_CREATE_WORKUNITS_FOR_PARTITIONS, DEFAULT_HIVE_SOURCE_CREATE_WORKUNITS_FOR_PARTITIONS)) {
          createWorkunitsForPartitionedTable(hiveDataset, client);
        } else {
          createWorkunitForNonPartitionedTable(hiveDataset);
        }
      }
    }
  } catch (IOException e) {
    throw new RuntimeException(e);
  }

  int realWorkunits = this.workunits.size();

  this.watermarker.onGetWorkunitsEnd(this.workunits);

  log.info(String.format("Created %s real workunits and %s watermark workunits", realWorkunits,
      (this.workunits.size() - realWorkunits)));

  return this.workunits;
}
public static HiveLocationDescriptor forTable(Table table, FileSystem fs, Properties properties) throws IOException {
  return new HiveLocationDescriptor(table.getDataLocation(), HiveUtils.getInputFormat(table.getTTable().getSd()),
      fs, properties);
}
/**
 * @return an instance of the {@link InputFormat} in this {@link StorageDescriptor}.
 */
public static InputFormat<?, ?> getInputFormat(StorageDescriptor sd) throws IOException {
  try {
    InputFormat<?, ?> inputFormat =
        ConstructorUtils.invokeConstructor((Class<? extends InputFormat>) Class.forName(sd.getInputFormat()));
    if (inputFormat instanceof JobConfigurable) {
      ((JobConfigurable) inputFormat).configure(new JobConf(getHadoopConfiguration()));
    }
    return inputFormat;
  } catch (ReflectiveOperationException re) {
    throw new IOException("Failed to instantiate input format.", re);
  }
}
/**
 * This method returns a sorted list of partitions.
 */
public List<Partition> getPartitionsFromDataset() throws IOException {
  try (AutoReturnableObject<IMetaStoreClient> client = getClientPool().getClient()) {
    List<Partition> partitions = HiveUtils.getPartitions(client.get(), getTable(), Optional.<String>absent());
    return sortPartitions(partitions);
  }
}
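// Illustrative usage sketch (not from the source): iterate a dataset's partitions in sorted order.
// "hiveDataset" is assumed to be an instance of the class defining getPartitionsFromDataset() above.
for (Partition partition : hiveDataset.getPartitionsFromDataset()) {
  System.out.println("Partition " + partition.getValues() + " at " + partition.getDataLocation());
}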
/**
 * Finds all files read by the table and generates {@link CopyEntity}s for duplicating the table. The semantics are as follows:
 * 1. Find all valid {@link org.apache.hadoop.hive.metastore.api.StorageDescriptor}. If the table is partitioned, the
 *    {@link org.apache.hadoop.hive.metastore.api.StorageDescriptor} of the base table will be ignored, and we will
 *    instead process the {@link org.apache.hadoop.hive.metastore.api.StorageDescriptor} of each partition.
 * 2. For each {@link org.apache.hadoop.hive.metastore.api.StorageDescriptor} find all files referred by it.
 * 3. Generate a {@link CopyableFile} for each file referred by a {@link org.apache.hadoop.hive.metastore.api.StorageDescriptor}.
 * 4. If the table is partitioned, create a file set for each partition.
 * 5. Create work units for registering, deregistering partitions / tables, and deleting unnecessary files in the target.
 *
 * For computation of target locations see {@link HiveTargetPathHelper#getTargetPath}
 */
Iterator<FileSet<CopyEntity>> getCopyEntities(CopyConfiguration configuration,
    Comparator<FileSet<CopyEntity>> prioritizer, PushDownRequestor<FileSet<CopyEntity>> requestor) throws IOException {
  if (HiveUtils.isPartitioned(this.dataset.table)) {
    return new PartitionIterator(this.sourcePartitions, configuration, prioritizer, requestor);
  } else {
    FileSet<CopyEntity> fileSet = new UnpartitionedTableFileSet(this.dataset.table.getCompleteName(), this.dataset, this);
    return Iterators.singletonIterator(fileSet);
  }
}
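// Illustrative usage sketch (not from the source): drain the iterator produced by getCopyEntities.
// "helper", "copyConfiguration", "prioritizer", and "requestor" are assumed to be provided by the
// caller; each FileSet groups the copy entities for one partition (or for the whole unpartitioned
// table) and is prioritized and executed as a unit.
Iterator<FileSet<CopyEntity>> fileSets = helper.getCopyEntities(copyConfiguration, prioritizer, requestor);
while (fileSets.hasNext()) {
  FileSet<CopyEntity> fileSet = fileSets.next();
  // Hand the file set off to the copy job for scheduling here.
}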
int addPartitionDeregisterSteps(List<CopyEntity> copyEntities, String fileSet, int initialPriority, Table table,
    Partition partition) throws IOException {
  int stepPriority = initialPriority;
  Collection<Path> partitionPaths = Lists.newArrayList();

  if (this.deleteMethod == DeregisterFileDeleteMethod.RECURSIVE) {
    partitionPaths = Lists.newArrayList(partition.getDataLocation());
  } else if (this.deleteMethod == DeregisterFileDeleteMethod.INPUT_FORMAT) {
    InputFormat<?, ?> inputFormat = HiveUtils.getInputFormat(partition.getTPartition().getSd());

    HiveLocationDescriptor targetLocation =
        new HiveLocationDescriptor(partition.getDataLocation(), inputFormat, this.targetFs, this.dataset.getProperties());

    partitionPaths = targetLocation.getPaths().keySet();
  } else if (this.deleteMethod == DeregisterFileDeleteMethod.NO_DELETE) {
    partitionPaths = Lists.newArrayList();
  }

  if (!partitionPaths.isEmpty()) {
    DeleteFileCommitStep deletePaths =
        DeleteFileCommitStep.fromPaths(this.targetFs, partitionPaths, this.dataset.getProperties(), table.getDataLocation());
    copyEntities.add(new PostPublishStep(fileSet, Maps.<String, String> newHashMap(), deletePaths, stepPriority++));
  }

  PartitionDeregisterStep deregister =
      new PartitionDeregisterStep(table.getTTable(), partition.getTPartition(), this.targetURI, this.hiveRegProps);
  copyEntities.add(new PostPublishStep(fileSet, Maps.<String, String> newHashMap(), deregister, stepPriority++));

  return stepPriority;
}