protected HiveDataset createHiveDataset(Table table, Config datasetConfig) throws IOException {
  return new HiveDataset(this.fs, this.clientPool, new org.apache.hadoop.hive.ql.metadata.Table(table),
      this.properties, datasetConfig);
}
public HiveFileSet(String name, HiveDataset dataset) {
  super(name, dataset);
  this.table = dataset.getTable();
  this.hiveDataset = dataset;
}
}
/**
 * Parses the logical database and table name from a given {@link DbAndTable} object.
 *
 * Eg.
 * Dataset Name Pattern        : prod_$LOGICAL_DB_linkedin.prod_$LOGICAL_TABLE_linkedin
 * Source DB and Table         : prod_dbName_linkedin.prod_tableName_linkedin
 * Logical DB Token            : $LOGICAL_DB
 * Logical Table Token         : $LOGICAL_TABLE
 * Parsed Logical DB and Table : dbName.tableName
 *
 * @param datasetNamePattern Dataset name pattern.
 * @param dbAndTable Source DB and Table.
 * @param logicalDbToken Logical DB token.
 * @param logicalTableToken Logical Table token.
 * @return Parsed logical DB and Table.
 */
@VisibleForTesting
protected static DbAndTable parseLogicalDbAndTable(String datasetNamePattern, DbAndTable dbAndTable,
    String logicalDbToken, String logicalTableToken) {
  Preconditions.checkArgument(StringUtils.isNotBlank(datasetNamePattern), "Dataset name pattern must not be empty.");

  List<String> datasetNameSplit = Lists.newArrayList(SPLIT_ON_DOT.split(datasetNamePattern));
  Preconditions.checkArgument(datasetNameSplit.size() == 2, "Dataset name pattern must be of the format: "
      + "dbPrefix_$LOGICAL_DB_dbPostfix.tablePrefix_$LOGICAL_TABLE_tablePostfix (prefix / postfix are optional)");

  String dbNamePattern = datasetNameSplit.get(0);
  String tableNamePattern = datasetNameSplit.get(1);

  String logicalDb = extractTokenValueFromEntity(dbAndTable.getDb(), dbNamePattern, logicalDbToken);
  String logicalTable = extractTokenValueFromEntity(dbAndTable.getTable(), tableNamePattern, logicalTableToken);

  return new DbAndTable(logicalDb, logicalTable);
}
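// Illustrative test sketch (not from the original source): exercises parseLogicalDbAndTable
// with the exact example from the Javadoc above. It assumes the method lives on HiveDataset
// (as the constructor later in this class suggests) and that the test sits in the same
// package, since the method is @VisibleForTesting protected.
@Test
public void testParseLogicalDbAndTableSketch() {
  DbAndTable source = new DbAndTable("prod_dbName_linkedin", "prod_tableName_linkedin");
  DbAndTable logical = HiveDataset.parseLogicalDbAndTable(
      "prod_$LOGICAL_DB_linkedin.prod_$LOGICAL_TABLE_linkedin", source, "$LOGICAL_DB", "$LOGICAL_TABLE");
  Assert.assertEquals(logical.getDb(), "dbName");
  Assert.assertEquals(logical.getTable(), "tableName");
}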
public HiveTargetPathHelper(HiveDataset dataset) {
  this.dataset = dataset;
  this.relocateDataFiles = Boolean
      .valueOf(this.dataset.getProperties().getProperty(RELOCATE_DATA_FILES_KEY, DEFAULT_RELOCATE_DATA_FILES));
  this.targetTableRoot = this.dataset.getProperties().containsKey(COPY_TARGET_TABLE_ROOT)
      ? Optional.of(resolvePath(this.dataset.getProperties().getProperty(COPY_TARGET_TABLE_ROOT),
          this.dataset.getTable().getDbName(), this.dataset.getTable().getTableName()))
      : Optional.<Path> absent();

  this.targetTablePrefixTobeReplaced =
      this.dataset.getProperties().containsKey(COPY_TARGET_TABLE_PREFIX_TOBE_REPLACED)
          ? Optional.of(new Path(this.dataset.getProperties().getProperty(COPY_TARGET_TABLE_PREFIX_TOBE_REPLACED)))
          : Optional.<Path> absent();

  this.targetTablePrefixReplacement =
      this.dataset.getProperties().containsKey(COPY_TARGET_TABLE_PREFIX_REPLACEMENT)
          ? Optional.of(new Path(this.dataset.getProperties().getProperty(COPY_TARGET_TABLE_PREFIX_REPLACEMENT)))
          : Optional.<Path> absent();
}
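// Configuration sketch (hypothetical values): the constructor above reads all of its
// behavior from the dataset's Properties. Referencing the key constants avoids guessing
// the literal key strings; this assumes the constants are accessible from the caller.
Properties copyProperties = new Properties();
copyProperties.setProperty(HiveTargetPathHelper.RELOCATE_DATA_FILES_KEY, "true");
copyProperties.setProperty(HiveTargetPathHelper.COPY_TARGET_TABLE_ROOT, "/backup/warehouse");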
/**
 * This method returns a sorted list of partitions.
 */
public List<Partition> getPartitionsFromDataset() throws IOException {
  try (AutoReturnableObject<IMetaStoreClient> client = getClientPool().getClient()) {
    List<Partition> partitions = HiveUtils.getPartitions(client.get(), getTable(), Optional.<String> absent());
    return sortPartitions(partitions);
  }
}
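// Usage sketch: because the returned list is sorted, callers get a deterministic
// iteration order, e.g. for logging or diffing partitions. (hiveDataset is assumed
// to be an already-constructed HiveDataset.)
for (Partition partition : hiveDataset.getPartitionsFromDataset()) {
  log.info("Partition: " + partition.getName());
}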
DatasetDescriptor getSourceDataset() {
  String sourceTable = dataset.getTable().getDbName() + "." + dataset.getTable().getTableName();
  DatasetDescriptor sourceDataset = new DatasetDescriptor(DatasetConstants.PLATFORM_HIVE, sourceTable);
  sourceDataset.addMetadata(DatasetConstants.FS_URI, dataset.getFs().getUri().toString());
  return sourceDataset;
}
// Excerpt (surrounding statements elided): compares the registration generation time
// recorded on the existing target table against the current source table state.
    helper.getExistingTargetTable().get().getParameters().get(HiveDataset.REGISTRATION_GENERATION_TIME_MILLIS));
// ...
HiveLocationDescriptor sourceHiveDescriptor = HiveLocationDescriptor.forTable(helper.getDataset().getTable(),
    helper.getDataset().getFs(), helper.getDataset().getProperties());
// ...
getFileStatus(helper.getDataset().getFs(), sourceHiveDescriptor.getLocation());
// ...
    HiveDataset.REGISTRATION_GENERATION_TIME_MILLIS,
    helper.getDataset().getTable().getDbName() + "." + helper.getDataset().getTable().getTableName()));
// ...
return false;
} catch (IOException ioe) {
if (!HiveUtils.isPartitioned(hiveDataset.getTable())) {
  throw new IllegalArgumentException("HiveDatasetVersionFinder is only compatible with partitioned hive tables");
}

try (AutoReturnableObject<IMetaStoreClient> client = hiveDataset.getClientPool().getClient()) {
  List<Partition> partitions =
      HiveUtils.getPartitions(client.get(), hiveDataset.getTable(), Optional.<String> absent());
  return Lists.newArrayList(
      Iterables.filter(Iterables.transform(partitions, new Function<Partition, HiveDatasetVersion>() {
private HiveTargetPathHelper createTestTargetPathHelper(Properties properties) {
  HiveDataset dataset = Mockito.mock(HiveDataset.class);
  Table table = new Table(new org.apache.hadoop.hive.metastore.api.Table());
  table.setDbName("dbName");
  table.setTableName("tableName");
  Mockito.when(dataset.getTable()).thenReturn(table);
  Mockito.when(dataset.getTableRootPath()).thenReturn(Optional.of(TABLE_ROOT));
  Mockito.when(dataset.getProperties()).thenReturn(properties);
  return new HiveTargetPathHelper(dataset);
}
@Test
public void testMaterializeView() throws Exception {
  String destinationTable = "materializeView";
  File tmpDir = Files.createTempDir();
  tmpDir.deleteOnExit();

  String viewName = "myView";
  this.jdbcConnector.executeStatements(String.format("CREATE VIEW %s.%s AS SELECT * FROM %s.%s WHERE name = 'foo'",
      this.dbName, viewName, this.dbName, this.sourceTableName));

  Table view;
  try (AutoReturnableObject<IMetaStoreClient> client = pool.getClient()) {
    view = new Table(client.get().getTable(this.dbName, viewName));
  }

  HiveDataset viewDataset = new HiveDataset(FileSystem.getLocal(new Configuration()), pool, view, new Properties());

  WorkUnit workUnit = HiveMaterializer.viewMaterializationWorkUnit(viewDataset, HiveConverterUtils.StorageFormat.AVRO,
      new TableLikeStageableTableMetadata(viewDataset.getTable(), this.dbName, destinationTable,
          tmpDir.getAbsolutePath()), null);

  HiveMaterializer hiveMaterializer = new HiveMaterializer(getTaskContextForRun(workUnit));
  hiveMaterializer.run();
  Assert.assertEquals(hiveMaterializer.getWorkingState(), WorkUnitState.WorkingState.SUCCESSFUL);
  hiveMaterializer.commit();
  Assert.assertEquals(hiveMaterializer.getWorkingState(), WorkUnitState.WorkingState.SUCCESSFUL);

  List<List<String>> allTable = executeStatementAndGetResults(this.jdbcConnector,
      String.format("SELECT * FROM %s.%s", this.dbName, destinationTable), 3);
  Assert.assertEquals(allTable.size(), 4);
  Assert.assertEquals(allTable.stream().map(l -> l.get(0)).collect(Collectors.toList()),
      Lists.newArrayList("101", "103", "201", "203"));
}
// Excerpt (surrounding statements elided):
if (!this.dataset.getTableRootPath().isPresent()) {
  // ...
      this.dataset.getTable().getCompleteName()));
// ...
    sourceAndDestination.getSource().getPath().getParent(),
    this.dataset.getTableRootPath().get().getParent(), configuration);
private static List<Partition> getPartitions(String completeTableName) {
  List<String> tableList = At_SPLITTER.splitToList(completeTableName);
  if (tableList.size() != 2) {
    log.warn("Invalid table name " + completeTableName);
    return Collections.emptyList();
  }
  try (AutoReturnableObject<IMetaStoreClient> client = ComplianceRetentionJob.pool.getClient()) {
    Table table = client.get().getTable(tableList.get(0), tableList.get(1));
    HiveDataset dataset = new HiveDataset(FileSystem.newInstance(new Configuration()),
        ComplianceRetentionJob.pool, new org.apache.hadoop.hive.ql.metadata.Table(table), new Properties());
    return dataset.getPartitionsFromDataset();
  } catch (IOException | TException e) {
    log.warn("Unable to get Partitions for table " + completeTableName + " " + e.getMessage());
  }
  return Collections.emptyList();
}
}
public void initDatasetFinder(Properties properties) throws IOException {
  Preconditions.checkArgument(properties.containsKey(GOBBLIN_COMPLIANCE_DATASET_FINDER_CLASS),
      "Missing required property " + GOBBLIN_COMPLIANCE_DATASET_FINDER_CLASS);
  String finderClass = properties.getProperty(GOBBLIN_COMPLIANCE_DATASET_FINDER_CLASS);
  this.finder = GobblinConstructorUtils.invokeConstructor(DatasetsFinder.class, finderClass, new State(properties));

  Iterator<HiveDataset> datasetsIterator =
      new HiveDatasetFinder(FileSystem.newInstance(new Configuration()), properties).getDatasetsIterator();

  while (datasetsIterator.hasNext()) {
    // Drop partitions from empty tables if the property is set; otherwise skip the table.
    HiveDataset hiveDataset = datasetsIterator.next();
    List<Partition> partitionsFromDataset = hiveDataset.getPartitionsFromDataset();
    String completeTableName = hiveDataset.getTable().getCompleteName();
    if (!partitionsFromDataset.isEmpty()) {
      this.tableNamesList.add(completeTableName);
      continue;
    }
    if (!Boolean.parseBoolean(properties.getProperty(ComplianceConfigurationKeys.SHOULD_DROP_EMPTY_TABLES,
        ComplianceConfigurationKeys.DEFAULT_SHOULD_DROP_EMPTY_TABLES))) {
      continue;
    }
    if (completeTableName.contains(ComplianceConfigurationKeys.TRASH)
        || completeTableName.contains(ComplianceConfigurationKeys.BACKUP)
        || completeTableName.contains(ComplianceConfigurationKeys.STAGING)) {
      this.tablesToDrop.add(hiveDataset);
    }
  }
}
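// Configuration sketch (hypothetical job setup): opting in to dropping empty
// trash/backup/staging tables. The literal key string comes from the
// ComplianceConfigurationKeys constant used above.
Properties jobProperties = new Properties();
jobProperties.setProperty(ComplianceConfigurationKeys.SHOULD_DROP_EMPTY_TABLES, "true");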
String sourceDatabase = hiveDataset.getDbAndTable().getDb();
String sourceTable = hiveDataset.getDbAndTable().getTable();
String destinationDatabase = conversionConfig.getDestinationDbName();
String destinationTable = conversionConfig.getDestinationTableName();
public HiveDataset(FileSystem fs, HiveMetastoreClientPool clientPool, Table table, Properties properties,
    Config datasetConfig) {
  this.fs = fs;
  this.clientPool = clientPool;
  this.table = table;
  this.properties = properties;

  this.tableRootPath = PathUtils.isGlob(this.table.getDataLocation()) ? Optional.<Path> absent()
      : Optional.fromNullable(this.table.getDataLocation());

  this.tableIdentifier = this.table.getDbName() + "." + this.table.getTableName();

  this.datasetNamePattern = Optional.fromNullable(ConfigUtils.getString(datasetConfig, DATASET_NAME_PATTERN_KEY, null));
  this.dbAndTable = new DbAndTable(table.getDbName(), table.getTableName());
  if (this.datasetNamePattern.isPresent()) {
    this.logicalDbAndTable =
        parseLogicalDbAndTable(this.datasetNamePattern.get(), this.dbAndTable, LOGICAL_DB_TOKEN, LOGICAL_TABLE_TOKEN);
  } else {
    this.logicalDbAndTable = this.dbAndTable;
  }
  this.datasetConfig = resolveConfig(datasetConfig, dbAndTable, logicalDbAndTable);

  this.metricContext = Instrumented.getMetricContext(new State(properties), HiveDataset.class,
      Lists.<Tag<?>> newArrayList(new Tag<>(DATABASE, table.getDbName()), new Tag<>(TABLE, table.getTableName())));
}
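// Construction sketch (fs, clientPool, and table assumed in scope): supplying a dataset
// name pattern via the Typesafe Config makes the constructor above derive a logical DB
// and table from the physical names.
Config datasetConfig = ConfigFactory.empty().withValue(HiveDataset.DATASET_NAME_PATTERN_KEY,
    ConfigValueFactory.fromAnyRef("prod_$LOGICAL_DB_linkedin.prod_$LOGICAL_TABLE_linkedin"));
HiveDataset dataset = new HiveDataset(fs, clientPool, table, new Properties(), datasetConfig);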
// Excerpt (surrounding statements elided): resolve the source, desired-target, and
// existing-target locations, then schedule a pre-publish step that deletes stale files
// from the target.
    HiveLocationDescriptor.forTable(getTable(), getHiveDataset().getFs(), getHiveDataset().getProperties());
HiveLocationDescriptor desiredTargetLocation = HiveLocationDescriptor.forTable(this.helper.getTargetTable(),
    this.helper.getTargetFs(), getHiveDataset().getProperties());
// ...
        HiveLocationDescriptor.forTable(existingTargetTable.get(), this.helper.getTargetFs(),
            getHiveDataset().getProperties()))
    : Optional.<HiveLocationDescriptor> absent();
// ...
    DeleteFileCommitStep.fromPaths(this.helper.getTargetFs(), diffPathSet.pathsToDelete,
        getHiveDataset().getProperties());
copyEntities.add(new PrePublishStep(fileSet, Maps.<String, String> newHashMap(), deleteStep, stepPriority++));
private FileSet<CopyEntity> fileSetForPartition(final Partition partition) {
  return new HivePartitionFileSet(HiveCopyEntityHelper.this, partition,
      HiveCopyEntityHelper.this.dataset.getProperties());
}
}
/**
 * Updates the avro.schema.url in the given StorageDescriptor, if present, to point at the copy target.
 *
 * @param entity name of the entity to be changed, e.g. hive table or partition
 * @param sd StorageDescriptor of the entity
 */
public static void updateAvroSchemaURL(String entity, StorageDescriptor sd, HiveCopyEntityHelper hiveHelper) {
  String oldAvroSchemaURL = sd.getSerdeInfo().getParameters().get(HIVE_TABLE_AVRO_SCHEMA_URL);
  if (oldAvroSchemaURL != null) {
    Path oldAvroSchemaPath = new Path(oldAvroSchemaURL);
    URI sourceFileSystemURI = hiveHelper.getDataset().getFs().getUri();

    if (PathUtils.isAbsoluteAndSchemeAuthorityNull(oldAvroSchemaPath)
        || (oldAvroSchemaPath.toUri().getScheme().equals(sourceFileSystemURI.getScheme())
            && oldAvroSchemaPath.toUri().getAuthority().equals(sourceFileSystemURI.getAuthority()))) {

      String newAvroSchemaURL = hiveHelper.getTargetPathHelper().getTargetPath(oldAvroSchemaPath,
          hiveHelper.getTargetFileSystem(), Optional.<Partition> absent(), true).toString();

      sd.getSerdeInfo().getParameters().put(HIVE_TABLE_AVRO_SCHEMA_URL, newAvroSchemaURL);
      log.info(String.format("For entity %s, change %s from %s to %s", entity, HIVE_TABLE_AVRO_SCHEMA_URL,
          oldAvroSchemaURL, newAvroSchemaURL));
    }
  }
}
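// Usage sketch (helper and targetTable assumed in scope): rewrite the avro.schema.url
// of a target table's storage descriptor before registration, so it no longer points
// at the source file system.
StorageDescriptor sd = targetTable.getTTable().getSd();
updateAvroSchemaURL(targetTable.getCompleteName(), sd, helper);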
this.eventSubmitter = new EventSubmitter.Builder(dataset.getMetricContext(), "hive.dataset.copy").build();
MultiTimingEvent multiTimer = closer.register(new MultiTimingEvent(this.eventSubmitter, "HiveCopySetup", true));

this.hiveRegProps = new HiveRegProps(new State(this.dataset.getProperties()));
this.targetURI = Optional.fromNullable(this.dataset.getProperties().getProperty(TARGET_METASTORE_URI_KEY));
this.targetClientPool = HiveMetastoreClientPool.get(this.dataset.getProperties(), this.targetURI);
this.targetDatabase = Optional.fromNullable(this.dataset.getProperties().getProperty(TARGET_DATABASE_KEY))
    .or(this.dataset.table.getDbName());
this.existingEntityPolicy = ExistingEntityPolicy.valueOf(this.dataset.getProperties()
    .getProperty(EXISTING_ENTITY_POLICY_KEY, DEFAULT_EXISTING_ENTITY_POLICY).toUpperCase());
this.unmanagedDataPolicy = UnmanagedDataPolicy.valueOf(this.dataset.getProperties()
    .getProperty(UNMANAGED_DATA_POLICY_KEY, DEFAULT_UNMANAGED_DATA_POLICY).toUpperCase());

this.deleteMethod = this.dataset.getProperties().containsKey(DELETE_FILES_ON_DEREGISTER)
    ? DeregisterFileDeleteMethod
        .valueOf(this.dataset.getProperties().getProperty(DELETE_FILES_ON_DEREGISTER).toUpperCase())
    : DEFAULT_DEREGISTER_DELETE_METHOD;

if (this.dataset.getProperties().containsKey(COPY_PARTITION_FILTER_GENERATOR)) {
  try {
    PartitionFilterGenerator generator = GobblinConstructorUtils.invokeFirstConstructor(
        (Class<PartitionFilterGenerator>) Class
            .forName(this.dataset.getProperties().getProperty(COPY_PARTITION_FILTER_GENERATOR)),
        Lists.<Object> newArrayList(this.dataset.getProperties()), Lists.newArrayList());
    this.partitionFilter = Optional.of(generator.getFilter(this.dataset));
    log.info(String.format("Dynamic partition filter for table %s: %s.",
        this.dataset.table.getCompleteName(), this.partitionFilter.get()));
  } // catch clause elided in the original excerpt
} else {
  // The excerpt's trailing fragment appears to belong to the no-generator branch:
  // fall back to a constant partition filter taken from the properties.
  this.partitionFilter =
      Optional.fromNullable(this.dataset.getProperties().getProperty(COPY_PARTITIONS_FILTER_CONSTANT));
}
  return targetFs.makeQualified(targetPathWithoutSchemeAndAuthority);
} else if (this.targetTableRoot.isPresent()) {
  Preconditions.checkArgument(this.dataset.getTableRootPath().isPresent(),
      "Cannot move paths to a new root unless table has exactly one location.");
  Preconditions.checkArgument(PathUtils.isAncestor(this.dataset.getTableRootPath().get(), sourcePath),
      "When moving paths to a new root, all locations must be descendants of the table root location. "
          + "Table root location: %s, file location: %s.",
      this.dataset.getTableRootPath(), sourcePath);

  Path relativePath = PathUtils.relativizePath(sourcePath, this.dataset.getTableRootPath().get());
  return targetFs.makeQualified(new Path(this.targetTableRoot.get(), relativePath));
} else {
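// Worked example of the relocation branch above, with hypothetical paths: given a table
// root of /warehouse/myTable and a target root of /backup/myTable, the source file
// /warehouse/myTable/ds=2020/part-0.avro relativizes to ds=2020/part-0.avro and lands
// at /backup/myTable/ds=2020/part-0.avro.
Path relative = PathUtils.relativizePath(
    new Path("/warehouse/myTable/ds=2020/part-0.avro"), new Path("/warehouse/myTable"));
Path target = new Path(new Path("/backup/myTable"), relative); // /backup/myTable/ds=2020/part-0.avro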