/**
 * Generates a CTAS statement to dump the contents of a table / partition into a new table.
 * @param outputDbAndTable output db and table where contents should be written.
 * @param sourceEntity source table / partition.
 * @param partitionDMLInfo map of partition values.
 * @param storageFormat format of output table.
 * @param outputTableLocation location where files of output table should be written.
 */
public static String generateStagingCTASStatementFromSelectStar(HiveDatasetFinder.DbAndTable outputDbAndTable,
    HiveDatasetFinder.DbAndTable sourceEntity, Map<String, String> partitionDMLInfo, StorageFormat storageFormat,
    String outputTableLocation) {
  StringBuilder sourceQueryBuilder = new StringBuilder("SELECT * FROM `").append(sourceEntity.getDb())
      .append("`.`").append(sourceEntity.getTable()).append("`");
  if (partitionDMLInfo != null && !partitionDMLInfo.isEmpty()) {
    sourceQueryBuilder.append(" WHERE ");
    sourceQueryBuilder.append(partitionDMLInfo.entrySet().stream()
        .map(e -> "`" + e.getKey() + "`='" + e.getValue() + "'")
        .collect(joining(" AND ")));
  }
  return generateStagingCTASStatement(outputDbAndTable, sourceQueryBuilder.toString(), storageFormat,
      outputTableLocation);
}
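// Illustrative usage of the method above (a minimal sketch: all db/table/path/partition
// values are invented, and `orcFormat` stands for whichever StorageFormat constant maps
// to getHiveName() == "ORC"; none of these names come from this excerpt).
static String stagePartition(StorageFormat orcFormat) {
  Map<String, String> partitionDMLInfo = ImmutableMap.of("datepartition", "2021-01-01-00");
  return generateStagingCTASStatementFromSelectStar(
      new HiveDatasetFinder.DbAndTable("staging_db", "events_staging"),
      new HiveDatasetFinder.DbAndTable("prod_db", "events"),
      partitionDMLInfo, orcFormat, "/tmp/staging/events");
}
// Returned statement (a single line; wrapped here for readability):
// CREATE TEMPORARY TABLE `staging_db`.`events_staging` STORED AS ORC
//   LOCATION '/tmp/staging/events'
//   AS SELECT * FROM `prod_db`.`events` WHERE `datepartition`='2021-01-01-00'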
String sourceDatabase = hiveDataset.getDbAndTable().getDb();
String sourceTable = hiveDataset.getDbAndTable().getTable();
String destinationDatabase = conversionConfig.getDestinationDbName();
/***
 * Parse logical Database and Table name from a given DbAndTable object.
 *
 * Eg.
 * Dataset Name Pattern        : prod_$LOGICAL_DB_linkedin.prod_$LOGICAL_TABLE_linkedin
 * Source DB and Table         : prod_dbName_linkedin.prod_tableName_linkedin
 * Logical DB Token            : $LOGICAL_DB
 * Logical Table Token         : $LOGICAL_TABLE
 * Parsed Logical DB and Table : dbName.tableName
 *
 * @param datasetNamePattern Dataset name pattern.
 * @param dbAndTable Source DB and Table.
 * @param logicalDbToken Logical DB token.
 * @param logicalTableToken Logical Table token.
 * @return Parsed logical DB and Table.
 */
@VisibleForTesting
protected static DbAndTable parseLogicalDbAndTable(String datasetNamePattern, DbAndTable dbAndTable,
    String logicalDbToken, String logicalTableToken) {
  Preconditions.checkArgument(StringUtils.isNotBlank(datasetNamePattern), "Dataset name pattern must not be empty.");

  List<String> datasetNameSplit = Lists.newArrayList(SPLIT_ON_DOT.split(datasetNamePattern));
  Preconditions.checkArgument(datasetNameSplit.size() == 2, "Dataset name pattern must be of the format: "
      + "dbPrefix_$LOGICAL_DB_dbPostfix.tablePrefix_$LOGICAL_TABLE_tablePostfix (prefix / postfix are optional)");

  String dbNamePattern = datasetNameSplit.get(0);
  String tableNamePattern = datasetNameSplit.get(1);

  String logicalDb = extractTokenValueFromEntity(dbAndTable.getDb(), dbNamePattern, logicalDbToken);
  String logicalTable = extractTokenValueFromEntity(dbAndTable.getTable(), tableNamePattern, logicalTableToken);

  return new DbAndTable(logicalDb, logicalTable);
}
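// A minimal usage sketch instantiating the Javadoc example above; the literal token
// strings are taken from that example and are assumed to equal the LOGICAL_DB /
// LOGICAL_TABLE token constants used elsewhere in this section.
DbAndTable logical = parseLogicalDbAndTable(
    "prod_$LOGICAL_DB_linkedin.prod_$LOGICAL_TABLE_linkedin",
    new DbAndTable("prod_dbName_linkedin", "prod_tableName_linkedin"),
    "$LOGICAL_DB", "$LOGICAL_TABLE");
// logical.getDb()    => "dbName"
// logical.getTable() => "tableName"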
private HiveProcessingEntity getConversionEntity(HiveWorkUnit hiveWorkUnit)
    throws IOException, TException, HiveException {
  try (AutoReturnableObject<IMetaStoreClient> client = this.pool.getClient()) {
    HiveDataset dataset = hiveWorkUnit.getHiveDataset();
    HiveDatasetFinder.DbAndTable dbAndTable = dataset.getDbAndTable();

    Table table = new Table(client.get().getTable(dbAndTable.getDb(), dbAndTable.getTable()));
    Partition partition = null;
    if (hiveWorkUnit.getPartitionName().isPresent()) {
      partition = new Partition(table, client.get()
          .getPartition(dbAndTable.getDb(), dbAndTable.getTable(), hiveWorkUnit.getPartitionName().get()));
    }
    return new HiveProcessingEntity(dataset, table, Optional.fromNullable(partition));
  }
}
private HiveDataset getHiveDataset(String tableString, FileSystem fs, State state) throws IOException {
  try {
    HiveMetastoreClientPool pool = HiveMetastoreClientPool.get(state.getProperties(),
        Optional.fromNullable(state.getProp(HIVE_METASTORE_URI_KEY)));

    List<String> tokens = Splitter.on(".").splitToList(tableString);
    DbAndTable sourceDbAndTable = new DbAndTable(tokens.get(0), tokens.get(1));

    try (AutoReturnableObject<IMetaStoreClient> client = pool.getClient()) {
      Table sourceTable = new Table(client.get().getTable(sourceDbAndTable.getDb(), sourceDbAndTable.getTable()));
      return new HiveDataset(fs, pool, sourceTable, ConfigUtils.propertiesToConfig(state.getProperties()));
    }
  } catch (TException exc) {
    throw new RuntimeException(exc);
  }
}
/**
 * Generates a CTAS statement to dump the results of a query into a new table.
 * @param outputDbAndTable output db and table where contents should be written.
 * @param sourceQuery query to materialize.
 * @param storageFormat format of output table.
 * @param outputTableLocation location where files of output table should be written.
 */
public static String generateStagingCTASStatement(HiveDatasetFinder.DbAndTable outputDbAndTable, String sourceQuery,
    StorageFormat storageFormat, String outputTableLocation) {
  Preconditions.checkArgument(
      !Strings.isNullOrEmpty(outputDbAndTable.getDb()) && !Strings.isNullOrEmpty(outputDbAndTable.getTable()),
      "Invalid output db and table " + outputDbAndTable);
  return String.format("CREATE TEMPORARY TABLE `%s`.`%s` STORED AS %s LOCATION '%s' AS %s",
      outputDbAndTable.getDb(), outputDbAndTable.getTable(), storageFormat.getHiveName(), outputTableLocation,
      sourceQuery);
}
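// A direct-call sketch with an arbitrary query (names and paths are invented; `textFormat`
// stands for any StorageFormat whose getHiveName() returns "TEXTFILE"):
static String stageQuery(StorageFormat textFormat) {
  return generateStagingCTASStatement(
      new HiveDatasetFinder.DbAndTable("staging_db", "users_snapshot"),
      "SELECT `id`, `name` FROM `prod_db`.`users`",
      textFormat, "/tmp/staging/users_snapshot");
}
// => CREATE TEMPORARY TABLE `staging_db`.`users_snapshot` STORED AS TEXTFILE
//    LOCATION '/tmp/staging/users_snapshot' AS SELECT `id`, `name` FROM `prod_db`.`users`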
public HiveBaseExtractor(WorkUnitState state) throws IOException {
  // getPropAsBoolean already returns a boolean; the redundant Boolean.valueOf wrapper is dropped.
  if (state.getPropAsBoolean(PartitionLevelWatermarker.IS_WATERMARK_WORKUNIT_KEY)) {
    return;
  }
  this.hiveWorkUnit = new HiveWorkUnit(state.getWorkunit());
  this.hiveDataset = hiveWorkUnit.getHiveDataset();
  this.dbName = hiveDataset.getDbAndTable().getDb();
  this.tableName = hiveDataset.getDbAndTable().getTable();
  this.pool = HiveMetastoreClientPool.get(state.getJobState().getProperties(),
      Optional.fromNullable(state.getJobState().getProp(HiveDatasetFinder.HIVE_METASTORE_URI_KEY)));
}
new HiveDatasetFinder.DbAndTable("dbPrefix_myDB_dbPostfix", "tablePrefix_myTable_tablePostfix"), HiveDataset.LOGICAL_DB_TOKEN, HiveDataset.LOGICAL_TABLE_TOKEN); Assert.assertEquals(logicalDbAndTable.getDb(), "myDB", "DB name not parsed correctly"); Assert.assertEquals(logicalDbAndTable.getTable(), "myTable", "Table name not parsed correctly"); new HiveDatasetFinder.DbAndTable("myDB_dbPostfix", "myTable_tablePostfix"), HiveDataset.LOGICAL_DB_TOKEN, HiveDataset.LOGICAL_TABLE_TOKEN); Assert.assertEquals(logicalDbAndTable.getDb(), "myDB", "DB name not parsed correctly"); Assert.assertEquals(logicalDbAndTable.getTable(), "myTable", "Table name not parsed correctly"); new HiveDatasetFinder.DbAndTable("dbPrefix_myDB", "tablePrefix_myTable"), HiveDataset.LOGICAL_DB_TOKEN, HiveDataset.LOGICAL_TABLE_TOKEN); Assert.assertEquals(logicalDbAndTable.getDb(), "myDB", "DB name not parsed correctly"); Assert.assertEquals(logicalDbAndTable.getTable(), "myTable", "Table name not parsed correctly"); new HiveDatasetFinder.DbAndTable("myDB", "myTable"), HiveDataset.LOGICAL_DB_TOKEN, HiveDataset.LOGICAL_TABLE_TOKEN); Assert.assertEquals(logicalDbAndTable.getDb(), "myDB", "DB name not parsed correctly"); Assert.assertEquals(logicalDbAndTable.getTable(), "myTable", "Table name not parsed correctly");
String resolvedValue = StringUtils.replaceEach(rawValue,
    new String[] { DATABASE_TOKEN, TABLE_TOKEN, LOGICAL_DB_TOKEN, LOGICAL_TABLE_TOKEN },
    new String[] { realDbAndTable.getDb(), realDbAndTable.getTable(),
        logicalDbAndTable.getDb(), logicalDbAndTable.getTable() });
resolvedValueList.add(resolvedValue);

String resolvedValue = StringUtils.replaceEach(resolvedConfig.getString(entry.getKey()),
    new String[] { DATABASE_TOKEN, TABLE_TOKEN, LOGICAL_DB_TOKEN, LOGICAL_TABLE_TOKEN },
    new String[] { realDbAndTable.getDb(), realDbAndTable.getTable(),
        logicalDbAndTable.getDb(), logicalDbAndTable.getTable() });
resolvedProperties.setProperty(entry.getKey(), resolvedValue);
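// A minimal sketch of the replaceEach token substitution above; the token literals and
// resolved values here are illustrative assumptions, not taken from the source.
String raw = "/data/$LOGICAL_DB/$LOGICAL_TABLE/hourly";
String resolved = StringUtils.replaceEach(raw,
    new String[] { "$LOGICAL_DB", "$LOGICAL_TABLE" },
    new String[] { "dbName", "tableName" });
// resolved == "/data/dbName/tableName/hourly"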
@Override
protected HiveDataset computeNext() {
  while (this.tables.hasNext()) {
    DbAndTable dbAndTable = this.tables.next();

    try (AutoReturnableObject<IMetaStoreClient> client = HiveDatasetFinder.this.clientPool.getClient()) {
      Table table = client.get().getTable(dbAndTable.getDb(), dbAndTable.getTable());
      Config datasetConfig = getDatasetConfig(table);
      if (ConfigUtils.getBoolean(datasetConfig, HIVE_DATASET_IS_BLACKLISTED_KEY,
          DEFAULT_HIVE_DATASET_IS_BLACKLISTED_KEY)) {
        continue;
      }

      if (HiveDatasetFinder.this.eventSubmitter.isPresent()) {
        SlaEventSubmitter.builder().datasetUrn(dbAndTable.toString())
            .eventSubmitter(HiveDatasetFinder.this.eventSubmitter.get()).eventName(DATASET_FOUND).build().submit();
      }

      return createHiveDataset(table, datasetConfig);
    } catch (IllegalArgumentException e) {
      Throwables.propagate(e);
    } catch (Throwable t) {
      log.error(String.format("Failed to create HiveDataset for table %s.%s",
          dbAndTable.getDb(), dbAndTable.getTable()), t);

      if (HiveDatasetFinder.this.eventSubmitter.isPresent()) {
        SlaEventSubmitter.builder().datasetUrn(dbAndTable.toString())
            .eventSubmitter(HiveDatasetFinder.this.eventSubmitter.get()).eventName(DATASET_ERROR)
            .additionalMetadata(FAILURE_CONTEXT, t.toString()).build().submit();
      }
    }
  }
  return endOfData();
}
};
private HiveMetastoreClientPool getTestPool(List<HiveDatasetFinder.DbAndTable> dbAndTables) throws Exception {
  SetMultimap<String, String> entities = HashMultimap.create();
  for (HiveDatasetFinder.DbAndTable dbAndTable : dbAndTables) {
    entities.put(dbAndTable.getDb(), dbAndTable.getTable());
  }

  HiveMetastoreClientPool pool = Mockito.mock(HiveMetastoreClientPool.class);

  IMetaStoreClient client = Mockito.mock(IMetaStoreClient.class);
  Mockito.when(client.getAllDatabases()).thenReturn(Lists.newArrayList(entities.keySet()));
  for (String db : entities.keySet()) {
    Mockito.doReturn(Lists.newArrayList(entities.get(db))).when(client).getAllTables(db);
  }
  for (HiveDatasetFinder.DbAndTable dbAndTable : dbAndTables) {
    Table table = new Table();
    table.setDbName(dbAndTable.getDb());
    table.setTableName(dbAndTable.getTable());
    StorageDescriptor sd = new StorageDescriptor();
    sd.setLocation("/tmp/test");
    table.setSd(sd);
    Mockito.doReturn(table).when(client).getTable(dbAndTable.getDb(), dbAndTable.getTable());
  }

  @SuppressWarnings("unchecked")
  AutoReturnableObject<IMetaStoreClient> aro = Mockito.mock(AutoReturnableObject.class);
  Mockito.when(aro.get()).thenReturn(client);

  Mockito.when(pool.getHiveRegProps()).thenReturn(null);
  Mockito.when(pool.getClient()).thenReturn(aro);
  return pool;
}
@Test
public void testGetWorkunitsAfterWatermark() throws Exception {
  String dbName = "testdb4";
  String tableName1 = "testtable1";
  String tableSdLoc1 = "/tmp/testtable1";
  String tableName2 = "testtable2";
  String tableSdLoc2 = "/tmp/testtable2";

  this.hiveMetastoreTestUtils.getLocalMetastoreClient().dropDatabase(dbName, false, true, true);
  this.hiveMetastoreTestUtils.createTestAvroTable(dbName, tableName1, tableSdLoc1, Optional.<String> absent());
  this.hiveMetastoreTestUtils.createTestAvroTable(dbName, tableName2, tableSdLoc2, Optional.<String> absent(), true);

  List<WorkUnitState> previousWorkUnitStates = Lists.newArrayList();

  Table table1 = this.hiveMetastoreTestUtils.getLocalMetastoreClient().getTable(dbName, tableName1);
  previousWorkUnitStates.add(ConversionHiveTestUtils.createWus(dbName, tableName1,
      TimeUnit.MILLISECONDS.convert(table1.getCreateTime(), TimeUnit.SECONDS)));

  SourceState testState = new SourceState(getTestState(dbName), previousWorkUnitStates);
  testState.setProp(HiveSource.HIVE_SOURCE_WATERMARKER_FACTORY_CLASS_KEY,
      TableLevelWatermarker.Factory.class.getName());

  List<WorkUnit> workUnits = this.hiveSource.getWorkunits(testState);

  Assert.assertEquals(workUnits.size(), 1);
  WorkUnit wu = workUnits.get(0);
  HiveWorkUnit hwu = new HiveWorkUnit(wu);
  Assert.assertEquals(hwu.getHiveDataset().getDbAndTable().getDb(), dbName);
  Assert.assertEquals(hwu.getHiveDataset().getDbAndTable().getTable(), tableName2);
}
@Test
public void testGetWorkUnitsForPartitions() throws Exception {
  String dbName = "testdb3";
  String tableName = "testtable3";
  String tableSdLoc = "/tmp/testtable3";

  this.hiveMetastoreTestUtils.getLocalMetastoreClient().dropDatabase(dbName, false, true, true);

  SourceState testState = getTestState(dbName);
  Table tbl = this.hiveMetastoreTestUtils.createTestAvroTable(dbName, tableName, tableSdLoc, Optional.of("field"));
  this.hiveMetastoreTestUtils.addTestPartition(tbl, ImmutableList.of("f1"), (int) System.currentTimeMillis());

  List<WorkUnit> workUnits = this.hiveSource.getWorkunits(testState);

  // One workunit for the partition + 1 dummy watermark workunit
  Assert.assertEquals(workUnits.size(), 2);
  WorkUnit wu = workUnits.get(0);
  WorkUnit wu2 = workUnits.get(1);

  HiveWorkUnit hwu;
  if (!wu.contains(PartitionLevelWatermarker.IS_WATERMARK_WORKUNIT_KEY)) {
    hwu = new HiveWorkUnit(wu);
  } else {
    hwu = new HiveWorkUnit(wu2);
  }

  Assert.assertEquals(hwu.getHiveDataset().getDbAndTable().getDb(), dbName);
  Assert.assertEquals(hwu.getHiveDataset().getDbAndTable().getTable(), tableName);
  Assert.assertEquals(hwu.getPartitionName().get(), "field=f1");
}
@Test
public void testGetWorkUnitsForTable() throws Exception {
  String dbName = "testdb2";
  String tableName = "testtable2";
  String tableSdLoc = "/tmp/testtable2";

  this.hiveMetastoreTestUtils.getLocalMetastoreClient().dropDatabase(dbName, false, true, true);

  SourceState testState = getTestState(dbName);
  this.hiveMetastoreTestUtils.createTestAvroTable(dbName, tableName, tableSdLoc, Optional.<String> absent());

  List<WorkUnit> workUnits = hiveSource.getWorkunits(testState);

  // One workunit for the table, no dummy workunits
  Assert.assertEquals(workUnits.size(), 1);
  WorkUnit wu = workUnits.get(0);

  HiveWorkUnit hwu = new HiveWorkUnit(wu);

  Assert.assertEquals(hwu.getHiveDataset().getDbAndTable().getDb(), dbName);
  Assert.assertEquals(hwu.getHiveDataset().getDbAndTable().getTable(), tableName);
  Assert.assertEquals(hwu.getTableSchemaUrl(), new Path("/tmp/dummy"));
}