/***
 * Parse logical Database and Table name from a given DbAndTable object.
 *
 * Eg.
 * Dataset Name Pattern        : prod_$LOGICAL_DB_linkedin.prod_$LOGICAL_TABLE_linkedin
 * Source DB and Table         : prod_dbName_linkedin.prod_tableName_linkedin
 * Logical DB Token            : $LOGICAL_DB
 * Logical Table Token         : $LOGICAL_TABLE
 * Parsed Logical DB and Table : dbName.tableName
 *
 * @param datasetNamePattern Dataset name pattern.
 * @param dbAndTable Source DB and Table.
 * @param logicalDbToken Logical DB token.
 * @param logicalTableToken Logical Table token.
 * @return Parsed logical DB and Table.
 * @throws IllegalArgumentException if the pattern is blank or not a two-part db.table pattern.
 */
@VisibleForTesting
protected static DbAndTable parseLogicalDbAndTable(String datasetNamePattern, DbAndTable dbAndTable,
    String logicalDbToken, String logicalTableToken) {
  Preconditions.checkArgument(StringUtils.isNotBlank(datasetNamePattern), "Dataset name pattern must not be empty.");

  // The pattern must contain exactly one dot, separating the DB pattern from the table pattern.
  List<String> datasetNameSplit = Lists.newArrayList(SPLIT_ON_DOT.split(datasetNamePattern));
  // Fixed message grammar: "must of the format" -> "must be of the format".
  Preconditions.checkArgument(datasetNameSplit.size() == 2, "Dataset name pattern must be of the format: "
      + "dbPrefix_$LOGICAL_DB_dbPostfix.tablePrefix_$LOGICAL_TABLE_tablePostfix (prefix / postfix are optional)");

  String dbNamePattern = datasetNameSplit.get(0);
  String tableNamePattern = datasetNameSplit.get(1);

  // Strip the pattern's prefix / postfix from the physical names to recover the logical names.
  String logicalDb = extractTokenValueFromEntity(dbAndTable.getDb(), dbNamePattern, logicalDbToken);
  String logicalTable = extractTokenValueFromEntity(dbAndTable.getTable(), tableNamePattern, logicalTableToken);

  return new DbAndTable(logicalDb, logicalTable);
}
@Test
public void testResolveConfig() throws IOException {
  // Physical (real) and logical identities for the same dataset.
  HiveDatasetFinder.DbAndTable sourceDbAndTable = new HiveDatasetFinder.DbAndTable("realDb", "realTable");
  HiveDatasetFinder.DbAndTable derivedDbAndTable = new HiveDatasetFinder.DbAndTable("logicalDb", "logicalTable");

  Config resolved = HiveDataset.resolveConfig(config, sourceDbAndTable, derivedDbAndTable);

  // Real-name tokens resolve against the physical DB / table.
  Assert.assertEquals(resolved.getString(DUMMY_CONFIG_KEY_WITH_DB_TOKEN), "resPrefix_realDb_resPostfix",
      "Real DB not resolved correctly");
  Assert.assertEquals(resolved.getString(DUMMY_CONFIG_KEY_WITH_TABLE_TOKEN), "resPrefix_realTable_resPostfix",
      "Real Table not resolved correctly");

  // Replacement keys resolve against the logical DB / table.
  Assert.assertEquals(resolved.getString(HiveDatasetVersionCleaner.REPLACEMENT_HIVE_DB_NAME_KEY),
      "resPrefix_logicalDb_resPostfix", "Logical DB not resolved correctly");
  Assert.assertEquals(resolved.getString(HiveDatasetVersionCleaner.REPLACEMENT_HIVE_TABLE_NAME_KEY),
      "resPrefix_logicalTable_resPostfix", "Logical Table not resolved correctly");
}
}
@Test
public void testException() throws Exception {
  // Three candidate tables; the middle one is configured to blow up during discovery.
  List<HiveDatasetFinder.DbAndTable> sourceTables = Lists.newArrayList(
      new HiveDatasetFinder.DbAndTable("db1", "table1"),
      new HiveDatasetFinder.DbAndTable("db1", TestHiveDatasetFinder.THROW_EXCEPTION),
      new HiveDatasetFinder.DbAndTable("db1", "table3"));
  HiveMetastoreClientPool clientPool = getTestPool(sourceTables);

  Properties props = new Properties();
  props.put(HiveDatasetFinder.HIVE_DATASET_PREFIX + "." + WhitelistBlacklist.WHITELIST, "");

  HiveDatasetFinder datasetFinder =
      new TestHiveDatasetFinder(FileSystem.getLocal(new Configuration()), props, clientPool);
  List<HiveDataset> found = Lists.newArrayList(datasetFinder.getDatasetsIterator());

  // Only the two non-throwing tables should yield datasets.
  Assert.assertEquals(found.size(), 2);
}
/**
 * Get all tables in db with given table pattern.
 *
 * @return all DB/table pairs accepted by the whitelist/blacklist.
 * @throws IOException if the metastore cannot be reached or listing fails.
 */
public Collection<DbAndTable> getTables() throws IOException {
  List<DbAndTable> accepted = Lists.newArrayList();

  try (AutoReturnableObject<IMetaStoreClient> client = this.clientPool.getClient()) {
    for (String db : client.get().getAllDatabases()) {
      // Skip whole databases rejected by the whitelist/blacklist before listing their tables.
      if (!this.whitelistBlacklist.acceptDb(db)) {
        continue;
      }
      for (String tableName : client.get().getAllTables(db)) {
        if (this.whitelistBlacklist.acceptTable(db, tableName)) {
          accepted.add(new DbAndTable(db, tableName));
        }
      }
    }
  } catch (Exception exc) {
    // Normalize any metastore failure to IOException, per this method's contract.
    throw new IOException(exc);
  }
  return accepted;
}
/***
 * Parse logical Database and Table name from a given DbAndTable object.
 *
 * Eg.
 * Dataset Name Pattern        : prod_$LOGICAL_DB_linkedin.prod_$LOGICAL_TABLE_linkedin
 * Source DB and Table         : prod_dbName_linkedin.prod_tableName_linkedin
 * Logical DB Token            : $LOGICAL_DB
 * Logical Table Token         : $LOGICAL_TABLE
 * Parsed Logical DB and Table : dbName.tableName
 *
 * @param datasetNamePattern Dataset name pattern.
 * @param dbAndTable Source DB and Table.
 * @param logicalDbToken Logical DB token.
 * @param logicalTableToken Logical Table token.
 * @return Parsed logical DB and Table.
 * @throws IllegalArgumentException if the pattern is blank or not a two-part db.table pattern.
 */
@VisibleForTesting
protected static DbAndTable parseLogicalDbAndTable(String datasetNamePattern, DbAndTable dbAndTable,
    String logicalDbToken, String logicalTableToken) {
  Preconditions.checkArgument(StringUtils.isNotBlank(datasetNamePattern), "Dataset name pattern must not be empty.");

  // The pattern must contain exactly one dot, separating the DB pattern from the table pattern.
  List<String> datasetNameSplit = Lists.newArrayList(SPLIT_ON_DOT.split(datasetNamePattern));
  // Fixed message grammar: "must of the format" -> "must be of the format".
  Preconditions.checkArgument(datasetNameSplit.size() == 2, "Dataset name pattern must be of the format: "
      + "dbPrefix_$LOGICAL_DB_dbPostfix.tablePrefix_$LOGICAL_TABLE_tablePostfix (prefix / postfix are optional)");

  String dbNamePattern = datasetNameSplit.get(0);
  String tableNamePattern = datasetNameSplit.get(1);

  // Strip the pattern's prefix / postfix from the physical names to recover the logical names.
  String logicalDb = extractTokenValueFromEntity(dbAndTable.getDb(), dbNamePattern, logicalDbToken);
  String logicalTable = extractTokenValueFromEntity(dbAndTable.getTable(), tableNamePattern, logicalTableToken);

  return new DbAndTable(logicalDb, logicalTable);
}
/**
 * Builds a HiveDataset for the given Hive table, deriving physical and logical DB/table
 * identities and resolving the dataset config against both.
 * Initialization order matters: {@code datasetNamePattern} and {@code dbAndTable} must be set
 * before {@code logicalDbAndTable}, which in turn is needed by {@code resolveConfig}.
 */
public HiveDataset(FileSystem fs, HiveMetastoreClientPool clientPool, Table table, Properties properties,
    Config datasetConfig) {
  this.fs = fs;
  this.clientPool = clientPool;
  this.table = table;
  this.properties = properties;

  // A glob data location does not identify a single root directory, so leave the root path absent.
  this.tableRootPath = PathUtils.isGlob(this.table.getDataLocation()) ? Optional.<Path> absent() :
      Optional.fromNullable(this.table.getDataLocation());

  // "db.table" identifier used for this dataset.
  this.tableIdentifier = this.table.getDbName() + "." + this.table.getTableName();

  this.datasetNamePattern = Optional.fromNullable(ConfigUtils.getString(datasetConfig, DATASET_NAME_PATTERN_KEY, null));
  this.dbAndTable = new DbAndTable(table.getDbName(), table.getTableName());
  if (this.datasetNamePattern.isPresent()) {
    // A name pattern is configured: extract the logical DB/table from the physical names.
    this.logicalDbAndTable =
        parseLogicalDbAndTable(this.datasetNamePattern.get(), this.dbAndTable, LOGICAL_DB_TOKEN, LOGICAL_TABLE_TOKEN);
  } else {
    // No pattern configured: logical identity falls back to the physical one.
    this.logicalDbAndTable = this.dbAndTable;
  }
  // Substitute both physical and logical DB/table tokens into the dataset config.
  this.datasetConfig = resolveConfig(datasetConfig, dbAndTable, logicalDbAndTable);

  this.metricContext = Instrumented.getMetricContext(new State(properties), HiveDataset.class,
      Lists.<Tag<?>> newArrayList(new Tag<>(DATABASE, table.getDbName()), new Tag<>(TABLE, table.getTableName())));
}
/**
 * Builds a HiveDataset for the table named by {@code tableString} ("db.table"),
 * looking the table up in the metastore identified by {@code state}.
 *
 * @param tableString fully qualified table name of the form "db.table".
 * @param fs filesystem the dataset lives on.
 * @param state carries metastore URI and other properties.
 * @return the constructed HiveDataset.
 * @throws IOException if the metastore client pool cannot be obtained.
 * @throws IllegalArgumentException if {@code tableString} is not of the form "db.table".
 * @throws RuntimeException wrapping any Thrift failure from the metastore.
 */
private HiveDataset getHiveDataset(String tableString, FileSystem fs, State state) throws IOException {
  try {
    HiveMetastoreClientPool pool = HiveMetastoreClientPool.get(state.getProperties(),
        Optional.fromNullable(state.getProp(HIVE_METASTORE_URI_KEY)));

    List<String> tokens = Splitter.on(".").splitToList(tableString);
    // Fail fast with a clear message instead of an IndexOutOfBoundsException (no dot) or
    // silently ignoring trailing components (more than one dot).
    if (tokens.size() != 2) {
      throw new IllegalArgumentException("Expected table of the form db.table, but got: " + tableString);
    }
    DbAndTable sourceDbAndTable = new DbAndTable(tokens.get(0), tokens.get(1));

    try (AutoReturnableObject<IMetaStoreClient> client = pool.getClient()) {
      Table sourceTable = new Table(client.get().getTable(sourceDbAndTable.getDb(), sourceDbAndTable.getTable()));
      return new HiveDataset(fs, pool, sourceTable, ConfigUtils.propertiesToConfig(state.getProperties()));
    }
  } catch (TException exc) {
    throw new RuntimeException(exc);
  }
}
/**
 * Generates the staging CTAS (select-star) statement for this conversion.
 */
@Override
public List<String> generateQueries() {
  // The staging table's parent directory must exist before the CTAS writes into it.
  ensureParentOfStagingPathExists();

  HiveDatasetFinder.DbAndTable stagingTable =
      new HiveDatasetFinder.DbAndTable(this.outputDatabaseName, this.stagingTableName);
  HiveDatasetFinder.DbAndTable sourceTable =
      new HiveDatasetFinder.DbAndTable(this.inputDbName, this.inputTableName);

  return Lists.newArrayList(HiveConverterUtils.generateStagingCTASStatementFromSelectStar(stagingTable, sourceTable,
      this.partitionsDMLInfo, this.storageFormat, this.stagingDataLocation));
}
}
/**
 * Generates the staging CTAS statement for this conversion from the configured source query.
 */
@Override
public List<String> generateQueries() {
  // The staging table's parent directory must exist before the CTAS writes into it.
  ensureParentOfStagingPathExists();

  HiveDatasetFinder.DbAndTable stagingTable =
      new HiveDatasetFinder.DbAndTable(this.outputDatabaseName, this.stagingTableName);

  return Lists.newArrayList(HiveConverterUtils.generateStagingCTASStatement(stagingTable, this.sourceQuery,
      this.storageFormat, this.stagingDataLocation));
}
new HiveDatasetFinder.DbAndTable("dbPrefix_myDB_dbPostfix", "tablePrefix_myTable_tablePostfix"), HiveDataset.LOGICAL_DB_TOKEN, HiveDataset.LOGICAL_TABLE_TOKEN); Assert.assertEquals(logicalDbAndTable.getDb(), "myDB", "DB name not parsed correctly"); new HiveDatasetFinder.DbAndTable("myDB_dbPostfix", "myTable_tablePostfix"), HiveDataset.LOGICAL_DB_TOKEN, HiveDataset.LOGICAL_TABLE_TOKEN); Assert.assertEquals(logicalDbAndTable.getDb(), "myDB", "DB name not parsed correctly"); new HiveDatasetFinder.DbAndTable("dbPrefix_myDB", "tablePrefix_myTable"), HiveDataset.LOGICAL_DB_TOKEN, HiveDataset.LOGICAL_TABLE_TOKEN); Assert.assertEquals(logicalDbAndTable.getDb(), "myDB", "DB name not parsed correctly"); new HiveDatasetFinder.DbAndTable("myDB", "myTable"), HiveDataset.LOGICAL_DB_TOKEN, HiveDataset.LOGICAL_TABLE_TOKEN); Assert.assertEquals(logicalDbAndTable.getDb(), "myDB", "DB name not parsed correctly"); try { logicalDbAndTable = HiveDataset.parseLogicalDbAndTable(datasetNamePattern, new HiveDatasetFinder.DbAndTable("dbPrefix_myDB_dbPostfix", "tablePrefix_myTable_tablePostfix"), HiveDataset.LOGICAL_DB_TOKEN, HiveDataset.LOGICAL_TABLE_TOKEN); Assert.fail("Dataset name pattern is missing, code should have thrown exception"); try { logicalDbAndTable = HiveDataset.parseLogicalDbAndTable(datasetNamePattern, new HiveDatasetFinder.DbAndTable("dbPrefix_myDB_dbPostfix", "tablePrefix_myTable_tablePostfix"), HiveDataset.LOGICAL_DB_TOKEN, HiveDataset.LOGICAL_TABLE_TOKEN); Assert.fail("Dataset name pattern is missing, code should have thrown exception");
/**
 * Get all tables in db with given table pattern.
 *
 * @return all DB/table pairs accepted by the whitelist/blacklist.
 * @throws IOException if the metastore cannot be reached or listing fails.
 */
public Collection<DbAndTable> getTables() throws IOException {
  List<DbAndTable> accepted = Lists.newArrayList();

  try (AutoReturnableObject<IMetaStoreClient> client = this.clientPool.getClient()) {
    for (String db : client.get().getAllDatabases()) {
      // Skip whole databases rejected by the whitelist/blacklist before listing their tables.
      if (!this.whitelistBlacklist.acceptDb(db)) {
        continue;
      }
      for (String tableName : client.get().getAllTables(db)) {
        if (this.whitelistBlacklist.acceptTable(db, tableName)) {
          accepted.add(new DbAndTable(db, tableName));
        }
      }
    }
  } catch (Exception exc) {
    // Normalize any metastore failure to IOException, per this method's contract.
    throw new IOException(exc);
  }
  return accepted;
}
@Test public void testDatasetConfig() throws Exception { List<HiveDatasetFinder.DbAndTable> dbAndTables = Lists.newArrayList(); dbAndTables.add(new HiveDatasetFinder.DbAndTable("db1", "table1")); HiveMetastoreClientPool pool = getTestPool(dbAndTables); Properties properties = new Properties(); properties.put(HiveDatasetFinder.HIVE_DATASET_PREFIX + "." + WhitelistBlacklist.WHITELIST, ""); properties.put("hive.dataset.test.conf1", "conf1-val1"); properties.put("hive.dataset.test.conf2", "conf2-val2"); HiveDatasetFinder finder = new TestHiveDatasetFinder(FileSystem.getLocal(new Configuration()), properties, pool); List<HiveDataset> datasets = Lists.newArrayList(finder.getDatasetsIterator()); Assert.assertEquals(datasets.size(), 1); HiveDataset hiveDataset = datasets.get(0); Assert.assertEquals(hiveDataset.getDatasetConfig().getString("hive.dataset.test.conf1"), "conf1-val1"); Assert.assertEquals(hiveDataset.getDatasetConfig().getString("hive.dataset.test.conf2"), "conf2-val2"); // Test scoped configs with prefix properties.put(HiveDatasetFinder.HIVE_DATASET_CONFIG_PREFIX_KEY, "hive.dataset.test"); finder = new TestHiveDatasetFinder(FileSystem.getLocal(new Configuration()), properties, pool); datasets = Lists.newArrayList(finder.getDatasetsIterator()); Assert.assertEquals(datasets.size(), 1); hiveDataset = datasets.get(0); Assert.assertEquals(hiveDataset.getDatasetConfig().getString("conf1"), "conf1-val1"); Assert.assertEquals(hiveDataset.getDatasetConfig().getString("conf2"), "conf2-val2"); }
/**
 * Builds a HiveDataset for the given Hive table, deriving physical and logical DB/table
 * identities and resolving the dataset config against both.
 * Initialization order matters: {@code datasetNamePattern} and {@code dbAndTable} must be set
 * before {@code logicalDbAndTable}, which in turn is needed by {@code resolveConfig}.
 */
public HiveDataset(FileSystem fs, HiveMetastoreClientPool clientPool, Table table, Properties properties,
    Config datasetConfig) {
  this.fs = fs;
  this.clientPool = clientPool;
  this.table = table;
  this.properties = properties;

  // A glob data location does not identify a single root directory, so leave the root path absent.
  this.tableRootPath = PathUtils.isGlob(this.table.getDataLocation()) ? Optional.<Path> absent() :
      Optional.fromNullable(this.table.getDataLocation());

  // "db.table" identifier used for this dataset.
  this.tableIdentifier = this.table.getDbName() + "." + this.table.getTableName();

  this.datasetNamePattern = Optional.fromNullable(ConfigUtils.getString(datasetConfig, DATASET_NAME_PATTERN_KEY, null));
  this.dbAndTable = new DbAndTable(table.getDbName(), table.getTableName());
  if (this.datasetNamePattern.isPresent()) {
    // A name pattern is configured: extract the logical DB/table from the physical names.
    this.logicalDbAndTable =
        parseLogicalDbAndTable(this.datasetNamePattern.get(), this.dbAndTable, LOGICAL_DB_TOKEN, LOGICAL_TABLE_TOKEN);
  } else {
    // No pattern configured: logical identity falls back to the physical one.
    this.logicalDbAndTable = this.dbAndTable;
  }
  // Substitute both physical and logical DB/table tokens into the dataset config.
  this.datasetConfig = resolveConfig(datasetConfig, dbAndTable, logicalDbAndTable);

  this.metricContext = Instrumented.getMetricContext(new State(properties), HiveDataset.class,
      Lists.<Tag<?>> newArrayList(new Tag<>(DATABASE, table.getDbName()), new Tag<>(TABLE, table.getTableName())));
}
@Test
public void testTableList() throws Exception {
  List<HiveDatasetFinder.DbAndTable> sourceTables = Lists.newArrayList(
      new HiveDatasetFinder.DbAndTable("db1", "table1"),
      new HiveDatasetFinder.DbAndTable("db1", "table2"),
      new HiveDatasetFinder.DbAndTable("db1", "table3"),
      new HiveDatasetFinder.DbAndTable("db2", "table1"));
  HiveMetastoreClientPool clientPool = getTestPool(sourceTables);

  // Restrict discovery to db1 and to tables matching "table1|table2".
  Properties props = new Properties();
  props.put(HiveDatasetFinder.DB_KEY, "db1");
  props.put(HiveDatasetFinder.TABLE_PATTERN_KEY, "table1|table2");

  HiveDatasetFinder datasetFinder =
      new TestHiveDatasetFinder(FileSystem.getLocal(new Configuration()), props, clientPool);
  List<HiveDataset> found = Lists.newArrayList(datasetFinder.getDatasetsIterator());

  Assert.assertEquals(found.size(), 2);
  Assert.assertEquals(found.get(0).getTable().getDbName(), "db1");
  Assert.assertEquals(found.get(1).getTable().getDbName(), "db1");
  Assert.assertEquals(
      Sets.newHashSet(found.get(0).getTable().getTableName(), found.get(1).getTable().getTableName()),
      Sets.newHashSet("table1", "table2"));
}
@Test
public void testBlacklist() throws Exception {
  List<HiveDatasetFinder.DbAndTable> sourceTables = Lists.newArrayList(
      new HiveDatasetFinder.DbAndTable("db1", "table1"),
      new HiveDatasetFinder.DbAndTable("db1", "table2"),
      new HiveDatasetFinder.DbAndTable("db2", "table1"));
  HiveMetastoreClientPool clientPool = getTestPool(sourceTables);

  // Empty whitelist plus a blacklist entry: db2 should be filtered out.
  Properties props = new Properties();
  props.put(HiveDatasetFinder.HIVE_DATASET_PREFIX + "." + WhitelistBlacklist.WHITELIST, "");
  props.put(HiveDatasetFinder.HIVE_DATASET_PREFIX + "." + WhitelistBlacklist.BLACKLIST, "db2");

  HiveDatasetFinder datasetFinder =
      new TestHiveDatasetFinder(FileSystem.getLocal(new Configuration()), props, clientPool);
  List<HiveDataset> found = Lists.newArrayList(datasetFinder.getDatasetsIterator());

  Assert.assertEquals(found.size(), 2);
  Assert.assertEquals(found.get(0).getTable().getDbName(), "db1");
  Assert.assertEquals(found.get(1).getTable().getDbName(), "db1");
  Assert.assertEquals(
      Sets.newHashSet(found.get(0).getTable().getTableName(), found.get(1).getTable().getTableName()),
      Sets.newHashSet("table1", "table2"));
}
@Test
public void testWhitelist() throws Exception {
  List<HiveDatasetFinder.DbAndTable> sourceTables = Lists.newArrayList(
      new HiveDatasetFinder.DbAndTable("db1", "table1"),
      new HiveDatasetFinder.DbAndTable("db1", "table2"),
      new HiveDatasetFinder.DbAndTable("db2", "table1"));
  HiveMetastoreClientPool clientPool = getTestPool(sourceTables);

  // Whitelist only db1: the db2 table should not be discovered.
  Properties props = new Properties();
  props.put(HiveDatasetFinder.HIVE_DATASET_PREFIX + "." + WhitelistBlacklist.WHITELIST, "db1");

  HiveDatasetFinder datasetFinder =
      new TestHiveDatasetFinder(FileSystem.getLocal(new Configuration()), props, clientPool);
  List<HiveDataset> found = Lists.newArrayList(datasetFinder.getDatasetsIterator());

  Assert.assertEquals(found.size(), 2);
  Assert.assertEquals(found.get(0).getTable().getDbName(), "db1");
  Assert.assertEquals(found.get(1).getTable().getDbName(), "db1");
  Assert.assertEquals(
      Sets.newHashSet(found.get(0).getTable().getTableName(), found.get(1).getTable().getTableName()),
      Sets.newHashSet("table1", "table2"));
}
/**
 * Builds a HiveDataset for the table named by {@code tableString} ("db.table"),
 * looking the table up in the metastore identified by {@code state}.
 *
 * @param tableString fully qualified table name of the form "db.table".
 * @param fs filesystem the dataset lives on.
 * @param state carries metastore URI and other properties.
 * @return the constructed HiveDataset.
 * @throws IOException if the metastore client pool cannot be obtained.
 * @throws IllegalArgumentException if {@code tableString} is not of the form "db.table".
 * @throws RuntimeException wrapping any Thrift failure from the metastore.
 */
private HiveDataset getHiveDataset(String tableString, FileSystem fs, State state) throws IOException {
  try {
    HiveMetastoreClientPool pool = HiveMetastoreClientPool.get(state.getProperties(),
        Optional.fromNullable(state.getProp(HIVE_METASTORE_URI_KEY)));

    List<String> tokens = Splitter.on(".").splitToList(tableString);
    // Fail fast with a clear message instead of an IndexOutOfBoundsException (no dot) or
    // silently ignoring trailing components (more than one dot).
    if (tokens.size() != 2) {
      throw new IllegalArgumentException("Expected table of the form db.table, but got: " + tableString);
    }
    DbAndTable sourceDbAndTable = new DbAndTable(tokens.get(0), tokens.get(1));

    try (AutoReturnableObject<IMetaStoreClient> client = pool.getClient()) {
      Table sourceTable = new Table(client.get().getTable(sourceDbAndTable.getDb(), sourceDbAndTable.getTable()));
      return new HiveDataset(fs, pool, sourceTable, ConfigUtils.propertiesToConfig(state.getProperties()));
    }
  } catch (TException exc) {
    throw new RuntimeException(exc);
  }
}
@Test
public void testDatasetFinder() throws Exception {
  List<HiveDatasetFinder.DbAndTable> sourceTables = Lists.newArrayList(
      new HiveDatasetFinder.DbAndTable("db1", "table1"),
      new HiveDatasetFinder.DbAndTable("db1", "table2"),
      new HiveDatasetFinder.DbAndTable("db1", "table3"));
  HiveMetastoreClientPool clientPool = getTestPool(sourceTables);

  Properties props = new Properties();
  props.put(HiveDatasetFinder.HIVE_DATASET_PREFIX + "." + WhitelistBlacklist.WHITELIST, "");

  HiveDatasetFinder datasetFinder =
      new TestHiveDatasetFinder(FileSystem.getLocal(new Configuration()), props, clientPool);
  List<HiveDataset> found = Lists.newArrayList(datasetFinder.getDatasetsIterator());

  // With an empty whitelist all three tables are discovered.
  Assert.assertEquals(found.size(), 3);
}
/**
 * Generates the staging CTAS statement for this conversion from the configured source query.
 */
@Override
public List<String> generateQueries() {
  // The staging table's parent directory must exist before the CTAS writes into it.
  ensureParentOfStagingPathExists();

  HiveDatasetFinder.DbAndTable stagingTable =
      new HiveDatasetFinder.DbAndTable(this.outputDatabaseName, this.stagingTableName);

  return Lists.newArrayList(HiveConverterUtils.generateStagingCTASStatement(stagingTable, this.sourceQuery,
      this.storageFormat, this.stagingDataLocation));
}
/**
 * Generates the staging CTAS (select-star) statement for this conversion.
 */
@Override
public List<String> generateQueries() {
  // The staging table's parent directory must exist before the CTAS writes into it.
  ensureParentOfStagingPathExists();

  HiveDatasetFinder.DbAndTable stagingTable =
      new HiveDatasetFinder.DbAndTable(this.outputDatabaseName, this.stagingTableName);
  HiveDatasetFinder.DbAndTable sourceTable =
      new HiveDatasetFinder.DbAndTable(this.inputDbName, this.inputTableName);

  return Lists.newArrayList(HiveConverterUtils.generateStagingCTASStatementFromSelectStar(stagingTable, sourceTable,
      this.partitionsDMLInfo, this.storageFormat, this.stagingDataLocation));
}
}