/**
 * @param state the job {@link State}
 */
public HiveRegistrationPublisher(State state) {
  super(state);
  this.hiveRegister = this.closer.register(HiveRegister.get(state));
  this.hivePolicyExecutor = ExecutorsUtils.loggingDecorator(Executors.newFixedThreadPool(
      new HiveRegProps(state).getNumThreads(),
      ExecutorsUtils.newThreadFactory(Optional.of(log), Optional.of("HivePolicyExecutor-%d"))));
  this.metricContext = Instrumented.getMetricContext(state, HiveRegistrationPublisher.class);
  isPathDedupeEnabled = state.getPropAsBoolean(PATH_DEDUPE_ENABLED, this.DEFAULT_PATH_DEDUPE_ENABLED);
}
protected String getDatabaseOrTableName(Path path, String nameKey, String regexKey, Optional<Pattern> pattern) {
  String name;
  if (this.props.contains(nameKey)) {
    name = this.props.getProp(nameKey);
  } else if (pattern.isPresent()) {
    Matcher matcher = pattern.get().matcher(path.toString());
    if (matcher.matches() && matcher.groupCount() >= 1) {
      name = matcher.group(1);
    } else {
      throw new IllegalStateException("No group match found for regexKey " + regexKey + " with regexp "
          + pattern.get().toString() + " on path " + path);
    }
  } else {
    throw new IllegalStateException("Missing required property " + nameKey + " or " + regexKey);
  }
  return sanitizeAndValidateName(name);
}
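/*
 * Illustration (a minimal, standalone sketch; not part of the class above): the regex branch of
 * getDatabaseOrTableName takes the first capturing group of the configured pattern as the database/table
 * name. The path and pattern below are hypothetical.
 */
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class RegexNameExtractionSketch {
  public static void main(String[] args) {
    // Hypothetical regex: capture the directory segment right under /data/ as the name.
    Pattern tableNamePattern = Pattern.compile(".*/data/([^/]+)/.*");
    String path = "/data/tracking_events/hourly/2024/01/01";

    Matcher matcher = tableNamePattern.matcher(path);
    if (matcher.matches() && matcher.groupCount() >= 1) {
      System.out.println(matcher.group(1)); // prints "tracking_events"
    } else {
      throw new IllegalStateException("No group match found on path " + path);
    }
  }
}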
/**
 * @param props A {@link State} object that includes both properties required by {@link HiveMetaStoreBasedRegister}
 * to do Hive registration, as well as the Hive properties that will be added to the Hive table when creating
 * the table, e.g., orc.compress=SNAPPY.
 *
 * <p>
 * The Hive table properties should be a comma-separated list associated with {@link #HIVE_TABLE_PARTITION_PROPS}
 * in the given {@link State}.
 * </p>
 */
public HiveRegProps(State props) {
  super(props);
  this.tablePartitionProps = createHiveProps(HIVE_TABLE_PARTITION_PROPS);
  if (props.contains(HiveMetaStoreUtils.RUNTIME_PROPS)) {
    this.runtimeTableProps = Optional.of(props.getProp(HiveMetaStoreUtils.RUNTIME_PROPS));
  } else {
    this.runtimeTableProps = Optional.absent();
  }
  this.storageProps = createHiveProps(HIVE_STORAGE_PROPS);
  this.serdeProps = createHiveProps(HIVE_SERDE_PROPS);
}
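/*
 * Illustration (a hedged sketch, not part of the class above): how the comma-separated '[key]=[value]'
 * table/partition properties described in the Javadoc would be supplied through a State object.
 * The property values are hypothetical, and it is assumed here that the HIVE_TABLE_PARTITION_PROPS and
 * HIVE_REGISTER_THREADS constants are accessible on HiveRegProps.
 */
import org.apache.gobblin.configuration.State;

public class HiveRegPropsUsageSketch {
  public static void main(String[] args) {
    State state = new State();
    // Two hypothetical Hive table properties, comma-separated, each in '[key]=[value]' form.
    state.setProp(HiveRegProps.HIVE_TABLE_PARTITION_PROPS, "orc.compress=SNAPPY,orc.create.index=true");
    // Hypothetical thread count for the registration executor.
    state.setProp(HiveRegProps.HIVE_REGISTER_THREADS, 10);

    HiveRegProps props = new HiveRegProps(state);
    System.out.println(props.getNumThreads()); // 10
  }
}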
protected HiveRegister(State state) {
  this.props = new HiveRegProps(state);
  this.hiveDbRootDir = this.props.getDbRootDir();
  this.executor = ExecutorsUtils.loggingDecorator(
      ScalingThreadPoolExecutor.newScalingThreadPool(0, this.props.getNumThreads(), TimeUnit.SECONDS.toMillis(10),
          ExecutorsUtils.newThreadFactory(Optional.of(log), Optional.of(getClass().getSimpleName()))));
}
/**
 * Obtain Hive database names. The returned {@link Iterable} contains the database name returned by
 * {@link #getDatabaseName(Path)} (if present) plus additional database names specified in
 * {@link #ADDITIONAL_HIVE_DATABASE_NAMES}.
 */
protected Iterable<String> getDatabaseNames(Path path) {
  List<String> databaseNames = Lists.newArrayList();

  Optional<String> databaseName;
  if ((databaseName = getDatabaseName(path)).isPresent()) {
    databaseNames.add(databaseName.get());
  }

  if (!Strings.isNullOrEmpty(this.props.getProp(ADDITIONAL_HIVE_DATABASE_NAMES))) {
    for (String additionalDbName : this.props.getPropAsList(ADDITIONAL_HIVE_DATABASE_NAMES)) {
      databaseNames.add(this.dbNamePrefix + additionalDbName + this.dbNameSuffix);
    }
  }

  Preconditions.checkState(!databaseNames.isEmpty(), "Hive database name not specified");
  return databaseNames;
}
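/*
 * Illustration (a hedged sketch, not part of the class above): how the prefix/suffix and the additional
 * database names combine into the returned iterable. All values below are hypothetical.
 */
import com.google.common.base.Optional;
import com.google.common.collect.Lists;
import java.util.List;

public class DatabaseNamesSketch {
  public static void main(String[] args) {
    String dbNamePrefix = "dev_";
    String dbNameSuffix = "_v1";
    // Primary name as it would come back from getDatabaseName(path), already prefixed/suffixed.
    Optional<String> primaryDbName = Optional.of("dev_tracking_v1");
    // Hypothetical additional database names, already split into a list.
    List<String> additionalDbNames = Lists.newArrayList("audit", "backup");

    List<String> databaseNames = Lists.newArrayList();
    if (primaryDbName.isPresent()) {
      databaseNames.add(primaryDbName.get());
    }
    for (String additionalDbName : additionalDbNames) {
      databaseNames.add(dbNamePrefix + additionalDbName + dbNameSuffix);
    }
    System.out.println(databaseNames); // [dev_tracking_v1, dev_audit_v1, dev_backup_v1]
  }
}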
public HiveRegistrationPolicyBase(State props) throws IOException {
  Preconditions.checkNotNull(props);
  this.props = new HiveRegProps(props);
  if (props.contains(HiveRegistrationPolicyBase.HIVE_FS_URI)) {
    this.fs = FileSystem.get(URI.create(props.getProp(HiveRegistrationPolicyBase.HIVE_FS_URI)), new Configuration());
  } else {
    this.fs = FileSystem.get(new Configuration());
  }
  this.sanitizeNameAllowed = props.getPropAsBoolean(HIVE_SANITIZE_INVALID_NAMES, true);
  this.dbNamePattern = props.contains(HIVE_DATABASE_REGEX)
      ? Optional.of(Pattern.compile(props.getProp(HIVE_DATABASE_REGEX))) : Optional.<Pattern>absent();
  this.tableNamePattern = props.contains(HIVE_TABLE_REGEX)
      ? Optional.of(Pattern.compile(props.getProp(HIVE_TABLE_REGEX))) : Optional.<Pattern>absent();
  this.dbNamePrefix = props.getProp(HIVE_DATABASE_NAME_PREFIX, StringUtils.EMPTY);
  this.dbNameSuffix = props.getProp(HIVE_DATABASE_NAME_SUFFIX, StringUtils.EMPTY);
  this.tableNamePrefix = props.getProp(HIVE_TABLE_NAME_PREFIX, StringUtils.EMPTY);
  this.tableNameSuffix = props.getProp(HIVE_TABLE_NAME_SUFFIX, StringUtils.EMPTY);
  this.emptyInputPathFlag = props.getPropAsBoolean(MAPREDUCE_JOB_INPUT_PATH_EMPTY_KEY, false);
  this.metricContext = Instrumented.getMetricContext(props, HiveRegister.class);
}
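/*
 * Illustration (a hedged sketch): building a State that carries the configuration the constructor above
 * reads. The regexes and prefix/suffix values are hypothetical, and it is assumed that the referenced
 * constants are accessible on HiveRegistrationPolicyBase.
 */
import org.apache.gobblin.configuration.State;

public class HivePolicyConfigSketch {
  public static void main(String[] args) {
    State props = new State();
    // Derive the database name from the first path segment under /data and the table name from the second.
    props.setProp(HiveRegistrationPolicyBase.HIVE_DATABASE_REGEX, "/data/([^/]+)/.*");
    props.setProp(HiveRegistrationPolicyBase.HIVE_TABLE_REGEX, "/data/[^/]+/([^/]+)/.*");
    // Decorate the derived names.
    props.setProp(HiveRegistrationPolicyBase.HIVE_DATABASE_NAME_PREFIX, "dev_");
    props.setProp(HiveRegistrationPolicyBase.HIVE_TABLE_NAME_SUFFIX, "_orc");
    // This State would then be passed to the HiveRegistrationPolicyBase constructor shown above.
  }
}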
public HiveMetaStoreBasedRegister(State state, Optional<String> metastoreURI) throws IOException {
  super(state);
  this.optimizedChecks = state.getPropAsBoolean(this.OPTIMIZED_CHECK_ENABLED, true);

  GenericObjectPoolConfig config = new GenericObjectPoolConfig();
  config.setMaxTotal(this.props.getNumThreads());
  config.setMaxIdle(this.props.getNumThreads());
  this.clientPool = HiveMetastoreClientPool.get(this.props.getProperties(), metastoreURI);

  this.metricContext = GobblinMetricsRegistry.getInstance()
      .getMetricContext(state, HiveMetaStoreBasedRegister.class, GobblinMetrics.getCustomTagsFromState(state));
  this.eventSubmitter =
      new EventSubmitter.Builder(this.metricContext, "org.apache.gobblin.hive.HiveMetaStoreBasedRegister").build();
}
      Timer.Context context = this.metricContext.timer(CONFIG_FOR_TOPIC_TIMER).time();
      configForTopic =
          ConfigStoreUtils.getConfigForTopic(this.props.getProperties(), KafkaSource.TOPIC_NAME, this.configClient);
      context.close();
      tableNames.add(this.tableNamePrefix + resolvedTableName + this.tableNameSuffix);
    } else if (!Strings.isNullOrEmpty(this.props.getProp(additionalNamesProp))) {
      for (String additionalTableName : this.props.getPropAsList(additionalNamesProp)) {
        String resolvedTableName = primaryTableName.isPresent()
            ? StringUtils.replace(additionalTableName, PRIMARY_TABLE_TOKEN,
/**
 * Get Hive database root dir from {@link #HIVE_DB_ROOT_DIR}.
 *
 * @return {@link Optional#absent()} if {@link #HIVE_DB_ROOT_DIR} is not specified.
 */
public Optional<String> getDbRootDir() {
  return Optional.fromNullable(getProp(HIVE_DB_ROOT_DIR));
}
/**
 * Create a {@link State} object that contains Hive table properties. These properties are obtained from
 * {@link #HIVE_TABLE_PARTITION_PROPS}, which is a comma-separated list of properties, each in the form
 * '[key]=[value]'.
 */
private State createHiveProps(String propKey) {
  State state = new State();
  if (!contains(propKey)) {
    return state;
  }

  for (String propValue : getPropAsList(propKey)) {
    List<String> tokens = SPLITTER.splitToList(propValue);
    Preconditions.checkState(tokens.size() == 2, propValue + " is not a valid Hive table/partition property");
    state.setProp(tokens.get(0), tokens.get(1));
  }
  return state;
}
/**
 * This method first tries to obtain the table name from {@link #HIVE_TABLE_NAME}.
 * If this property is not specified, it then tries to obtain the table name using
 * the first group of {@link #HIVE_TABLE_REGEX}.
 */
protected Optional<String> getTableName(Path path) {
  if (!this.props.contains(HIVE_TABLE_NAME) && !this.props.contains(HIVE_TABLE_REGEX)) {
    return Optional.<String>absent();
  }
  return Optional.<String>of(this.tableNamePrefix
      + getDatabaseOrTableName(path, HIVE_TABLE_NAME, HIVE_TABLE_REGEX, this.tableNamePattern)
      + this.tableNameSuffix);
}
/**
 * Get number of threads from {@link #HIVE_REGISTER_THREADS}, with a default value of
 * {@link #DEFAULT_HIVE_REGISTER_THREADS}.
 */
public int getNumThreads() {
  return getPropAsInt(HIVE_REGISTER_THREADS, DEFAULT_HIVE_REGISTER_THREADS);
}
@Test
public void testAddTableDeregisterSteps() throws Exception {
  HiveDataset dataset = Mockito.mock(HiveDataset.class);
  Mockito.when(dataset.getProperties()).thenReturn(new Properties());

  HiveCopyEntityHelper helper = Mockito.mock(HiveCopyEntityHelper.class);
  Mockito.when(helper.getDeleteMethod()).thenReturn(DeregisterFileDeleteMethod.NO_DELETE);
  Mockito.when(helper.getTargetURI()).thenReturn(Optional.of("/targetURI"));
  Mockito.when(helper.getHiveRegProps()).thenReturn(new HiveRegProps(new State()));
  Mockito.when(helper.getDataset()).thenReturn(dataset);
  Mockito.when(helper.addTableDeregisterSteps(Mockito.any(List.class), Mockito.any(String.class), Mockito.anyInt(),
      Mockito.any(org.apache.hadoop.hive.ql.metadata.Table.class))).thenCallRealMethod();

  org.apache.hadoop.hive.ql.metadata.Table meta_table = Mockito.mock(org.apache.hadoop.hive.ql.metadata.Table.class);
  org.apache.hadoop.hive.metastore.api.Table api_table =
      Mockito.mock(org.apache.hadoop.hive.metastore.api.Table.class);
  Mockito.when(api_table.getDbName()).thenReturn("TestDB");
  Mockito.when(api_table.getTableName()).thenReturn("TestTable");
  Mockito.when(meta_table.getTTable()).thenReturn(api_table);

  List<CopyEntity> copyEntities = new ArrayList<CopyEntity>();
  String fileSet = "testFileSet";
  int initialPriority = 0;
  int priority = helper.addTableDeregisterSteps(copyEntities, fileSet, initialPriority, meta_table);

  Assert.assertTrue(priority == 1);
  Assert.assertTrue(copyEntities.size() == 1);
  Assert.assertTrue(copyEntities.get(0) instanceof PostPublishStep);
  PostPublishStep p = (PostPublishStep) copyEntities.get(0);
  Assert.assertTrue(
      p.getStep().toString().contains("Deregister table TestDB.TestTable on Hive metastore /targetURI"));
}
/**
 * Get the names of the registered Hive table's upstream data attributes.
 * E.g., when data consumed from Kafka is registered into a Hive table, the table is expected
 * to have Hive metadata indicating the Kafka topic.
 *
 * HIVE_UPSTREAM_DATA_ATTR_NAMES is a comma-separated string, each item representing an upstream data attribute.
 * E.g., hive.upstream.data.attr.names=topic.name,some.else
 *
 * @return {@link Optional#absent()} if {@link #HIVE_UPSTREAM_DATA_ATTR_NAMES} is not specified.
 */
public Optional<String> getUpstreamDataAttrName() {
  return Optional.fromNullable(getProp(HIVE_UPSTREAM_DATA_ATTR_NAMES));
}
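/*
 * Illustration (a hedged sketch, not part of the class above): splitting the comma-separated value
 * returned by getUpstreamDataAttrName() into individual attribute names. The value mirrors the
 * Javadoc example above.
 */
import com.google.common.base.Optional;
import com.google.common.base.Splitter;
import java.util.List;

public class UpstreamDataAttrSketch {
  public static void main(String[] args) {
    Optional<String> upstreamDataAttrNames = Optional.of("topic.name,some.else");
    if (upstreamDataAttrNames.isPresent()) {
      List<String> attrNames = Splitter.on(',').trimResults().splitToList(upstreamDataAttrNames.get());
      System.out.println(attrNames); // [topic.name, some.else]
    }
  }
}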
/**
 * This method first tries to obtain the database name from {@link #HIVE_DATABASE_NAME}.
 * If this property is not specified, it then tries to obtain the database name using
 * the first group of {@link #HIVE_DATABASE_REGEX}.
 */
protected Optional<String> getDatabaseName(Path path) {
  if (!this.props.contains(HIVE_DATABASE_NAME) && !this.props.contains(HIVE_DATABASE_REGEX)) {
    return Optional.<String>absent();
  }
  return Optional.<String>of(this.dbNamePrefix
      + getDatabaseOrTableName(path, HIVE_DATABASE_NAME, HIVE_DATABASE_REGEX, this.dbNamePattern)
      + this.dbNameSuffix);
}