/**
 * Filters a set of topic names, keeping only those that pass the
 * blacklist/whitelist check performed by {@link #survived(String, List, List)}.
 *
 * @param topics the candidate topic names
 * @param blacklist regex patterns a topic must NOT match
 * @param whitelist regex patterns a topic must match (no-op when empty)
 * @return the subset of {@code topics} that survived filtering
 */
public static Set<String> filter(Set<String> topics, List<Pattern> blacklist, List<Pattern> whitelist) {
  Set<String> survivors = Sets.newHashSet();
  for (String candidate : topics) {
    if (!survived(candidate, blacklist, whitelist)) {
      continue;
    }
    survivors.add(candidate);
  }
  return survivors;
}
/**
 * Applies the entity blacklist/whitelist configured in {@code state} to the
 * given source entities, keyed by each entity's source-entity name.
 *
 * @param state the source state holding the blacklist/whitelist properties
 * @param unfilteredEntities all candidate entities
 * @return the entities whose names survived the blacklist/whitelist filtering
 */
static Set<SourceEntity> getFilteredSourceEntitiesHelper(SourceState state, Iterable<SourceEntity> unfilteredEntities) {
  List<Pattern> blacklist = DatasetFilterUtils.getPatternList(state, ENTITY_BLACKLIST);
  List<Pattern> whitelist = DatasetFilterUtils.getPatternList(state, ENTITY_WHITELIST);
  Set<SourceEntity> survivors = new HashSet<>();
  for (SourceEntity candidate : unfilteredEntities) {
    if (DatasetFilterUtils.survived(candidate.getSourceEntityName(), blacklist, whitelist)) {
      survivors.add(candidate);
    }
  }
  return survivors;
}
/**
 * Sets the regex patterns that identify logs which should NOT be copied.
 *
 * @param regexList a comma-separated list of regex patterns
 * @return this {@link LogCopier.Builder} instance
 */
public Builder useExcludingRegexPatterns(String regexList) {
  Preconditions.checkNotNull(regexList);
  List<String> regexStrings = COMMA_SPLITTER.splitToList(regexList);
  this.excludingRegexPatterns = DatasetFilterUtils.getPatternsFromStrings(regexStrings);
  return this;
}
/**
 * Looks up the recompaction threshold for a dataset by matching its name
 * against the regex groups configured as map keys.
 *
 * <p>Each map key is a separator-delimited list of regexes sharing one
 * threshold; the first entry whose regexes match {@code datasetName} wins.
 * Map iteration order decides ties, so overlapping regex groups may resolve
 * nondeterministically unless an ordered map is supplied.</p>
 *
 * @param datasetName the dataset name to match
 * @param datasetRegexAndRecompactThreshold regex-group -> threshold mapping
 * @return the matching threshold, or the global default when nothing matches
 */
public static double getRatioThresholdByDatasetName(String datasetName,
    Map<String, Double> datasetRegexAndRecompactThreshold) {
  // The splitter is loop-invariant; build it once instead of on every entry.
  Splitter regexSplitter =
      Splitter.on(DATASETS_WITH_SAME_RECOMPACT_THRESHOLDS_SEPARATOR).trimResults().omitEmptyStrings();
  for (Map.Entry<String, Double> topicRegexEntry : datasetRegexAndRecompactThreshold.entrySet()) {
    List<Pattern> patterns =
        DatasetFilterUtils.getPatternsFromStrings(regexSplitter.splitToList(topicRegexEntry.getKey()));
    if (DatasetFilterUtils.stringInPatterns(datasetName, patterns)) {
      return topicRegexEntry.getValue();
    }
  }
  return MRCompactor.DEFAULT_COMPACTION_LATEDATA_THRESHOLD_FOR_RECOMPACT_PER_DATASET;
}
/**
 * Returns the regex patterns configured under {@code propKey}, using an empty
 * string as the default property value (i.e. no patterns when unset).
 *
 * @param state the state holding the property
 * @param propKey the property key whose value lists the patterns
 * @return the compiled pattern list (possibly empty)
 */
public static List<Pattern> getPatternList(State state, String propKey) {
  return getPatternList(state, propKey, "");
}
/**
 * Resolves the compaction priority of a dataset from the configured
 * high/normal priority pattern lists; anything unmatched is low priority.
 *
 * @param datasetName the dataset name to classify
 * @return {@code HIGH_PRIORITY}, {@code NORMAL_PRIORITY}, or {@code LOW_PRIORITY}
 */
protected double getDatasetPriority(String datasetName) {
  if (DatasetFilterUtils.stringInPatterns(datasetName, this.highPriority)) {
    return HIGH_PRIORITY;
  }
  if (DatasetFilterUtils.stringInPatterns(datasetName, this.normalPriority)) {
    return NORMAL_PRIORITY;
  }
  return LOW_PRIORITY;
}
/**
 * Computes the Kafka topics this source should pull.
 *
 * <p>Topics are first filtered through the configured blacklist/whitelist.
 * If a config-store URI is present, the result is additionally intersected
 * (case-insensitively by topic name) with the topics listed in the config
 * store.</p>
 */
private List<KafkaTopic> getFilteredTopics(SourceState state) {
  List<Pattern> blacklist = DatasetFilterUtils.getPatternList(state, TOPIC_BLACKLIST);
  List<Pattern> whitelist = DatasetFilterUtils.getPatternList(state, TOPIC_WHITELIST);
  List<KafkaTopic> topics = this.kafkaConsumerClient.get().getFilteredTopics(blacklist, whitelist);
  Optional<String> configStoreUri = ConfigStoreUtils.getConfigStoreUri(state.getProperties());
  if (!configStoreUri.isPresent()) {
    return topics;
  }
  // Intersect with the config-store topic list, matching names case-insensitively.
  List<KafkaTopic> storeTopics = ConfigStoreUtils
      .getTopicsFromConfigStore(state.getProperties(), configStoreUri.get(), this.kafkaConsumerClient.get());
  return topics.stream()
      .filter(candidate -> storeTopics.stream()
          .anyMatch(storeTopic -> storeTopic.getName().equalsIgnoreCase(candidate.getName())))
      .collect(toList());
}
/**
 * Resolves the recompaction threshold for {@code datasetName} by testing it
 * against each configured regex group (a separator-delimited list of regexes
 * used as the map key). Returns the first matching entry's threshold, or the
 * global default when no group matches.
 *
 * @param datasetName the dataset name to match
 * @param datasetRegexAndRecompactThreshold regex-group -> threshold mapping
 * @return the matching threshold, or the default recompaction threshold
 */
public static double getRatioThresholdByDatasetName(String datasetName,
    Map<String, Double> datasetRegexAndRecompactThreshold) {
  // Hoist the immutable Splitter out of the loop; it does not depend on the entry.
  Splitter regexSplitter =
      Splitter.on(DATASETS_WITH_SAME_RECOMPACT_THRESHOLDS_SEPARATOR).trimResults().omitEmptyStrings();
  for (Map.Entry<String, Double> topicRegexEntry : datasetRegexAndRecompactThreshold.entrySet()) {
    List<Pattern> patterns =
        DatasetFilterUtils.getPatternsFromStrings(regexSplitter.splitToList(topicRegexEntry.getKey()));
    if (DatasetFilterUtils.stringInPatterns(datasetName, patterns)) {
      return topicRegexEntry.getValue();
    }
  }
  return MRCompactor.DEFAULT_COMPACTION_LATEDATA_THRESHOLD_FOR_RECOMPACT_PER_DATASET;
}
/**
 * Decides whether a topic passes filtering.
 *
 * <p>A topic survives when (1) it does not match the blacklist, and (2) the
 * whitelist is empty or the topic matches it. Both lists hold regex patterns
 * (NOT glob patterns).</p>
 *
 * @param topic the topic name to test
 * @param blacklist regex patterns that reject a topic
 * @param whitelist regex patterns that accept a topic (no-op when empty)
 * @return {@code true} iff the topic survives filtering
 */
public static boolean survived(String topic, List<Pattern> blacklist, List<Pattern> whitelist) {
  if (stringInPatterns(topic, blacklist)) {
    // Blacklisted topics never survive, regardless of the whitelist.
    return false;
  }
  if (whitelist.isEmpty()) {
    // No whitelist configured: everything not blacklisted survives.
    return true;
  }
  return stringInPatterns(topic, whitelist);
}
/**
 * Filters a list of topic names, keeping (in order) only those that pass the
 * blacklist/whitelist check performed by {@link #survived(String, List, List)}.
 *
 * @param topics the candidate topic names
 * @param blacklist regex patterns a topic must NOT match
 * @param whitelist regex patterns a topic must match (no-op when empty)
 * @return the topics that survived filtering, preserving input order
 */
public static List<String> filter(List<String> topics, List<Pattern> blacklist, List<Pattern> whitelist) {
  List<String> survivors = Lists.newArrayList();
  for (String candidate : topics) {
    if (!survived(candidate, blacklist, whitelist)) {
      continue;
    }
    survivors.add(candidate);
  }
  return survivors;
}
/**
 * Filters source entities through the entity blacklist/whitelist read from
 * {@code state}; an entity is kept when its source-entity name survives.
 *
 * @param state the source state holding the blacklist/whitelist properties
 * @param unfilteredEntities all candidate entities
 * @return the surviving entities
 */
static Set<SourceEntity> getFilteredSourceEntitiesHelper(SourceState state, Iterable<SourceEntity> unfilteredEntities) {
  List<Pattern> blacklist = DatasetFilterUtils.getPatternList(state, ENTITY_BLACKLIST);
  List<Pattern> whitelist = DatasetFilterUtils.getPatternList(state, ENTITY_WHITELIST);
  Set<SourceEntity> kept = new HashSet<>();
  for (SourceEntity sourceEntity : unfilteredEntities) {
    if (!DatasetFilterUtils.survived(sourceEntity.getSourceEntityName(), blacklist, whitelist)) {
      continue;
    }
    kept.add(sourceEntity);
  }
  return kept;
}
/**
 * Submits completeness verification for every dataset still in the UNVERIFIED state.
 *
 * <p>Datasets excluded by the completeness-verification blacklist/whitelist are
 * marked VERIFIED immediately. The remainder are batched and each batch of
 * {@code numDatasetsVerifiedTogether} datasets is submitted asynchronously to the
 * verifier, with a callback registered per batch; a trailing partial batch is
 * submitted at the end.</p>
 */
private void verifyDataCompleteness() {
  List<Pattern> blacklist = DatasetFilterUtils.getPatternList(this.state, COMPACTION_COMPLETENESS_VERIFICATION_BLACKLIST);
  List<Pattern> whitelist = DatasetFilterUtils.getPatternList(this.state, COMPACTION_COMPLETENESS_VERIFICATION_WHITELIST);
  int numDatasetsVerifiedTogether = getNumDatasetsVerifiedTogether();
  List<Dataset> datasetsToBeVerified = Lists.newArrayList();
  for (Dataset dataset : this.datasets) {
    // Only datasets awaiting verification are considered; others are left untouched.
    if (dataset.state() != UNVERIFIED) {
      continue;
    }
    if (shouldVerifyCompletenessForDataset(dataset, blacklist, whitelist)) {
      datasetsToBeVerified.add(dataset);
      // Flush a full batch: submit it for async verification and start a new batch.
      if (datasetsToBeVerified.size() >= numDatasetsVerifiedTogether) {
        ListenableFuture<Results> future = this.verifier.get().verify(datasetsToBeVerified);
        addCallback(datasetsToBeVerified, future);
        datasetsToBeVerified = Lists.newArrayList();
      }
    } else {
      // Datasets filtered out of verification are treated as already verified.
      dataset.setState(VERIFIED);
    }
  }
  // Submit any trailing partial batch.
  if (!datasetsToBeVerified.isEmpty()) {
    ListenableFuture<Results> future = this.verifier.get().verify(datasetsToBeVerified);
    addCallback(datasetsToBeVerified, future);
  }
}
/**
 * Sets the regex patterns that identify logs which SHOULD be copied.
 *
 * @param regexList a comma-separated list of regex patterns
 * @return this {@link LogCopier.Builder} instance
 */
public Builder useIncludingRegexPatterns(String regexList) {
  Preconditions.checkNotNull(regexList);
  List<String> regexStrings = COMMA_SPLITTER.splitToList(regexList);
  this.includingRegexPatterns = DatasetFilterUtils.getPatternsFromStrings(regexStrings);
  return this;
}
/**
 * Checks whether a log line should be copied.
 *
 * <p>A line is copied iff both hold:</p>
 * <ul>
 *   <li>it matches none of the excluding regex patterns (trivially true when
 *       no excluding patterns are configured);</li>
 *   <li>it matches at least one including regex pattern (trivially true when
 *       no including patterns are configured).</li>
 * </ul>
 */
private boolean shouldCopyLine(String line) {
  // Excluding patterns win: a line matching any of them is never copied.
  if (LogCopier.this.excludingRegexPatterns.isPresent()
      && DatasetFilterUtils.stringInPatterns(line, LogCopier.this.excludingRegexPatterns.get())) {
    return false;
  }
  // With no including patterns configured, every remaining line qualifies.
  return !LogCopier.this.includingRegexPatterns.isPresent()
      || DatasetFilterUtils.stringInPatterns(line, LogCopier.this.includingRegexPatterns.get());
}
}
@Override
public boolean apply(@Nonnull KafkaTopic kafkaTopic) {
  // A topic is kept only if its name survives the blacklist/whitelist regex filtering.
  return DatasetFilterUtils.survived(kafkaTopic.getName(), blacklist, whitelist);
}
}));
/**
 * Constructs a finder for compaction datasets.
 *
 * <p>Resolves the input/destination/tmp directories and the compaction
 * blacklist/whitelist from {@code state}, then augments those pattern lists
 * with topics from the config store (so the lists must be initialized first),
 * and finally loads the priority and recompaction pattern sets.</p>
 *
 * @param state job state carrying all compaction configuration
 * @param fs the file system the datasets live on
 */
@VisibleForTesting
DatasetsFinder(State state, FileSystem fs) {
  this.state = state;
  this.conf = HadoopUtils.getConfFromState(state);
  this.fs = fs;
  this.inputDir = getInputDir();
  this.destDir = getDestDir();
  this.tmpOutputDir = getTmpOutputDir();
  this.blacklist = DatasetFilterUtils.getPatternList(state, MRCompactor.COMPACTION_BLACKLIST);
  this.whitelist = DatasetFilterUtils.getPatternList(state, MRCompactor.COMPACTION_WHITELIST);
  // Must run after blacklist/whitelist are set: it appends config-store patterns to them.
  setTopicsFromConfigStore(state);
  this.highPriority = getHighPriorityPatterns();
  this.normalPriority = getNormalPriorityPatterns();
  this.recompactDatasets = getRecompactDatasets();
}
/**
 * Merges the blacklist/whitelist topics fetched from the config store into
 * the already-initialized {@code blacklist}/{@code whitelist} pattern lists.
 *
 * @param state state providing the properties used to reach the config store
 */
private void setTopicsFromConfigStore(State state) {
  Set<String> storeBlacklist = new HashSet<>();
  Set<String> storeWhitelist = new HashSet<>();
  // The config store fills the two sets in place.
  ConfigStoreUtils.setTopicsFromConfigStore(state.getProperties(), storeBlacklist, storeWhitelist,
      MRCompactor.COMPACTION_BLACKLIST, MRCompactor.COMPACTION_WHITELIST);
  this.blacklist.addAll(DatasetFilterUtils.getPatternsFromStrings(new ArrayList<>(storeBlacklist)));
  this.whitelist.addAll(DatasetFilterUtils.getPatternsFromStrings(new ArrayList<>(storeWhitelist)));
}
/**
 * Classifies a dataset's compaction priority using the configured high- and
 * normal-priority regex lists; unmatched datasets get low priority.
 *
 * @param datasetName the dataset name to classify
 * @return {@code HIGH_PRIORITY}, {@code NORMAL_PRIORITY}, or {@code LOW_PRIORITY}
 */
protected double getDatasetPriority(String datasetName) {
  if (DatasetFilterUtils.stringInPatterns(datasetName, this.highPriority)) {
    return HIGH_PRIORITY;
  }
  if (DatasetFilterUtils.stringInPatterns(datasetName, this.normalPriority)) {
    return NORMAL_PRIORITY;
  }
  return LOW_PRIORITY;
}
/**
 * Fetches topic metadata from one broker and drops topics rejected by the
 * blacklist/whitelist.
 *
 * @param broker the broker to query
 * @param blacklist regex patterns a topic must NOT match
 * @param whitelist regex patterns a topic must match (no-op when empty)
 * @return the filtered metadata, or {@code null} when the broker fetch itself
 *         returned {@code null}
 */
private List<TopicMetadata> fetchTopicMetadataFromBroker(String broker, List<Pattern> blacklist,
    List<Pattern> whitelist) {
  List<TopicMetadata> allMetadata = fetchTopicMetadataFromBroker(broker);
  if (allMetadata == null) {
    // Propagate the fetch failure unchanged to the caller.
    return null;
  }
  List<TopicMetadata> survivors = Lists.newArrayList();
  for (TopicMetadata metadata : allMetadata) {
    if (!DatasetFilterUtils.survived(metadata.topic(), blacklist, whitelist)) {
      continue;
    }
    survivors.add(metadata);
  }
  return survivors;
}
// Clear the blacklist so the subsequent fetch is constrained only by the whitelist.
state.setProp(KafkaSource.TOPIC_BLACKLIST, StringUtils.EMPTY);
// Fetch every topic passing the (now empty) blacklist and the configured whitelist.
List<KafkaTopic> allTopics = kafkaConsumerClient.getFilteredTopics(
    DatasetFilterUtils.getPatternList(state, KafkaSource.TOPIC_BLACKLIST),
    DatasetFilterUtils.getPatternList(state, KafkaSource.TOPIC_WHITELIST));
Optional<Config> runtimeConfig = ConfigClientUtils.getOptionalRuntimeConfig(properties);