/**
 * Filters a set of topic names, keeping only those that pass the
 * blacklist/whitelist check performed by {@link #survived(String, List, List)}.
 *
 * @param topics the candidate topic names
 * @param blacklist regex patterns a topic must NOT match
 * @param whitelist regex patterns a topic must match (no-op when empty)
 * @return the subset of {@code topics} that survived filtering
 */
public static Set<String> filter(Set<String> topics, List<Pattern> blacklist, List<Pattern> whitelist) {
  Set<String> survivors = Sets.newHashSet();
  for (String candidate : topics) {
    if (!survived(candidate, blacklist, whitelist)) {
      continue;
    }
    survivors.add(candidate);
  }
  return survivors;
}
/**
 * Applies the entity blacklist/whitelist configured in {@code state} to the
 * given source entities, keyed by each entity's source-entity name.
 *
 * @param state the source state holding the blacklist/whitelist properties
 * @param unfilteredEntities all candidate entities
 * @return the entities whose names survived the blacklist/whitelist filtering
 */
static Set<SourceEntity> getFilteredSourceEntitiesHelper(SourceState state, Iterable<SourceEntity> unfilteredEntities) {
  List<Pattern> blacklist = DatasetFilterUtils.getPatternList(state, ENTITY_BLACKLIST);
  List<Pattern> whitelist = DatasetFilterUtils.getPatternList(state, ENTITY_WHITELIST);
  Set<SourceEntity> survivors = new HashSet<>();
  for (SourceEntity candidate : unfilteredEntities) {
    if (DatasetFilterUtils.survived(candidate.getSourceEntityName(), blacklist, whitelist)) {
      survivors.add(candidate);
    }
  }
  return survivors;
}
/**
 * Sets the regex patterns that identify logs which should NOT be copied.
 *
 * @param regexList a comma-separated list of regex patterns
 * @return this {@link LogCopier.Builder} instance
 */
public Builder useExcludingRegexPatterns(String regexList) {
  Preconditions.checkNotNull(regexList);
  List<String> regexStrings = COMMA_SPLITTER.splitToList(regexList);
  this.excludingRegexPatterns = DatasetFilterUtils.getPatternsFromStrings(regexStrings);
  return this;
}
/**
 * Looks up the recompaction threshold for a dataset by matching its name
 * against the regex groups configured as map keys.
 *
 * <p>Each map key is a separator-delimited list of regexes sharing one
 * threshold; the first entry whose regexes match {@code datasetName} wins.
 * Map iteration order decides ties, so overlapping regex groups may resolve
 * nondeterministically unless an ordered map is supplied.</p>
 *
 * @param datasetName the dataset name to match
 * @param datasetRegexAndRecompactThreshold regex-group -> threshold mapping
 * @return the matching threshold, or the global default when nothing matches
 */
public static double getRatioThresholdByDatasetName(String datasetName,
    Map<String, Double> datasetRegexAndRecompactThreshold) {
  // The splitter is loop-invariant; build it once instead of on every entry.
  Splitter regexSplitter =
      Splitter.on(DATASETS_WITH_SAME_RECOMPACT_THRESHOLDS_SEPARATOR).trimResults().omitEmptyStrings();
  for (Map.Entry<String, Double> topicRegexEntry : datasetRegexAndRecompactThreshold.entrySet()) {
    List<Pattern> patterns =
        DatasetFilterUtils.getPatternsFromStrings(regexSplitter.splitToList(topicRegexEntry.getKey()));
    if (DatasetFilterUtils.stringInPatterns(datasetName, patterns)) {
      return topicRegexEntry.getValue();
    }
  }
  return MRCompactor.DEFAULT_COMPACTION_LATEDATA_THRESHOLD_FOR_RECOMPACT_PER_DATASET;
}
/**
 * Returns the regex patterns configured under {@code propKey}, using an empty
 * string as the default property value (i.e. no patterns when unset).
 *
 * @param state the state holding the property
 * @param propKey the property key whose value lists the patterns
 * @return the compiled pattern list (possibly empty)
 */
public static List<Pattern> getPatternList(State state, String propKey) {
  return getPatternList(state, propKey, "");
}
/**
 * Resolves the compaction priority of a dataset from the configured
 * high/normal priority pattern lists; anything unmatched is low priority.
 *
 * @param datasetName the dataset name to classify
 * @return {@code HIGH_PRIORITY}, {@code NORMAL_PRIORITY}, or {@code LOW_PRIORITY}
 */
protected double getDatasetPriority(String datasetName) {
  if (DatasetFilterUtils.stringInPatterns(datasetName, this.highPriority)) {
    return HIGH_PRIORITY;
  }
  if (DatasetFilterUtils.stringInPatterns(datasetName, this.normalPriority)) {
    return NORMAL_PRIORITY;
  }
  return LOW_PRIORITY;
}
/**
 * Computes the Kafka topics this source should pull.
 *
 * <p>Topics are first filtered through the configured blacklist/whitelist.
 * If a config-store URI is present, the result is additionally intersected
 * (case-insensitively by topic name) with the topics listed in the config
 * store.</p>
 */
private List<KafkaTopic> getFilteredTopics(SourceState state) {
  List<Pattern> blacklist = DatasetFilterUtils.getPatternList(state, TOPIC_BLACKLIST);
  List<Pattern> whitelist = DatasetFilterUtils.getPatternList(state, TOPIC_WHITELIST);
  List<KafkaTopic> topics = this.kafkaConsumerClient.get().getFilteredTopics(blacklist, whitelist);
  Optional<String> configStoreUri = ConfigStoreUtils.getConfigStoreUri(state.getProperties());
  if (!configStoreUri.isPresent()) {
    return topics;
  }
  // Intersect with the config-store topic list, matching names case-insensitively.
  List<KafkaTopic> storeTopics = ConfigStoreUtils
      .getTopicsFromConfigStore(state.getProperties(), configStoreUri.get(), this.kafkaConsumerClient.get());
  return topics.stream()
      .filter(candidate -> storeTopics.stream()
          .anyMatch(storeTopic -> storeTopic.getName().equalsIgnoreCase(candidate.getName())))
      .collect(toList());
}
/**
 * Resolves the recompaction threshold for {@code datasetName} by testing it
 * against each configured regex group (a separator-delimited list of regexes
 * used as the map key). Returns the first matching entry's threshold, or the
 * global default when no group matches.
 *
 * @param datasetName the dataset name to match
 * @param datasetRegexAndRecompactThreshold regex-group -> threshold mapping
 * @return the matching threshold, or the default recompaction threshold
 */
public static double getRatioThresholdByDatasetName(String datasetName,
    Map<String, Double> datasetRegexAndRecompactThreshold) {
  // Hoist the immutable Splitter out of the loop; it does not depend on the entry.
  Splitter regexSplitter =
      Splitter.on(DATASETS_WITH_SAME_RECOMPACT_THRESHOLDS_SEPARATOR).trimResults().omitEmptyStrings();
  for (Map.Entry<String, Double> topicRegexEntry : datasetRegexAndRecompactThreshold.entrySet()) {
    List<Pattern> patterns =
        DatasetFilterUtils.getPatternsFromStrings(regexSplitter.splitToList(topicRegexEntry.getKey()));
    if (DatasetFilterUtils.stringInPatterns(datasetName, patterns)) {
      return topicRegexEntry.getValue();
    }
  }
  return MRCompactor.DEFAULT_COMPACTION_LATEDATA_THRESHOLD_FOR_RECOMPACT_PER_DATASET;
}
/**
 * Decides whether a topic passes filtering.
 *
 * <p>A topic survives when (1) it does not match the blacklist, and (2) the
 * whitelist is empty or the topic matches it. Both lists hold regex patterns
 * (NOT glob patterns).</p>
 *
 * @param topic the topic name to test
 * @param blacklist regex patterns that reject a topic
 * @param whitelist regex patterns that accept a topic (no-op when empty)
 * @return {@code true} iff the topic survives filtering
 */
public static boolean survived(String topic, List<Pattern> blacklist, List<Pattern> whitelist) {
  if (stringInPatterns(topic, blacklist)) {
    // Blacklisted topics never survive, regardless of the whitelist.
    return false;
  }
  if (whitelist.isEmpty()) {
    // No whitelist configured: everything not blacklisted survives.
    return true;
  }
  return stringInPatterns(topic, whitelist);
}
/**
 * Filters a list of topic names, keeping (in order) only those that pass the
 * blacklist/whitelist check performed by {@link #survived(String, List, List)}.
 *
 * @param topics the candidate topic names
 * @param blacklist regex patterns a topic must NOT match
 * @param whitelist regex patterns a topic must match (no-op when empty)
 * @return the topics that survived filtering, preserving input order
 */
public static List<String> filter(List<String> topics, List<Pattern> blacklist, List<Pattern> whitelist) {
  List<String> survivors = Lists.newArrayList();
  for (String candidate : topics) {
    if (!survived(candidate, blacklist, whitelist)) {
      continue;
    }
    survivors.add(candidate);
  }
  return survivors;
}
/**
 * Filters source entities through the entity blacklist/whitelist read from
 * {@code state}; an entity is kept when its source-entity name survives.
 *
 * @param state the source state holding the blacklist/whitelist properties
 * @param unfilteredEntities all candidate entities
 * @return the surviving entities
 */
static Set<SourceEntity> getFilteredSourceEntitiesHelper(SourceState state, Iterable<SourceEntity> unfilteredEntities) {
  List<Pattern> blacklist = DatasetFilterUtils.getPatternList(state, ENTITY_BLACKLIST);
  List<Pattern> whitelist = DatasetFilterUtils.getPatternList(state, ENTITY_WHITELIST);
  Set<SourceEntity> kept = new HashSet<>();
  for (SourceEntity sourceEntity : unfilteredEntities) {
    if (!DatasetFilterUtils.survived(sourceEntity.getSourceEntityName(), blacklist, whitelist)) {
      continue;
    }
    kept.add(sourceEntity);
  }
  return kept;
}
/**
 * Submits completeness verification for every dataset still in the UNVERIFIED state.
 *
 * <p>Datasets excluded by the completeness-verification blacklist/whitelist are
 * marked VERIFIED immediately. The remainder are batched and each batch of
 * {@code numDatasetsVerifiedTogether} datasets is submitted asynchronously to the
 * verifier, with a callback registered per batch; a trailing partial batch is
 * submitted at the end.</p>
 */
private void verifyDataCompleteness() {
  List<Pattern> blacklist = DatasetFilterUtils.getPatternList(this.state, COMPACTION_COMPLETENESS_VERIFICATION_BLACKLIST);
  List<Pattern> whitelist = DatasetFilterUtils.getPatternList(this.state, COMPACTION_COMPLETENESS_VERIFICATION_WHITELIST);
  int numDatasetsVerifiedTogether = getNumDatasetsVerifiedTogether();
  List<Dataset> datasetsToBeVerified = Lists.newArrayList();
  for (Dataset dataset : this.datasets) {
    // Only datasets awaiting verification are considered; others are left untouched.
    if (dataset.state() != UNVERIFIED) {
      continue;
    }
    if (shouldVerifyCompletenessForDataset(dataset, blacklist, whitelist)) {
      datasetsToBeVerified.add(dataset);
      // Flush a full batch: submit it for async verification and start a new batch.
      if (datasetsToBeVerified.size() >= numDatasetsVerifiedTogether) {
        ListenableFuture<Results> future = this.verifier.get().verify(datasetsToBeVerified);
        addCallback(datasetsToBeVerified, future);
        datasetsToBeVerified = Lists.newArrayList();
      }
    } else {
      // Datasets filtered out of verification are treated as already verified.
      dataset.setState(VERIFIED);
    }
  }
  // Submit any trailing partial batch.
  if (!datasetsToBeVerified.isEmpty()) {
    ListenableFuture<Results> future = this.verifier.get().verify(datasetsToBeVerified);
    addCallback(datasetsToBeVerified, future);
  }
}
/**
 * Sets the regex patterns that identify logs which SHOULD be copied.
 *
 * @param regexList a comma-separated list of regex patterns
 * @return this {@link LogCopier.Builder} instance
 */
public Builder useIncludingRegexPatterns(String regexList) {
  Preconditions.checkNotNull(regexList);
  List<String> regexStrings = COMMA_SPLITTER.splitToList(regexList);
  this.includingRegexPatterns = DatasetFilterUtils.getPatternsFromStrings(regexStrings);
  return this;
}
/**
 * Checks whether a log line should be copied.
 *
 * <p>A line is copied iff both hold:</p>
 * <ul>
 *   <li>it matches none of the excluding regex patterns (trivially true when
 *       no excluding patterns are configured);</li>
 *   <li>it matches at least one including regex pattern (trivially true when
 *       no including patterns are configured).</li>
 * </ul>
 */
private boolean shouldCopyLine(String line) {
  // Excluding patterns win: a line matching any of them is never copied.
  if (LogCopier.this.excludingRegexPatterns.isPresent()
      && DatasetFilterUtils.stringInPatterns(line, LogCopier.this.excludingRegexPatterns.get())) {
    return false;
  }
  // With no including patterns configured, every remaining line qualifies.
  return !LogCopier.this.includingRegexPatterns.isPresent()
      || DatasetFilterUtils.stringInPatterns(line, LogCopier.this.includingRegexPatterns.get());
}
}
@Override
public boolean apply(@Nonnull KafkaTopic kafkaTopic) {
  // A topic is kept only if its name survives the blacklist/whitelist regex filtering.
  return DatasetFilterUtils.survived(kafkaTopic.getName(), blacklist, whitelist);
}
}));
/**
 * Constructs a finder for compaction datasets.
 *
 * <p>Resolves the input/destination/tmp directories and the compaction
 * blacklist/whitelist from {@code state}, then augments those pattern lists
 * with topics from the config store (so the lists must be initialized first),
 * and finally loads the priority and recompaction pattern sets.</p>
 *
 * @param state job state carrying all compaction configuration
 * @param fs the file system the datasets live on
 */
@VisibleForTesting
DatasetsFinder(State state, FileSystem fs) {
  this.state = state;
  this.conf = HadoopUtils.getConfFromState(state);
  this.fs = fs;
  this.inputDir = getInputDir();
  this.destDir = getDestDir();
  this.tmpOutputDir = getTmpOutputDir();
  this.blacklist = DatasetFilterUtils.getPatternList(state, MRCompactor.COMPACTION_BLACKLIST);
  this.whitelist = DatasetFilterUtils.getPatternList(state, MRCompactor.COMPACTION_WHITELIST);
  // Must run after blacklist/whitelist are set: it appends config-store patterns to them.
  setTopicsFromConfigStore(state);
  this.highPriority = getHighPriorityPatterns();
  this.normalPriority = getNormalPriorityPatterns();
  this.recompactDatasets = getRecompactDatasets();
}
/**
 * Merges the blacklist/whitelist topics fetched from the config store into
 * the already-initialized {@code blacklist}/{@code whitelist} pattern lists.
 *
 * @param state state providing the properties used to reach the config store
 */
private void setTopicsFromConfigStore(State state) {
  Set<String> storeBlacklist = new HashSet<>();
  Set<String> storeWhitelist = new HashSet<>();
  // The config store fills the two sets in place.
  ConfigStoreUtils.setTopicsFromConfigStore(state.getProperties(), storeBlacklist, storeWhitelist,
      MRCompactor.COMPACTION_BLACKLIST, MRCompactor.COMPACTION_WHITELIST);
  this.blacklist.addAll(DatasetFilterUtils.getPatternsFromStrings(new ArrayList<>(storeBlacklist)));
  this.whitelist.addAll(DatasetFilterUtils.getPatternsFromStrings(new ArrayList<>(storeWhitelist)));
}
/**
 * Classifies a dataset's compaction priority using the configured high- and
 * normal-priority regex lists; unmatched datasets get low priority.
 *
 * @param datasetName the dataset name to classify
 * @return {@code HIGH_PRIORITY}, {@code NORMAL_PRIORITY}, or {@code LOW_PRIORITY}
 */
protected double getDatasetPriority(String datasetName) {
  if (DatasetFilterUtils.stringInPatterns(datasetName, this.highPriority)) {
    return HIGH_PRIORITY;
  }
  if (DatasetFilterUtils.stringInPatterns(datasetName, this.normalPriority)) {
    return NORMAL_PRIORITY;
  }
  return LOW_PRIORITY;
}
/**
 * Fetches topic metadata from one broker and drops topics rejected by the
 * blacklist/whitelist.
 *
 * @param broker the broker to query
 * @param blacklist regex patterns a topic must NOT match
 * @param whitelist regex patterns a topic must match (no-op when empty)
 * @return the filtered metadata, or {@code null} when the broker fetch itself
 *         returned {@code null}
 */
private List<TopicMetadata> fetchTopicMetadataFromBroker(String broker, List<Pattern> blacklist,
    List<Pattern> whitelist) {
  List<TopicMetadata> allMetadata = fetchTopicMetadataFromBroker(broker);
  if (allMetadata == null) {
    // Propagate the fetch failure unchanged to the caller.
    return null;
  }
  List<TopicMetadata> survivors = Lists.newArrayList();
  for (TopicMetadata metadata : allMetadata) {
    if (!DatasetFilterUtils.survived(metadata.topic(), blacklist, whitelist)) {
      continue;
    }
    survivors.add(metadata);
  }
  return survivors;
}
// Clear the blacklist so the subsequent fetch is constrained only by the whitelist.
state.setProp(KafkaSource.TOPIC_BLACKLIST, StringUtils.EMPTY);
// Fetch every topic passing the (now empty) blacklist and the configured whitelist.
List<KafkaTopic> allTopics = kafkaConsumerClient.getFilteredTopics(
    DatasetFilterUtils.getPatternList(state, KafkaSource.TOPIC_BLACKLIST),
    DatasetFilterUtils.getPatternList(state, KafkaSource.TOPIC_WHITELIST));
Optional<Config> runtimeConfig = ConfigClientUtils.getOptionalRuntimeConfig(properties);