private PartitionConsumer getPartitionConsumer(DatasetContext context) {
  PartitionedFileSet lines = context.getDataset(partitionedFileSetName);
  return new ConcurrentPartitionConsumer(lines, new DelegatingStatePersistor(context, statePersistor),
                                         consumerConfiguration);
}
}
/**
 * Resets the process state of the given partition keys, as they were not successfully processed, or discards the
 * partition if it has already been attempted the configured number of times.
 */
protected void abort(ConsumerWorkingSet workingSet, List<? extends PartitionKey> partitionKeys) {
  List<PartitionKey> discardedPartitions = new ArrayList<>();
  for (PartitionKey key : partitionKeys) {
    ConsumablePartition consumablePartition = workingSet.lookup(key);
    assertInProgress(consumablePartition);
    // either reset its processState, or remove it from the workingSet, depending on how many tries it already has
    if (consumablePartition.getNumFailures() < getConfiguration().getMaxRetries()) {
      consumablePartition.retry();
    } else {
      discardedPartitions.add(key);
      consumablePartition.discard();
    }
  }
  if (!discardedPartitions.isEmpty()) {
    LOG.warn("Discarded keys due to being retried {} times: {}",
             getConfiguration().getMaxRetries(), discardedPartitions);
  }
}
@Override
public PartitionConsumerResult doConsume(ConsumerWorkingSet workingSet, PartitionAcceptor acceptor) {
  doExpiry(workingSet);
  workingSet.populate(getPartitionedFileSet(), getConfiguration());
  List<PartitionDetail> toConsume = selectPartitions(acceptor, workingSet);
  return new PartitionConsumerResult(toConsume, removeDiscardedPartitions(workingSet));
}
@Override
public void apply() throws Exception {
  // creating a new PartitionConsumer resets the consumption state. Consuming from it then returns an iterator
  // with all the partition keys
  List<? extends Partition> consumedPartitions =
    new ConcurrentPartitionConsumer(dataset, new InMemoryStatePersistor()).consumePartitions().getPartitions();
  Set<PartitionKey> allKeys = new HashSet<>();
  allKeys.addAll(partitionKeys1);
  allKeys.addAll(partitionKeys2);
  Assert.assertEquals(allKeys, toKeys(consumedPartitions));
}
});
/**
 * Goes through all partitions. If any IN_PROGRESS partition is older than the configured timeout, reset its state
 * to AVAILABLE, unless it has already been retried the configured number of times, in which case it is discarded.
 */
protected void doExpiry(ConsumerWorkingSet workingSet) {
  long expiryTime = getExpiryBorder();
  List<PartitionKey> expiredPartitions = new ArrayList<>();
  List<PartitionKey> discardedPartitions = new ArrayList<>();
  for (ConsumablePartition partition : workingSet.getPartitions()) {
    if (partition.getProcessState() == ProcessState.IN_PROGRESS && partition.getTimestamp() < expiryTime) {
      // either reset its processState, or remove it from the workingSet, depending on how many tries it already has
      if (partition.getNumFailures() < getConfiguration().getMaxRetries()) {
        partition.retry();
      } else {
        // track discarded keys separately, so that the discard warning below actually fires
        discardedPartitions.add(partition.getPartitionKey());
        partition.discard();
      }
      expiredPartitions.add(partition.getPartitionKey());
    }
  }
  if (!expiredPartitions.isEmpty()) {
    LOG.warn("Expiring in progress partitions: {}", expiredPartitions);
    if (!discardedPartitions.isEmpty()) {
      LOG.warn("Discarded keys due to being retried {} times: {}",
               getConfiguration().getMaxRetries(), discardedPartitions);
    }
  }
}
}
/**
 * Removes the partitions that have failed processing the configured number of times from the working set and
 * returns them.
 */
protected List<PartitionDetail> removeDiscardedPartitions(ConsumerWorkingSet workingSet) {
  List<PartitionDetail> failedPartitions = new ArrayList<>();
  Iterator<ConsumablePartition> iter = workingSet.getPartitions().iterator();
  while (iter.hasNext()) {
    ConsumablePartition partition = iter.next();
    if (partition.getProcessState() == ProcessState.DISCARDED) {
      failedPartitions.add(getPartitionedFileSet().getPartition(partition.getPartitionKey()));
      iter.remove();
    }
  }
  return failedPartitions;
}
/**
 * @return a timestamp which determines partition expiry. Partitions with a timestamp smaller (older) than this
 *         value are considered 'expired'.
 */
protected long getExpiryBorder() {
  long now = System.currentTimeMillis();
  long expirationTimeoutMillis = TimeUnit.SECONDS.toMillis(getConfiguration().getTimeout());
  return now - expirationTimeoutMillis;
}
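// For reference, a ConsumerConfiguration that drives the expiry calculation above might be built as
// follows. This is a sketch: the 60-second timeout and retry count are illustrative values, and it
// assumes the builder exposes setTimeout alongside setMaxRetries (only setMaxRetries and
// setPartitionPredicate appear in the snippets below).
ConsumerConfiguration configuration = ConsumerConfiguration.builder()
  .setTimeout(60)      // seconds; partitions IN_PROGRESS longer than this are eligible for expiry
  .setMaxRetries(3)    // expired partitions are retried up to 3 times before being discarded
  .build();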
@Override
public void untake(ConsumerWorkingSet workingSet, List<? extends PartitionKey> partitionKeys) {
  doExpiry(workingSet);
  for (PartitionKey key : partitionKeys) {
    ConsumablePartition consumablePartition = workingSet.lookup(key);
    // don't need to assertInProgress because untake() already does that
    consumablePartition.untake();
  }
}
/**
 * Removes the given partition keys from the working set, as they have been successfully processed.
 */
protected void commit(ConsumerWorkingSet workingSet, List<? extends PartitionKey> partitionKeys) {
  for (PartitionKey key : partitionKeys) {
    ConsumablePartition consumablePartition = workingSet.lookup(key);
    assertInProgress(consumablePartition);
    workingSet.remove(key);
  }
}
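// A hedged sketch of how commit() and abort() above are presumably dispatched when a consumer run
// finishes: successfully processed keys are committed (removed from the working set), failed keys are
// aborted (retried or discarded). This wiring is an assumption for illustration, not the verbatim
// implementation.
protected void doFinish(ConsumerWorkingSet workingSet, List<? extends PartitionKey> partitionKeys,
                        boolean succeeded) {
  doExpiry(workingSet);
  if (succeeded) {
    commit(workingSet, partitionKeys);
  } else {
    abort(workingSet, partitionKeys);
  }
}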
@Override
public void apply() throws Exception {
  // creating a new PartitionConsumer resets the consumption state. Consuming from it then returns an iterator
  // with all the partition keys added after the deletions
  ConcurrentPartitionConsumer partitionConsumer2 =
    new ConcurrentPartitionConsumer(dataset, new InMemoryStatePersistor());
  Assert.assertEquals(partitionKeys2, toKeys(partitionConsumer2.consumePartitions().getPartitions()));
}
});
private List<PartitionDetail> selectPartitions(PartitionAcceptor acceptor, ConsumerWorkingSet workingSet) {
  long now = System.currentTimeMillis();
  List<PartitionDetail> toConsume = new ArrayList<>();
  Iterator<ConsumablePartition> iter = workingSet.getPartitions().iterator();
  while (iter.hasNext()) {
    ConsumablePartition consumablePartition = iter.next();
    if (ProcessState.AVAILABLE != consumablePartition.getProcessState()) {
      continue;
    }
    PartitionDetail partition = getPartitionedFileSet().getPartition(consumablePartition.getPartitionKey());
    if (partition == null) {
      // the partition no longer exists, so skip it and remove it from the working set
      iter.remove();
      continue;
    }
    PartitionAcceptor.Return accept = acceptor.accept(partition);
    switch (accept) {
      case ACCEPT:
        consumablePartition.take();
        consumablePartition.setTimestamp(now);
        toConsume.add(partition);
        continue;
      case SKIP:
        continue;
      case STOP:
        return toConsume;
    }
  }
  return toConsume;
}
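// A minimal sketch of a custom acceptor honoring the ACCEPT/SKIP/STOP protocol that selectPartitions()
// consumes. It assumes PartitionAcceptor is an interface with a single accept(PartitionDetail) method,
// as its use above suggests; the class name and cap are hypothetical, for illustration only.
public class LimitingAcceptor implements PartitionAcceptor {
  private final int limit;
  private int accepted = 0;

  public LimitingAcceptor(int limit) {
    this.limit = limit;
  }

  @Override
  public Return accept(PartitionDetail partitionDetail) {
    // take partitions until the cap is reached; STOP then ends selection for this run,
    // leaving the remaining AVAILABLE partitions for a later run
    return ++accepted <= limit ? Return.ACCEPT : Return.STOP;
  }
}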
/**
 * Used from the initialize method of the implementing batch job to configure a set of {@link Partition}s of a
 * {@link PartitionedFileSet} as input, to be processed by the run of the batch job. It does this by reading back
 * the previous state, determining the new partitions to read, computing the new state, and persisting this new
 * state. It then configures this dataset as input to the mapreduce context that is passed in.
 *
 * @param mapreduceContext MapReduce context used to access the PartitionedFileSet, and on which the input is
 *                         configured
 * @param partitionedFileSetName the name of the {@link PartitionedFileSet} to consume partitions from
 * @param statePersistor a {@link DatasetStatePersistor} responsible for defining how the partition consumer state
 *                       is managed
 * @param consumerConfiguration defines parameters for the partition consumption
 * @return a BatchPartitionCommitter used to persist the state of the partition consumer
 */
public static BatchPartitionCommitter setInput(MapReduceContext mapreduceContext, String partitionedFileSetName,
                                               DatasetStatePersistor statePersistor,
                                               ConsumerConfiguration consumerConfiguration) {
  PartitionedFileSet partitionedFileSet = mapreduceContext.getDataset(partitionedFileSetName);
  final PartitionConsumer partitionConsumer =
    new ConcurrentPartitionConsumer(partitionedFileSet,
                                    new DelegatingStatePersistor(mapreduceContext, statePersistor),
                                    consumerConfiguration);
  final List<PartitionDetail> consumedPartitions = partitionConsumer.consumePartitions().getPartitions();
  Map<String, String> arguments = new HashMap<>();
  PartitionedFileSetArguments.addInputPartitions(arguments, consumedPartitions);
  mapreduceContext.addInput(Input.ofDataset(partitionedFileSetName, arguments));
  return succeeded -> partitionConsumer.onFinish(consumedPartitions, succeeded);
}
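// A minimal sketch of how setInput() might be wired into a batch job's lifecycle, assuming a
// CDAP-style AbstractMapReduce program. The program name, dataset name, and KeyValueTable-backed
// state persistor are hypothetical placeholders, not part of the code above.
public class MyPartitionConsumingMapReduce extends AbstractMapReduce {
  private BatchPartitionCommitter partitionCommitter;

  @Override
  public void initialize() throws Exception {
    MapReduceContext context = getContext();
    // consume new partitions and register them as input for this run
    partitionCommitter = setInput(context, "myPartitionedFileSet",
                                  new KVTableStatePersistor("consumingState", "state.key"),
                                  ConsumerConfiguration.builder().setMaxRetries(3).build());
  }

  @Override
  public void destroy() {
    // persist the consumer state according to whether the run succeeded,
    // so that failed partitions can be retried (or eventually discarded)
    boolean succeeded = getContext().getState().getStatus() == ProgramStatus.COMPLETED;
    partitionCommitter.onFinish(succeeded);
  }
}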
ConsumerConfiguration configuration = ConsumerConfiguration.builder().setMaxRetries(3).build();
final PartitionConsumer partitionConsumer1 = new ConcurrentPartitionConsumer(dataset, persistor, configuration);
final PartitionConsumer partitionConsumer2 = new ConcurrentPartitionConsumer(dataset, persistor, configuration);
final PartitionConsumer partitionConsumer3 = new ConcurrentPartitionConsumer(dataset, persistor, configuration);
ConsumerConfiguration configuration =
  ConsumerConfiguration.builder().setPartitionPredicate(predicate).build();
PartitionConsumer newPartitionConsumer =
  new ConcurrentPartitionConsumer(dataset, new InMemoryStatePersistor(), configuration);
List<Partition> consumedPartitions = new ArrayList<>();
final PartitionConsumer partitionConsumer = new ConcurrentPartitionConsumer(dataset, new InMemoryStatePersistor());
final PartitionConsumer partitionConsumer = new ConcurrentPartitionConsumer(dataset, new InMemoryStatePersistor());
dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {
  @Override
new ConcurrentPartitionConsumer(dataset, new InMemoryStatePersistor(),
                                ConsumerConfiguration.builder().setMaxRetries(1).build());
dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {
new ConcurrentPartitionConsumer(dataset, new InMemoryStatePersistor());
dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {
  @Override
new ConcurrentPartitionConsumer(dataset, new InMemoryStatePersistor(), consumerConfiguration);
dsFrameworkUtil.newInMemoryTransactionExecutor(txAwareDataset).execute(new TransactionExecutor.Subroutine() {
  @Override