/**
 * Attaches a stage that performs the given group-and-aggregate operation.
 * For every distinct key observed in the input it emits a single
 * {@code Map.Entry(key, result)}, where the result is produced by running
 * the supplied aggregate operation over all the items sharing that key.
 *
 * @see com.hazelcast.jet.aggregate.AggregateOperations AggregateOperations
 * @param aggrOp the aggregate operation to perform
 * @param <R> type of the aggregation result
 */
@Nonnull
default <R> BatchStage<Entry<K, R>> aggregate(
        @Nonnull AggregateOperation1<? super T, ?, ? extends R> aggrOp
) {
    // Delegate to the mapToOutputFn overload, wrapping (key, result) as a Map.Entry.
    return aggregate(aggrOp, Util::entry);
}
// NOTE(review): continuation of an aggregate3 overload — the signature starts on a line not
// visible here. The visible code delegates to the (stage1, stage2, aggrOp, mapToOutputFn)
// overload, using Util::entry to emit each result as a Map.Entry(key, result).
@Nonnull AggregateOperation3<? super T, ? super T1, ? super T2, ?, ? extends R> aggrOp ) { return aggregate3(stage1, stage2, aggrOp, Util::entry);
/**
 * Attaches a stage that co-groups this stage with the supplied
 * {@code stage1} and applies the given two-input aggregate operation. It
 * emits one key-value pair (in a {@code Map.Entry}) per distinct key; the
 * value is the operation's result over every item, from either stage, that
 * shares the key.
 * <p>
 * Use this variant when the aggregation must combine both input streams in
 * a single accumulator (refer to the {@linkplain AggregateOperation2
 * Javadoc} for a simple example of such an operation). If your logic
 * decomposes into one single-input aggregate operation per stream, prefer
 * {@link #aggregate2(AggregateOperation1, BatchStageWithKey, AggregateOperation1)
 * stage0.aggregate2(aggrOp0, stage1, aggrOp1)}: it offers a simpler API and
 * lets you reuse the predefined single-input operations.
 *
 * @see com.hazelcast.jet.aggregate.AggregateOperations AggregateOperations
 * @param aggrOp the aggregate operation to perform
 * @param <T1> type of items in {@code stage1}
 * @param <R> type of the aggregation result
 */
@Nonnull
default <T1, R> BatchStage<Entry<K, R>> aggregate2(
        @Nonnull BatchStageWithKey<T1, ? extends K> stage1,
        @Nonnull AggregateOperation2<? super T, ? super T1, ?, R> aggrOp
) {
    // Delegate to the mapToOutputFn overload, emitting results as Map.Entry.
    return aggregate2(stage1, aggrOp, Util::entry);
}
@SuppressWarnings("Convert2MethodRef") // https://bugs.openjdk.java.net/browse/JDK-8154236 private static Pipeline coGroupBuild() { Pipeline p = Pipeline.create(); // Create three source streams BatchStageWithKey<PageVisit, Integer> pageVisits = p.drawFrom(Sources.<PageVisit>list(PAGE_VISIT)) .groupingKey(pageVisit -> pageVisit.userId()); BatchStageWithKey<AddToCart, Integer> addToCarts = p.drawFrom(Sources.<AddToCart>list(ADD_TO_CART)) .groupingKey(addToCart -> addToCart.userId()); BatchStageWithKey<Payment, Integer> payments = p.drawFrom(Sources.<Payment>list(PAYMENT)) .groupingKey(payment -> payment.userId()); // Obtain a builder object for the co-group transform GroupAggregateBuilder<Integer, List<PageVisit>> builder = pageVisits.aggregateBuilder(toList()); Tag<List<PageVisit>> visitTag = builder.tag0(); // Add the co-grouped streams to the builder. Here we add just two, but // you could add any number of them. Tag<List<AddToCart>> cartTag = builder.add(addToCarts, toList()); Tag<List<Payment>> payTag = builder.add(payments, toList()); // Build the co-group transform. The aggregate operation collects all the // stream items into an accumulator class called ItemsByTag. We transform // it into a 3-tuple of lists. BatchStage<Entry<Integer, Tuple3<List<PageVisit>, List<AddToCart>, List<Payment>>>> coGrouped = builder.build((key, res) -> entry(key, tuple3(res.get(visitTag), res.get(cartTag), res.get(payTag)))); // Store the results in the output map coGrouped.drainTo(Sinks.map(RESULT)); return p; }
/**
 * Attaches a stage that suppresses duplicate items, where duplicates are
 * determined by the items' own notion of equality ({@code equals} and
 * {@code hashCode}). Among a set of mutually equal items, which one gets
 * emitted is unspecified.
 *
 * @return the newly attached stage
 */
@Nonnull
default BatchStage<T> distinct() {
    // Group by the item itself and keep one representative per group.
    return groupingKey(wholeItem()).distinct();
}
@SuppressWarnings("Convert2MethodRef") // https://bugs.openjdk.java.net/browse/JDK-8154236 private static Pipeline coGroupBuild() { Pipeline p = Pipeline.create(); // Create three source streams BatchStageWithKey<PageVisit, Integer> pageVisits = p.drawFrom(Sources.<PageVisit>list(PAGE_VISIT)) .groupingKey(pageVisit -> pageVisit.userId()); BatchStageWithKey<AddToCart, Integer> addToCarts = p.drawFrom(Sources.<AddToCart>list(ADD_TO_CART)) .groupingKey(addToCart -> addToCart.userId()); BatchStageWithKey<Payment, Integer> payments = p.drawFrom(Sources.<Payment>list(PAYMENT)) .groupingKey(payment -> payment.userId()); // Obtain a builder object for the co-group transform GroupAggregateBuilder<Integer, List<PageVisit>> builder = pageVisits.aggregateBuilder(toList()); Tag<List<PageVisit>> visitTag = builder.tag0(); // Add the co-grouped streams to the builder. Here we add just two, but // you could add any number of them. Tag<List<AddToCart>> cartTag = builder.add(addToCarts, toList()); Tag<List<Payment>> payTag = builder.add(payments, toList()); // Build the co-group transform. The aggregate operation collects all the // stream items into an accumulator class called ItemsByTag. We transform // it into a 3-tuple of lists. BatchStage<Entry<Integer, Tuple3<List<PageVisit>, List<AddToCart>, List<Payment>>>> coGrouped = builder.build((key, res) -> entry(key, tuple3(res.get(visitTag), res.get(cartTag), res.get(payTag)))); // Store the results in the output map coGrouped.drainTo(Sinks.map(RESULT)); return p; }
/**
 * Builds a word-count pipeline: draws entries from the map named
 * {@code sourceName}, tokenizes each value into lower-cased words, counts
 * occurrences per word, and writes the (word, count) pairs to the map named
 * {@code sinkName}.
 */
public static Pipeline buildPipeline(String sourceName, String sinkName) {
    // Any run of non-word characters separates two words
    Pattern wordSeparator = Pattern.compile("\\W+");
    Pipeline p = Pipeline.create();
    p.drawFrom(Sources.<Integer, String>map(sourceName))
     .flatMap(e -> Traversers.traverseArray(wordSeparator.split(e.getValue().toLowerCase())))
     .filter(word -> !word.isEmpty())
     .groupingKey(wholeItem())
     .aggregate(counting())
     .drainTo(Sinks.map(sinkName));
    return p;
}
// NOTE(review): tail of an aggregate3 convenience overload — its signature is on a line not
// visible here. The visible code composes the three single-input operations into one
// three-input operation whose result is a Tuple3 of the individual results, then delegates
// to the AggregateOperation3-based overload with Util::entry as the output function.
AggregateOperation3<T, T1, T2, ?, Tuple3<R0, R1, R2>> aggrOp = aggregateOperation3(aggrOp0, aggrOp1, aggrOp2, Tuple3::tuple3); return aggregate3(stage1, stage2, aggrOp, Util::entry);
/**
 * Attaches a stage that co-groups this stage with {@code stage1} and, for
 * each distinct grouping key, aggregates each input stage separately:
 * {@code aggrOp0} over this stage's items and {@code aggrOp1} over
 * {@code stage1}'s items. After it has received all the input, it emits for
 * every distinct key a {@code Map.Entry(key, Tuple2(result0, result1))}.
 *
 * @see com.hazelcast.jet.aggregate.AggregateOperations AggregateOperations
 *
 * @param aggrOp0 aggregate operation to perform on this stage
 * @param stage1 the other stage
 * @param aggrOp1 aggregate operation to perform on the other stage
 * @param <R0> type of the aggregation result for stream-0
 * @param <T1> type of items in {@code stage1}
 * @param <R1> type of the aggregation result for stream-1
 */
@Nonnull
default <T1, R0, R1> BatchStage<Entry<K, Tuple2<R0, R1>>> aggregate2(
        @Nonnull AggregateOperation1<? super T, ?, ? extends R0> aggrOp0,
        @Nonnull BatchStageWithKey<? extends T1, ? extends K> stage1,
        @Nonnull AggregateOperation1<? super T1, ?, ? extends R1> aggrOp1
) {
    // Fuse the two single-input operations into one two-input operation
    // whose result pairs both partial results in a Tuple2.
    AggregateOperation2<? super T, ? super T1, ?, Tuple2<R0, R1>> fusedOp =
            aggregateOperation2(aggrOp0, aggrOp1, Tuple2::tuple2);
    return aggregate2(stage1, fusedOp, Util::entry);
}
/**
 * Builds a word-count pipeline over the book lines: splits every line into
 * lower-cased words, counts occurrences per distinct word and stores the
 * counts in the COUNTS map.
 */
private static Pipeline buildPipeline() {
    // Runs of non-word characters act as word boundaries
    Pattern wordBoundary = Pattern.compile("\\W+");
    Pipeline pipeline = Pipeline.create();
    pipeline.drawFrom(Sources.<Long, String>map(BOOK_LINES))
            .flatMap(e -> traverseArray(wordBoundary.split(e.getValue().toLowerCase()))
                    .filter(word -> !word.isEmpty()))
            .groupingKey(wholeItem())
            .aggregate(counting())
            .drainTo(Sinks.map(COUNTS));
    return pipeline;
}
// NOTE(review): tail of an aggregate3 overload taking a custom output function — the
// signature starts on a line not visible here. The visible code composes the three aggregate
// operations into a Tuple3-producing operation, then unpacks the tuple's three fields into
// the caller-supplied four-argument mapToOutputFn.
@Nonnull DistributedQuadFunction<? super K, ? super R0, ? super R1, ? super R2, ? extends OUT> mapToOutputFn ) { return aggregate3(stage1, stage2, aggregateOperation3(aggrOp0, aggrOp1, aggrOp2, Tuple3::tuple3), (key, tuple) -> mapToOutputFn.apply(key, tuple.f0(), tuple.f1(), tuple.f2()));
// NOTE(review): tail of an aggregate2 overload taking a custom output function — the
// signature starts on a line not visible here. The visible code composes the two aggregate
// operations into a Tuple2-producing operation, then unpacks the tuple's two fields into the
// caller-supplied three-argument mapToOutputFn.
@Nonnull DistributedTriFunction<? super K, ? super R0, ? super R1, OUT> mapToOutputFn ) { return aggregate2(stage1, aggregateOperation2(aggrOp0, aggrOp1, Tuple2::tuple2), (key, tuple) -> mapToOutputFn.apply(key, tuple.f0(), tuple.f1()));
// NOTE(review): fragment of a pipeline — the stage this chain continues from, and the
// `delimiter` and `jobConfig` variables, are defined on lines not visible here. The visible
// chain tokenizes each line into lower-cased words, drops empty tokens, counts occurrences
// per distinct word and drains the result to an HDFS sink.
.flatMap(line -> traverseArray(delimiter.split(line.toLowerCase())).filter(w -> !w.isEmpty())) .groupingKey(wholeItem()) .aggregate(counting()) .drainTo(HdfsSinks.hdfs(jobConfig));
@SuppressWarnings("Convert2MethodRef") // https://bugs.openjdk.java.net/browse/JDK-8154236 private static Pipeline coGroupDirect() { Pipeline p = Pipeline.create(); // Create three source streams BatchStageWithKey<PageVisit, Integer> pageVisits = p.drawFrom(Sources.<PageVisit>list(PAGE_VISIT)) .groupingKey(pageVisit -> pageVisit.userId()); BatchStageWithKey<AddToCart, Integer> addToCarts = p.drawFrom(Sources.<AddToCart>list(ADD_TO_CART)) .groupingKey(addToCart -> addToCart.userId()); BatchStageWithKey<Payment, Integer> payments = p.drawFrom(Sources.<Payment>list(PAYMENT)) .groupingKey(payment -> payment.userId()); // Construct the co-group transform. The aggregate operation collects all // the stream items into a 3-tuple of lists. BatchStage<Entry<Integer, Tuple3<List<PageVisit>, List<AddToCart>, List<Payment>>>> coGrouped = pageVisits.aggregate3(toList(), addToCarts, toList(), payments, toList()); // Store the results in the output map coGrouped.drainTo(Sinks.map(RESULT)); return p; }
/** * Builds and returns the Pipeline which represents the actual computation. * To compute the probability of finding word B after A, one has to know * how many pairs contain word A as a first entry and how many of them * contain B as a second entry. The pipeline creates pairs from consecutive * words and computes the probabilities of A->B. */ private static Pipeline buildPipeline() { Pipeline p = Pipeline.create(); // Reads files line-by-line BatchStage<String> lines = p.drawFrom(Sources.<String>files(INPUT_FILE)); Pattern twoWords = Pattern.compile("(\\.|\\w+)\\s(\\.|\\w+)"); // Calculates probabilities by flatmapping lines into two-word consecutive pairs using regular expressions // and aggregates them into an IMap. lines.flatMap(e -> traverseMatcher(twoWords.matcher(e.toLowerCase()), m -> tuple2(m.group(1), m.group(2)))) .groupingKey(Tuple2::f0) .aggregate(buildAggregateOp()) .drainTo(Sinks.map("stateTransitions")); return p; }
@SuppressWarnings("Convert2MethodRef") // https://bugs.openjdk.java.net/browse/JDK-8154236 private static Pipeline coGroupDirect() { Pipeline p = Pipeline.create(); // Create three source streams BatchStageWithKey<PageVisit, Integer> pageVisits = p.drawFrom(Sources.<PageVisit>list(PAGE_VISIT)) .groupingKey(pageVisit -> pageVisit.userId()); BatchStageWithKey<AddToCart, Integer> addToCarts = p.drawFrom(Sources.<AddToCart>list(ADD_TO_CART)) .groupingKey(addToCart -> addToCart.userId()); BatchStageWithKey<Payment, Integer> payments = p.drawFrom(Sources.<Payment>list(PAYMENT)) .groupingKey(payment -> payment.userId()); // Construct the co-group transform. The aggregate operation collects all // the stream items into a 3-tuple of lists. BatchStage<Entry<Integer, Tuple3<List<PageVisit>, List<AddToCart>, List<Payment>>>> coGrouped = pageVisits.aggregate3(toList(), addToCarts, toList(), payments, toList()); // Store the results in the output map coGrouped.drainTo(Sinks.map(RESULT)); return p; }
/** * Helper method to construct the pipeline for the job * * @return the pipeline for the job */ public static Pipeline buildPipeline() { final Pipeline p = Pipeline.create(); // Compute map server side final BatchStage<Horse> c = p.drawFrom(Sources.map(EVENTS_BY_NAME, t -> true, HORSE_FROM_EVENT)); final BatchStage<Entry<Horse, Long>> c2 = c.groupingKey(wholeItem()) .aggregate(counting()) .filter(ent -> ent.getValue() > 1); c2.drainTo(Sinks.map(MULTIPLE)); return p; }
// NOTE(review): fragment — the leading "}))" closes a lambda/stage begun on a line not
// visible here. The visible call aggregates into a map from entry key to occurrence count:
// every item contributes 1L, and counts for the same key are merged with Long::sum.
})) .aggregate(AggregateOperations.toMap(entryKey(), e -> 1L, Long::sum));
/** * Helper method to construct the pipeline for the job * * @return the pipeline for the real-time analysis */ public static Pipeline buildPipeline() { final Pipeline pipeline = Pipeline.create(); // Draw users from the Hazelcast IMDG source BatchStage<User> users = pipeline.drawFrom(Sources.<User, Long, User>map(USER_ID, e -> true, Entry::getValue)); // All bet legs which are single BatchStage<Tuple3<Race, Horse, Bet>> bets = users.flatMap(user -> traverseStream( user.getKnownBets().stream() .filter(Bet::single) .flatMap(bet -> bet.getLegs().stream().map(leg -> tuple3(leg.getRace(), leg.getBacking(), bet))) ) ); // Find for each race the projected loss if each horse was to win BatchStage<Entry<Race, Map<Horse, Double>>> betsByRace = bets.groupingKey(Tuple3::f0).aggregate( AggregateOperations.toMap( Tuple3::f1, t -> t.f2().projectedPayout(t.f1()), // payout if backed horse was to win (l, r) -> l + r ) ); // Write out: (r : (h : losses)) betsByRace.drainTo(Sinks.map(WORST_ID)); return pipeline; }
/**
 * Builds the pipeline: parses an HTTP access log from {@code sourceDir},
 * keeps only responses with codes in [200, 400) (success and redirect),
 * explodes each request path into its sub-paths, counts hits per sub-path
 * and writes the counts to files under {@code targetDir}.
 */
private static Pipeline buildPipeline(String sourceDir, String targetDir) {
    Pipeline pipeline = Pipeline.create();
    pipeline.drawFrom(Sources.files(sourceDir))
            .map(LogLine::parse)
            // keep only 2xx and 3xx responses
            .filter((LogLine line) -> line.getResponseCode() >= 200 && line.getResponseCode() < 400)
            .flatMap(AccessLogAnalyzer::explodeSubPaths)
            .groupingKey(wholeItem())
            .aggregate(counting())
            .drainTo(Sinks.files(targetDir));
    return pipeline;
}