@Override
public PCollectionView<T> expand(PCollection<T> input) {
  try {
    GroupByKey.applicableTo(input);
  } catch (IllegalStateException e) {
    throw new IllegalStateException("Unable to create a side-input view from input", e);
  }
  Combine.Globally<T, T> singletonCombine =
      Combine.globally(new SingletonCombineFn<>(hasDefault, input.getCoder(), defaultValue));
  if (!hasDefault) {
    singletonCombine = singletonCombine.withoutDefaults();
  }
  return input.apply(singletonCombine.asSingletonView());
}
}
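For context, a minimal, hypothetical usage sketch (the input name "numbers" is an assumption): pipeline authors normally reach an expand() like the one above through View.asSingleton() or Combine.Globally#asSingletonView rather than applying it directly.

  // Hypothetical input collection; the view can later be read via c.sideInput(maxView).
  PCollectionView<Integer> maxView =
      numbers.apply(Max.integersGlobally().asSingletonView());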
@Test
public void testWithDefaultsPreservesSideInputs() {
  final PCollectionView<Integer> view =
      pipeline.apply(Create.of(1)).apply(Sum.integersGlobally().asSingletonView());
  Combine.Globally<Integer, String> combine =
      Combine.globally(new TestCombineFnWithContext(view)).withSideInputs(view).withoutDefaults();
  assertEquals(Collections.singletonList(view), combine.getSideInputs());
}
@Test
@Category(ValidatesRunner.class)
public void testHotKeyCombiningWithAccumulationMode() {
  PCollection<Integer> input = pipeline.apply(Create.of(1, 2, 3, 4, 5));

  PCollection<Integer> output =
      input
          .apply(
              Window.<Integer>into(new GlobalWindows())
                  .triggering(Repeatedly.forever(AfterPane.elementCountAtLeast(1)))
                  .accumulatingFiredPanes()
                  .withAllowedLateness(new Duration(0), ClosingBehavior.FIRE_ALWAYS))
          .apply(Sum.integersGlobally().withoutDefaults().withFanout(2))
          .apply(ParDo.of(new GetLast()));

  PAssert.that(output)
      .satisfies(
          input1 -> {
            assertThat(input1, hasItem(15));
            return null;
          });

  pipeline.run();
}
/**
 * Test to read data from an embedded Elasticsearch instance and verify that the data is read
 * successfully.
 */
@Test
public void testHifIOWithElastic() {
  // The expected hashcode was computed once at insertion time and is hardcoded here.
  String expectedHashCode = "a62a85f5f081e3840baf1028d4d6c6bc";
  Configuration conf = getConfiguration();
  PCollection<KV<Text, LinkedMapWritable>> esData =
      pipeline.apply(HadoopInputFormatIO.<Text, LinkedMapWritable>read().withConfiguration(conf));
  PCollection<Long> count = esData.apply(Count.globally());
  // Verify that the count of objects fetched using HIFInputFormat IO is correct.
  PAssert.thatSingleton(count).isEqualTo((long) TEST_DATA_ROW_COUNT);
  PCollection<LinkedMapWritable> values = esData.apply(Values.create());
  PCollection<String> textValues = values.apply(transformFunc);
  // Verify the output values using checksum comparison.
  PCollection<String> consolidatedHashcode =
      textValues.apply(Combine.globally(new HashingFn()).withoutDefaults());
  PAssert.that(consolidatedHashcode).containsInAnyOrder(expectedHashCode);
  pipeline.run().waitUntilFinish();
}
@Test
@Category(ValidatesRunner.class)
public void testSessionsCombine() {
  PCollection<KV<String, Integer>> input =
      pipeline
          .apply(
              Create.timestamped(
                      TimestampedValue.of(KV.of("a", 1), new Instant(0L)),
                      TimestampedValue.of(KV.of("a", 1), new Instant(4L)),
                      TimestampedValue.of(KV.of("a", 4), new Instant(7L)),
                      TimestampedValue.of(KV.of("b", 1), new Instant(10L)),
                      TimestampedValue.of(KV.of("b", 13), new Instant(16L)))
                  .withCoder(KvCoder.of(StringUtf8Coder.of(), BigEndianIntegerCoder.of())))
          .apply(Window.into(Sessions.withGapDuration(Duration.millis(5))));

  PCollection<Integer> sum =
      input.apply(Values.create()).apply(Combine.globally(new SumInts()).withoutDefaults());
  PCollection<KV<String, String>> sumPerKey = input.apply(Combine.perKey(new TestCombineFn()));

  PAssert.that(sum).containsInAnyOrder(7, 13);
  PAssert.that(sumPerKey)
      .containsInAnyOrder(Arrays.asList(KV.of("a", "114"), KV.of("b", "1"), KV.of("b", "13")));
  pipeline.run();
}
/**
 * This test reads data from the Cassandra instance and verifies that the data is read
 * successfully.
 */
@Test
public void testHIFReadForCassandra() {
  // The expected hashcode was computed once at insertion time and is hardcoded here.
  String expectedHashCode = "1a30ad400afe4ebf5fde75f5d2d95408";
  Long expectedRecordsCount = 1000L;
  Configuration conf = getConfiguration(options);
  PCollection<KV<Long, String>> cassandraData =
      pipeline.apply(
          HadoopInputFormatIO.<Long, String>read()
              .withConfiguration(conf)
              .withValueTranslation(myValueTranslate));
  PAssert.thatSingleton(cassandraData.apply("Count", Count.globally()))
      .isEqualTo(expectedRecordsCount);
  PCollection<String> textValues = cassandraData.apply(Values.create());
  // Verify the output values using checksum comparison.
  PCollection<String> consolidatedHashcode =
      textValues.apply(Combine.globally(new HashingFn()).withoutDefaults());
  PAssert.that(consolidatedHashcode).containsInAnyOrder(expectedHashCode);
  pipeline.run().waitUntilFinish();
}
@Test
@Category({ValidatesRunner.class, UsesCustomWindowMerging.class})
public void testMergingCustomWindows() {
  Instant startInstant = new Instant(0L);
  PCollection<String> inputCollection =
      pipeline.apply(
          Create.timestamped(
              TimestampedValue.of("big", startInstant.plus(Duration.standardSeconds(10))),
              TimestampedValue.of("small1", startInstant.plus(Duration.standardSeconds(20))),
              // This element falls outside bigWindow, so it will not be merged.
              TimestampedValue.of("small2", startInstant.plus(Duration.standardSeconds(39)))));
  PCollection<String> windowedCollection =
      inputCollection.apply(Window.into(new CustomWindowFn<>()));
  PCollection<Long> count =
      windowedCollection.apply(Combine.globally(Count.<String>combineFn()).withoutDefaults());
  // "big" and "small1" are merged into bigWindow; "small2" is not merged because its
  // timestamp falls outside bigWindow.
  PAssert.that("Wrong number of elements in output collection", count).containsInAnyOrder(2L, 1L);
  pipeline.run();
}
protected void runTestSimpleCombineWithContext(
    List<KV<String, Integer>> table,
    int globalSum,
    List<KV<String, String>> perKeyCombines,
    String[] globallyCombines) {
  PCollection<KV<String, Integer>> perKeyInput = createInput(pipeline, table);
  PCollection<Integer> globallyInput = perKeyInput.apply(Values.create());

  PCollection<Integer> sum = globallyInput.apply("Sum", Combine.globally(new SumInts()));
  PCollectionView<Integer> globallySumView = sum.apply(View.asSingleton());

  PCollection<KV<String, String>> combinePerKey =
      perKeyInput.apply(
          Combine.<String, Integer, String>perKey(new TestCombineFnWithContext(globallySumView))
              .withSideInputs(globallySumView));

  PCollection<String> combineGlobally =
      globallyInput.apply(
          Combine.globally(new TestCombineFnWithContext(globallySumView))
              .withoutDefaults()
              .withSideInputs(globallySumView));

  PAssert.that(sum).containsInAnyOrder(globalSum);
  PAssert.that(combinePerKey).containsInAnyOrder(perKeyCombines);
  PAssert.that(combineGlobally).containsInAnyOrder(globallyCombines);
  pipeline.run();
}
private void runRead() {
  PCollection<Scientist> output =
      pipelineRead.apply(
          CassandraIO.<Scientist>read()
              .withHosts(options.getCassandraHost())
              .withPort(options.getCassandraPort())
              .withMinNumberOfSplits(20)
              .withKeyspace(KEYSPACE)
              .withTable(TABLE)
              .withEntity(Scientist.class)
              .withCoder(SerializableCoder.of(Scientist.class)));

  PCollection<String> consolidatedHashcode =
      output
          .apply(ParDo.of(new SelectNameFn()))
          .apply("Hash row contents", Combine.globally(new HashingFn()).withoutDefaults());
  PAssert.thatSingleton(consolidatedHashcode)
      .isEqualTo(TestRow.getExpectedHashForRowCount(options.getNumberOfRecords()));

  pipelineRead.run().waitUntilFinish();
}
@Override
public PCollectionView<OutputT> expand(PCollection<InputT> input) {
  PCollection<OutputT> combined =
      input.apply(Combine.<InputT, OutputT>globally(fn).withoutDefaults().withFanout(fanout));
  PCollection<KV<Void, OutputT>> materializationInput =
      combined.apply(new VoidKeyToMultimapMaterialization<>());
  PCollectionView<OutputT> view =
      PCollectionViews.singletonView(
          materializationInput,
          input.getWindowingStrategy(),
          insertDefault,
          insertDefault ? fn.defaultValue() : null,
          combined.getCoder());
  materializationInput.apply(CreatePCollectionView.of(view));
  return view;
}
@Override
public PCollection<?> expand(PCollection<T> input) {
  input = input.apply(Combine.globally(combineFn).withoutDefaults().withFanout(fanout));
  @SuppressWarnings("unchecked")
  Coder<BoundedWindow> windowCoder =
      (Coder<BoundedWindow>) input.getWindowingStrategy().getWindowFn().windowCoder();
  return BatchViewAsSingleton.applyForSingleton(
      runner,
      input,
      new IsmRecordForSingularValuePerWindowDoFn<>(windowCoder),
      input.getCoder(),
      view);
}
@Override
public PCollection<List<KV<String, Long>>> expand(PCollection<KV<String, Long>> sessions) {
  SerializableComparator<KV<String, Long>> comparator =
      (o1, o2) ->
          ComparisonChain.start()
              .compare(o1.getValue(), o2.getValue())
              .compare(o1.getKey(), o2.getKey())
              .result();
  return sessions
      .apply(Window.into(CalendarWindows.months(1)))
      .apply(Top.of(1, comparator).withoutDefaults());
}
}
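A hedged usage sketch, assuming the expand() above belongs to a composite transform (called TopPerMonth here, a made-up name) applied to per-key session totals named "sessions":

  // Hypothetical names: TopPerMonth wraps the expand() above; sessions is a
  // PCollection<KV<String, Long>> of (key, session total) pairs.
  PCollection<List<KV<String, Long>>> topPerMonth = sessions.apply(new TopPerMonth());

Note that withoutDefaults() is required here: the input is windowed into non-global (calendar-month) windows, and a global combine over non-global windows must choose either withoutDefaults() or asSingletonView().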
@Override
public PCollection<ElemT> expand(PCollection<ElemT> input) {
  // Concatenate all elements in each window into a single List, then register the
  // result as the backing collection for the Gearpump view.
  input
      .apply(Combine.globally(new Concatenate<ElemT>()).withoutDefaults())
      .apply(CreateGearpumpPCollectionView.of(view));
  return input;
}
@Override
public PCollection<T> expand(PCollection<T> input) {
  return ((PCollection<T>) input.apply(Combine.globally(new Concatenate<T>()).withoutDefaults()))
      .apply(CreateApexPCollectionView.of(view));
}
@Override
public PCollection<ElemT> expand(PCollection<ElemT> input) {
  input
      .apply(Combine.globally(new Concatenate<ElemT>()).withoutDefaults())
      .apply(CreateFlinkPCollectionView.of(view));
  return input;
}
@Override
public PCollection<ElemT> expand(PCollection<ElemT> input) {
  // Concatenate all elements in each window into a single List, write it out through
  // the streaming view writer, and register the result as a Dataflow view.
  return input
      .apply(Combine.globally(new Concatenate<ElemT>()).withoutDefaults())
      .apply(ParDo.of(StreamingPCollectionViewWriterFn.create(view, input.getCoder())))
      .apply(CreateDataflowView.forStreaming(view));
}
}
@Test
@Category({ValidatesRunner.class, DataflowPortabilityApiUnsupported.class})
public void testWindowedCombineEmpty() {
  PCollection<Double> mean =
      pipeline
          .apply(Create.empty(BigEndianIntegerCoder.of()))
          .apply(Window.into(FixedWindows.of(Duration.millis(1))))
          .apply(Combine.globally(new MeanInts()).withoutDefaults());
  PAssert.that(mean).empty();
  pipeline.run();
}
@Override
public PCollection<ElemT> expand(PCollection<ElemT> input) {
  input
      .apply(Combine.globally(new Concatenate<ElemT>()).withoutDefaults())
      .apply(CreateFlinkPCollectionView.<ElemT, ViewT>of(view));
  return input;
}
@Override
public PCollection<T> expand(PCollection<T> in) {
  // Keep at most 'limit' elements per window, then flatten the resulting Iterable
  // back into individual elements.
  return in.apply(Combine.globally(new SampleAnyCombineFn<T>(limit)).withoutDefaults())
      .apply(Flatten.iterables());
}
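This expansion matches the shape of Beam's Sample.any(long): the combine fn accumulates up to limit elements per window, and Flatten.iterables() re-emits them individually. A hedged usage sketch, assuming an input PCollection<String> named "lines":

  // Hypothetical input collection; emits up to 10 arbitrary elements of lines.
  PCollection<String> anyTen = lines.apply(Sample.any(10));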