.of(lines) .using(LogParser::parseLine) .output();
.output() .persist(out);
.of(counted) .using(p -> p.getFirst() + ":" + p.getSecond()) .output();
.of(MapElements.of(left) .using(i -> i) .output(new Util.TestHint(), new Util.TestHint2(), new Util.TestHint2()), right) .by(String::length, String::length)
.of(MapElements.of(left).using(i -> i).output(new Util.TestHint(), new Util.TestHint2()), right) .by(String::length, String::length)
@Test(timeout = 5000L) public void testInputMultiConsumption() throws InterruptedException, ExecutionException { final int N = 1000; Dataset<Integer> input = flow.createInput( ListDataSource.unbounded(sequenceInts(0, N))); // there seems to be bug in LocalExecutor // that makes it impossible to consume the // same dataset twice by single union operator Dataset<Integer> first = MapElements.of(input) .using(e -> e) .output(); Dataset<Integer> second = MapElements.of(input) .using(e -> e) .output(); // ~ consume the input another time Dataset<Integer> union = Union.of(first, second) .output(); Dataset<Pair<Integer, Integer>> sum = ReduceByKey .of(union) .keyBy(e -> 0) .valueBy(e -> e) .reduceBy(Sums.ofInts()) .output(); ListDataSink<Pair<Integer, Integer>> sumOut = ListDataSink.get(); sum.persist(sumOut); executor.submit(flow).get(); DatasetAssert.unorderedEquals( sumOut.getOutputs(), Pair.of(0, 2 * (N - 1) * N / 2)); }
/**
 * Builds the pipeline under test: per-hour counts of distinct (type, value)
 * pairs, each result stamped with the end instant of its hour window.
 *
 * <p>Input triples carry (event instant, type, value); output triples carry
 * (window-end instant, type, distinct-value count).
 */
@Override
protected Dataset<Triple<Instant, Type, Long>> getOutput(
    Dataset<Triple<Instant, Type, String>> input) {
  // distinct implemented using raw ReduceStateByKey
  // event time is taken from the Instant in the triple's first field
  input = AssignEventTime.of(input).using(t -> t.getFirst().toEpochMilli()).output();
  // key by (type, value) within 1-hour windows; the actual values are
  // irrelevant (null) — DistinctState presumably emits each key at most
  // once per window (TODO confirm against DistinctState implementation)
  Dataset<Pair<ComparablePair<Type, String>, Object>> pairs =
      ReduceStateByKey.of(input)
          .keyBy(t -> new ComparablePair<>(t.getSecond(), t.getThird()))
          .valueBy(t -> null)
          .stateFactory(DistinctState::new)
          // merging is a no-op — NOTE(review): assumes Time windowing never
          // actually merges states; verify
          .mergeStatesBy((t, os) -> {})
          .windowBy(Time.of(Duration.ofHours(1)))
          .output();
  // keep only the deduplicated (type, value) keys
  Dataset<ComparablePair<Type, String>> distinct =
      MapElements.of(pairs)
          .using(Pair::getFirst)
          .output();
  // count distinct values per type, re-windowed by the same 1-hour windows
  Dataset<Pair<Type, Long>> reduced =
      ReduceByKey.of(distinct)
          .keyBy(ComparablePair::getFirst)
          .valueBy(p -> 1L)
          .combineBy(Sums.ofLongs())
          .windowBy(Time.of(Duration.ofHours(1)))
          .output();
  // extract window timestamp
  return FlatMap.of(reduced)
      .using((Pair<Type, Long> p, Collector<Triple<Instant, Type, Long>> ctx) -> {
        long windowEnd = ((TimeInterval) ctx.getWindow()).getEndMillis();
        ctx.collect(Triple.of(Instant.ofEpochMilli(windowEnd), p.getFirst(), p.getSecond()));
      })
      .output();
}
/**
 * Builds the pipeline under test: per-hour counts of distinct (type, value)
 * pairs, each result stamped with the end instant of its hour window.
 *
 * <p>NOTE(review): this body is byte-identical to another {@code getOutput}
 * override visible in this file chunk — likely sibling test classes sharing
 * the same pipeline; consider extracting a shared helper.
 */
@Override
protected Dataset<Triple<Instant, Type, Long>> getOutput(
    Dataset<Triple<Instant, Type, String>> input) {
  // distinct implemented using raw ReduceStateByKey
  // event time is taken from the Instant in the triple's first field
  input = AssignEventTime.of(input).using(t -> t.getFirst().toEpochMilli()).output();
  // key by (type, value) within 1-hour windows; the actual values are
  // irrelevant (null) — DistinctState presumably emits each key at most
  // once per window (TODO confirm against DistinctState implementation)
  Dataset<Pair<ComparablePair<Type, String>, Object>> pairs =
      ReduceStateByKey.of(input)
          .keyBy(t -> new ComparablePair<>(t.getSecond(), t.getThird()))
          .valueBy(t -> null)
          .stateFactory(DistinctState::new)
          // merging is a no-op — NOTE(review): assumes Time windowing never
          // actually merges states; verify
          .mergeStatesBy((t, os) -> {})
          .windowBy(Time.of(Duration.ofHours(1)))
          .output();
  // keep only the deduplicated (type, value) keys
  Dataset<ComparablePair<Type, String>> distinct =
      MapElements.of(pairs)
          .using(Pair::getFirst)
          .output();
  // count distinct values per type, re-windowed by the same 1-hour windows
  Dataset<Pair<Type, Long>> reduced =
      ReduceByKey.of(distinct)
          .keyBy(ComparablePair::getFirst)
          .valueBy(p -> 1L)
          .combineBy(Sums.ofLongs())
          .windowBy(Time.of(Duration.ofHours(1)))
          .output();
  // extract window timestamp
  return FlatMap.of(reduced)
      .using((Pair<Type, Long> p, Collector<Triple<Instant, Type, Long>> ctx) -> {
        long windowEnd = ((TimeInterval) ctx.getWindow()).getEndMillis();
        ctx.collect(Triple.of(Instant.ofEpochMilli(windowEnd), p.getFirst(), p.getSecond()));
      })
      .output();
}
/**
 * Pushes a small dataset through the HFile sink and verifies the rows are
 * bulk-loaded into HBase, the temp bulk-load directory is cleaned up, and no
 * _SUCCESS marker is written.
 */
@Test
public void testWrite() throws IOException {
  List<String> data = Arrays.asList("a", "b", "bbb", "bbbb", "c", "xy");
  // feed the input in reverse order — presumably to exercise the sink's
  // internal sorting before bulk-load; TODO confirm
  List<String> inputs = data.stream()
      .sorted(Comparator.reverseOrder())
      .collect(Collectors.toList());
  File tmp = folder.newFolder();
  tmp.deleteOnExit();
  ListDataSource<String> source = ListDataSource.unbounded(inputs);
  Dataset<String> input = flow.createInput(source);
  // map each string to a KeyValue and persist through the HFile sink,
  // everything landing in a single "global" window directory
  MapElements.of(input)
      .using(HBaseTestCase::kv)
      .output()
      .persist(traceLoading(HFileSink.newBuilder()
          .withTable(table.getNameAsString())
          .withConfiguration(cluster.getConfiguration())
          .withOutputPath(new Path("file://" + tmp.getPath()))
          .windowBy(GlobalWindowing.get(), w -> "global")
          .build()));
  new LocalExecutor().submit(flow).join();
  // we should not have success marker
  assertFalse(new File(tmp, "_SUCCESS").exists());
  // traceLoading recorded exactly the one window path that was bulk-loaded
  assertEquals(Collections.singletonList("file:" + tmp.getPath() + "/global"), loadedPaths);
  // validate that the bulk load directory was deleted
  assertFalse(new File(tmp + "/global", "t").exists());
  // validate that the data have been written to hbase
  data.forEach(s -> assertArrayEquals(b(s), get(s)));
}
@Test public void testDistinctOnStreamUsingWindowingLabels() throws Exception { Flow flow = Flow.create("Test"); Dataset<String> lines = flow.createInput( ListDataSource.unbounded(asList( "one two three four one one two", "one two three three three")) .withReadDelay(Duration.ofSeconds(2))); // expand it to words Dataset<String> words = FlatMap.of(lines) .using(toWords(w -> w)) .output(); Dataset<Pair<TimeInterval, String>> output = FlatMap.of(Distinct.of(words).windowBy(Time.of(Duration.ofSeconds(1))).output()) .using((UnaryFunctor<String, Pair<TimeInterval, String>>) (elem, context) -> context.collect(Pair.of((TimeInterval) context.getWindow(), elem))) .output(); ListDataSink<String> out = ListDataSink.get(); // strip the labels again because we cannot test them MapElements.of(output) .using(Pair::getSecond) .output() .persist(out); executor.submit(flow).get(); DatasetAssert.unorderedEquals( out.getOutputs(), "four", "one", "three", "two", "one", "three", "two"); }
@Test
public void test() throws Exception {
  Flow flow = Flow.create("Test");
  ListDataSink<Triple<String, String, String>> sink = ListDataSink.get();

  Dataset<Pair<String, String>> pairs = flow.createInput(ListDataSource.unbounded(
      Arrays.asList(
          Pair.of("foo", "bar"),
          Pair.of("quux", "ibis"))));

  // identity-style mapping to force an intermediate dataset in the flow
  Dataset<Pair<String, String>> intermediate = MapElements.of(pairs)
      .using(e -> Pair.of(e.getFirst(), e.getSecond()))
      .output();

  // prefix each pair with a constant marker and persist
  MapElements.of(intermediate)
      .using(e -> Triple.of("uf", e.getFirst(), e.getSecond()))
      .output()
      .persist(sink);

  new TestFlinkExecutor().submit(flow).get();

  DatasetAssert.unorderedEquals(
      sink.getOutputs(),
      Triple.of("uf", "foo", "bar"),
      Triple.of("uf", "quux", "ibis"));
}
}
@Test
public void test() {
  final Configuration hadoopConf = new Configuration();
  final String outDir =
      Paths.get(tmp.getRoot().getAbsolutePath(), testName).toAbsolutePath().toString();

  // wire source -> identity map -> sink and run the flow locally
  final Flow flow = Flow.create();
  final S dataSource = dataSinkTester.prepareDataSource();
  final T dataSink = dataSinkTester.buildSink(outDir, hadoopConf, useLazyOutputFormat);
  MapElements.of(flow.createInput(dataSource)).using(p -> p).output().persist(dataSink);

  final Executor exec = new LocalExecutor().setDefaultParallelism(4);
  exec.submit(flow).join();

  final String[] producedFiles = new File(outDir).list();
  assertNotNull(producedFiles);

  // only the reduce-phase output files ("part-r-*") are of interest
  final List<String> reduceFiles = Arrays.stream(producedFiles)
      .filter(name -> name.startsWith("part-r-"))
      .collect(Collectors.toList());
  assertEquals(expectedNumberOfReduceOutputs, reduceFiles.size());

  // parse every reduce output file and compare against the expectation
  final List<O> actual = reduceFiles
      .stream()
      .flatMap(dataSinkTester.extractOutputFunction(outDir, hadoopConf))
      .collect(Collectors.toList());
  DatasetAssert.unorderedEquals(dataSinkTester.expectedOutput(), actual);
}
@Test(expected = IllegalArgumentException.class)
public void testMultipleOutputsToSameSink() throws Exception {
  flow = Flow.create(getClass().getSimpleName());
  input = flow.createInput(new MockStreamDataSource<>());

  Dataset<Object> identity = MapElements.of(input).using(e -> e).output();

  Dataset<Pair<Object, Long>> counts = ReduceByKey
      .of(identity)
      .keyBy(e -> e)
      .reduceBy(values -> 1L)
      .windowBy(Time.of(Duration.ofSeconds(1)))
      .output();

  Dataset<Pair<Object, Long>> joined = Join.of(identity, counts)
      .by(e -> e, Pair::getFirst)
      .using((Object l, Pair<Object, Long> r, Collector<Long> c) -> {
        c.collect(r.getSecond());
      })
      .windowBy(Time.of(Duration.ofSeconds(1)))
      .output();

  // persisting two different datasets into the same sink instance is
  // illegal — unfolding the flow is expected to throw
  ListDataSink<Pair<Object, Long>> sharedSink = ListDataSink.get();
  joined.persist(sharedSink);
  counts.persist(sharedSink);

  FlowUnfolder.unfold(flow, Executor.getBasicOps());
}
@Test public void testBuild_WithCounters() { Flow flow = Flow.create("TEST"); Dataset<String> dataset = Util.createMockDataset(flow, 1); Dataset<String> mapped = MapElements.named("Map1") .of(dataset) .using((input, context) -> { // use simple counter context.getCounter("my-counter").increment(); return input.toLowerCase(); }) .output(); assertEquals(flow, mapped.getFlow()); assertEquals(1, flow.size()); MapElements map = (MapElements) flow.operators().iterator().next(); assertEquals(flow, map.getFlow()); assertEquals("Map1", map.getName()); assertNotNull(map.getMapper()); assertEquals(mapped, map.output()); }