@Override
protected Dataset<Pair<Integer, String>> getOutput(Dataset<Integer> left, Dataset<Long> right) {
  // Join the integer input with the long input, keying the right side by
  // (value % 10), and emit "left+right" strings.
  return Join.of(left, right)
      .by(e -> e, e -> (int) (e % 10), Integer.class)
      .using((Integer l, Long r, Collector<String> c) -> c.collect(l + "+" + r))
      .output();
}
@SuppressWarnings("unchecked")
public static boolean wantTranslateBroadcastHashJoin(Join o) {
  // A broadcast hash join is applicable only when:
  //  * the join has exactly two inputs,
  //  * the side opposite the outer side is hinted to fit in memory
  //    (LEFT join broadcasts the right input and vice versa), and
  //  * the windowing is not a merging one.
  // Fix: use the diamond operator (the original used a raw-typed
  // `new ArrayList(...)`) and program to the List interface.
  final List<Dataset> inputs = new ArrayList<>(o.listInputs());
  if (inputs.size() != 2) {
    return false;
  }
  final Dataset leftDataset = inputs.get(0);
  final Dataset rightDataset = inputs.get(1);
  return (o.getType() == Join.Type.LEFT && hasFitsInMemoryHint(rightDataset.getProducer())
          || o.getType() == Join.Type.RIGHT && hasFitsInMemoryHint(leftDataset.getProducer()))
      && !(o.getWindowing() instanceof MergingWindowing);
}
// NOTE(review): this block is truncated in the extract — the MapElements /
// ReduceStateByKey constructor heads, the condition of the state-selecting
// ternary, and the closing braces are missing. It does not compile as shown;
// the comments below describe the apparent intent — confirm against the
// original file.
@Override
@SuppressWarnings("unchecked")
public DAG<Operator<?, ?>> getBasicOps() {
  final Flow flow = getFlow();
  // presumably: wrap both inputs into a common Either<LEFT, RIGHT> stream
  getName() + "::Map-left", flow, left, Either::left);
  getName() + "::Map-right", flow, right, Either::right);
  // union of the two mapped sides feeds a single ReduceStateByKey
  new Union<>(getName() + "::Union", flow,
      Arrays.asList(leftMap.output(), rightMap.output()));
  getName() + "::ReduceStateByKey", flow, union.output(), keyExtractor, e -> e,
      getWindowing(),
      (StateContext context, Collector ctx) -> {
        StorageProvider storages = context.getStorageProvider();
        // missing condition above: selects between join-state implementations,
        // falling back to an early-emitting state — TODO confirm
        : new EarlyEmittingJoinState(storages, ctx);
      },
      new StateSupport.MergeFromStateMerger<>(), getHints());
@SuppressWarnings("unchecked")
static boolean wantTranslate(Join join, SparkFlowTranslator.AcceptorContext context) {
  // Translatable only when windowing is global (or absent) and the key type is
  // either naturally comparable or has a comparator registered on the context.
  final boolean globalOrNoWindowing =
      join.getWindowing() == null || join.getWindowing() instanceof GlobalWindowing;
  final boolean keyIsOrderable =
      ClassUtils.isComparable(join.getKeyClass()) || context.hasComparator(join.getKeyClass());
  return globalOrNoWindowing && keyIsOrderable;
}
@Override
public Dataset<Pair<KEY, OUT>> output(OutputHint... outputHints) {
  // Build the configured Join operator, register it with the owning flow and
  // return its output dataset.
  final Flow flow = left.getFlow();
  final Join<LEFT, RIGHT, KEY, OUT, W> join =
      new Join<>(name, flow, left, right, leftKeyExtractor, rightKeyExtractor,
          keyClass, joinFunc, type, windowing, Sets.newHashSet(outputHints));
  flow.add(join);
  return join.output();
}
}
@Test public void testBuild() { Flow flow = Flow.create("TEST"); Dataset<String> left = Util.createMockDataset(flow, 2); Dataset<String> right = Util.createMockDataset(flow, 3); Dataset<Pair<Integer, String>> joined = Join.named("Join1") .of(left, right) .by(String::length, String::length) .using((String l, String r, Collector<String> c) -> { // no-op }) .output(); assertEquals(flow, joined.getFlow()); assertEquals(1, flow.size()); Join join = (Join) flow.operators().iterator().next(); assertEquals(flow, join.getFlow()); assertEquals("Join1", join.getName()); assertNotNull(join.leftKeyExtractor); assertNotNull(join.rightKeyExtractor); assertEquals(joined, join.output()); assertNull(join.getWindowing()); assertEquals(Join.Type.INNER, join.getType()); }
// NOTE(review): truncated extract of a broadcast-hash-join translator — the
// opening Preconditions.checkArgument call and several assignment targets
// (the KeyExtractor locals, the collect/broadcast expression) are missing,
// so this does not compile as shown.
    operator.listInputs()
        .stream()
        .anyMatch(input -> hasFitsInMemoryHint(((Dataset) input).getProducer())),
    "Missing broadcastHashJoin hint");
Preconditions.checkArgument(
    operator.getType() == Join.Type.LEFT || operator.getType() == Join.Type.RIGHT,
    "BroadcastJoin supports LEFT and RIGHT joins only");
// fall back to attached windowing when the operator has none configured
final Windowing windowing =
    operator.getWindowing() == null ? AttachedWindowing.INSTANCE : operator.getWindowing();
// key extractors for both sides; the boolean flag presumably marks the
// left-hand side — TODO confirm against KeyExtractor's constructor
new KeyExtractor(operator.getLeftKeyExtractor(), windowing, true);
new KeyExtractor(operator.getRightKeyExtractor(), windowing, false);
switch (operator.getType()) {
  case LEFT: {
    // LEFT join: the right side is collected (broadcast); the left side streams
    right
        .flatMapToPair(rightKeyExtractor)
        .setName(operator.getName() + "::extract-right")
        .collect())));
    left
        .flatMapToPair(leftKeyExtractor)
        .setName(operator.getName() + "::extract-left")
        .flatMapToPair(
            t -> {
@Test public void testBuild_OutputValues() { Flow flow = Flow.create("TEST"); Dataset<String> left = Util.createMockDataset(flow, 2); Dataset<String> right = Util.createMockDataset(flow, 3); Dataset<String> joined = Join.named("JoinValues") .of(left, right) .by(String::length, String::length) .using((String l, String r, Collector<String> c) -> { // no-op }) .outputValues(); assertEquals(flow, joined.getFlow()); assertEquals(2, flow.size()); Join join = (Join) flow.operators().iterator().next(); assertEquals(flow, join.getFlow()); assertEquals("JoinValues", join.getName()); assertNotNull(join.getLeftKeyExtractor()); assertNotNull(join.getRightKeyExtractor()); assertNull(join.getWindowing()); assertEquals(Join.Type.INNER, join.getType()); }
// NOTE(review): truncated extract of a sort-merge batch-join translator — the
// enclosing Preconditions call, the right-side mapToPair head, several
// assignment targets (leftPair/rightPair/partitioner/comparator/union) and
// the final flatMap argument are missing; it does not compile as shown.
    operator.getWindowing() == null || operator.getWindowing() instanceof GlobalWindowing,
    "BatchJoinTranslator only supports GlobalWindowing.");
// tag each left element with a composite (key, LEFT) join key
left.mapToPair(
    se -> {
      final Object key = operator.getLeftKeyExtractor().apply(se.getElement());
      return new Tuple2<>(
          new BatchJoinKey<>(key, BatchJoinKey.Side.LEFT),
          Either.<SparkElement, SparkElement>left(se));
    })
    .setName(operator.getName() + "::wrap-keys");
// mirror image for the right side (head of the mapToPair call is missing)
final Object key = operator.getRightKeyExtractor().apply(se.getElement());
return new Tuple2<>(
    new BatchJoinKey<>(key, BatchJoinKey.Side.RIGHT),
rightPair.setName(operator.getName() + "::wrap-values");
// prefer a user-registered comparator for the key class when available
context.getComparator(operator.getKeyClass()) != null
    ? new BatchJoinKeyComparator<>(context.getComparator(operator.getKeyClass()))
    : new BatchJoinKeyComparator<>(null);
// union both sides, sort within partitions by (key, side), then run the
// streaming join over the sorted iterator
    .setName(operator.getName() + "::union-inputs")
    .repartitionAndSortWithinPartitions(partitioner, comparator)
    .setName(operator.getName() + "::sort-by-key-and-side")
    .mapPartitions(
        iterator -> new JoinIterator<>(new BatchJoinIterator<>(iterator), operator.getType()))
    .setName(operator.getName() + "::create-iterator")
    .flatMap(
// NOTE(review): truncated extract of a Flink broadcast-hash-join translator —
// the join-call head for the LEFT case, the whole RIGHT-case head, and the
// rightExtracted assignment are missing; it does not compile as shown.
final UnaryFunction leftKeyExtractor = originalOperator.getLeftKeyExtractor();
final UnaryFunction rightKeyExtractor = originalOperator.getRightKeyExtractor();
// fall back to attached windowing when none is configured
final Windowing windowing =
    originalOperator.getWindowing() == null
        ? AttachedWindowing.INSTANCE
        : originalOperator.getWindowing();
DataSet<BatchElement<Window, Pair>> leftExtracted =
    left.flatMap(new KeyExtractor(leftKeyExtractor, windowing))
switch (originalOperator.getType()) {
  case LEFT:
    // LEFT join path (the leading join(...) expression is missing here)
    joined = leftExtracted
        .where(new JoinKeySelector())
        .equalTo(new JoinKeySelector())
        .with(new BroadcastFlatJoinFunction(originalOperator.getJoiner()))
        .returns(new TypeHint<BatchElement<Window, Pair>>() {})
        .name(operator.getName() + "::left-broadcast-hash-join");
    // RIGHT join path (case label and expression head are missing here)
        .where(new JoinKeySelector())
        .equalTo(new JoinKeySelector())
        .with(new BroadcastFlatJoinFunction(originalOperator.getJoiner()))
        .returns(new TypeHint<BatchElement<Window, Pair>>() {})
        .name(operator.getName() + "::right-broadcast-hash-join");
    break;
  default:
    throw new IllegalStateException("Invalid type: " + originalOperator.getType() + ".");
@Test public void testBuild_ImplicitName() { Flow flow = Flow.create("TEST"); Dataset<String> left = Util.createMockDataset(flow, 1); Dataset<String> right = Util.createMockDataset(flow, 1); Join.of(left, right) .by(String::length, String::length) .using((String l, String r, Collector<String> c) -> { // no-op }) .output(); Join join = (Join) flow.operators().iterator().next(); assertEquals("Join", join.getName()); }
// NOTE(review): truncated extract of a hints-related test — the output(...)
// call with hints, the stream query whose result feeds findFirst(), and the
// closing of the second/third assertions are missing; it does not compile as
// shown and the intent of the garbled assertions should be verified against
// the original test.
Dataset<String> right = Util.createMockDataset(flow, 1);
Join.named("Join1")
    .of(MapElements.of(left)
        .using(i -> i)
    .findFirst()
    .get();
// the join's inputs should carry the TestHint attached to the producer
assertTrue(join.listInputs()
    .stream()
    .anyMatch(input ->
        ((Dataset) input).getProducer().getHints().contains(new Util.TestHint())));
assertTrue(join.listInputs()
    .stream()
    .anyMatch(input -> ((Dataset) join.listInputs()
        .stream()
        .findFirst()
    ).getProducer().getHints().size());
assertTrue(join.getWindowing() instanceof Time);
@Test
public void testBuild_FullJoin() {
  Flow flow = Flow.create("TEST");
  Dataset<String> left = Util.createMockDataset(flow, 1);
  Dataset<String> right = Util.createMockDataset(flow, 1);

  // FullJoin wraps both sides in Optional and must produce a FULL-typed Join
  FullJoin.named("Join1")
      .of(left, right)
      .by(String::length, String::length)
      .using((Optional<String> l, Optional<String> r, Collector<String> c) ->
          c.collect(l.orElse(null) + r.orElse(null)))
      .output();

  Join join = (Join) flow.operators().iterator().next();
  assertEquals(Join.Type.FULL, join.getType());
}
private static void checkJoinWindowing(Node<Operator<?, ?>> node) { Preconditions.checkState(node.get() instanceof Join); // ~ if a windowing strategy is explicitly provided by the user, all is fine if (((Join) node.get()).getWindowing() != null) { return; } for (Node<Operator<?, ?>> parent : node.getParents()) { if (!isBatched(parent)) { throw new WindowingRequiredException( "Join operator requires either an explicit windowing" + " strategy or needs to be supplied with batched inputs."); } } }
@Test
public void testBuild_Windowing() {
  Flow flow = Flow.create("TEST");
  Dataset<String> left = Util.createMockDataset(flow, 1);
  Dataset<String> right = Util.createMockDataset(flow, 1);

  // an explicit windowBy(...) call must be reflected on the built operator
  Join.named("Join1")
      .of(left, right)
      .by(String::length, String::length)
      .using((String l, String r, Collector<String> c) -> c.collect(l + r))
      .windowBy(Time.of(Duration.ofHours(1)))
      .output();

  Join join = (Join) flow.operators().iterator().next();
  assertTrue(join.getWindowing() instanceof Time);
}
// NOTE(review): truncated extract of a hints test — the body of the Join
// builder after .of(...), the stream whose result feeds findFirst(), and the
// tail of the second assertion are missing; it does not compile as shown.
Dataset<String> outputDataset = Join.named("Join1")
    .of(MapElements.of(left).using(i -> i).output(new Util.TestHint(), new Util.TestHint2()),
        right)
    .findFirst()
    .get();
// the join's inputs should carry the TestHint attached to the producer
assertTrue(join.listInputs()
    .stream()
    .anyMatch(input ->
        ((Dataset) input).getProducer().getHints().contains(new Util.TestHint())));
assertTrue(join.listInputs()
    .stream()
    .anyMatch(input -> ((Dataset) join.listInputs()
        .stream()
        .findFirst()
@Test
public void testBuild_WithCounters() {
  Flow flow = Flow.create("TEST");
  Dataset<String> left = Util.createMockDataset(flow, 2);
  Dataset<String> right = Util.createMockDataset(flow, 3);

  Dataset<Pair<Integer, String>> joined = Join.named("Join1")
      .of(left, right)
      .by(String::length, String::length)
      .using((String l, String r, Collector<String> c) -> {
        // exercise the collector's counter facility alongside emission
        c.getCounter("my-counter").increment();
        c.collect(l + r);
      })
      .output();

  assertEquals(flow, joined.getFlow());
  assertEquals(1, flow.size());

  Join join = (Join) flow.operators().iterator().next();
  assertEquals(flow, join.getFlow());
  assertEquals("Join1", join.getName());
  // consistency fix: use the public accessors instead of reaching into the
  // operator's fields directly, matching the other tests in this suite
  assertNotNull(join.getLeftKeyExtractor());
  assertNotNull(join.getRightKeyExtractor());
  assertEquals(joined, join.output());
  assertNull(join.getWindowing());
  assertEquals(Join.Type.INNER, join.getType());
}
@Test public void testBuild_LeftJoin() { Flow flow = Flow.create("TEST"); Dataset<String> left = Util.createMockDataset(flow, 1); Dataset<String> right = Util.createMockDataset(flow, 1); LeftJoin.named("Join1") .of(left, right) .by(String::length, String::length) .using((String l, Optional<String> r, Collector<String> c) -> { // no-op }) .output(); Join join = (Join) flow.operators().iterator().next(); assertEquals(Join.Type.LEFT, join.getType()); }
private static void checkJoinWindowing(Node<Operator<?, ?>> node) { Preconditions.checkState(node.get() instanceof Join); // ~ if a windowing strategy is explicitly provided by the user, all is fine if (((Join) node.get()).getWindowing() != null) { return; } for (Node<Operator<?, ?>> parent : node.getParents()) { if (!isBatched(parent)) { throw new WindowingRequiredException( "Join operator requires either an explicit windowing" + " strategy or needs to be supplied with batched inputs."); } } }
@Override
public Dataset<Pair<KEY, OUT>> output(OutputHint... outputHints) {
  // Instantiate the Join operator from the builder state, attach it to the
  // flow the left input belongs to, and expose its output.
  final Flow flow = left.getFlow();
  final Join<LEFT, RIGHT, KEY, OUT, W> joinOperator = new Join<>(
      name,
      flow,
      left,
      right,
      leftKeyExtractor,
      rightKeyExtractor,
      keyClass,
      joinFunc,
      type,
      windowing,
      Sets.newHashSet(outputHints));
  flow.add(joinOperator);
  return joinOperator.output();
}
}