@Test
public void testCommonOperation() {
  // Source rows exercised by every transformation below.
  List<String> input = Arrays.asList("hello", "world");
  Dataset<String> dataset = spark.createDataset(input, Encoders.STRING());
  Assert.assertEquals("hello", dataset.first());

  // filter: keep only strings beginning with 'h'.
  Dataset<String> startsWithH =
      dataset.filter((FilterFunction<String>) value -> value.startsWith("h"));
  Assert.assertEquals(Arrays.asList("hello"), startsWithH.collectAsList());

  // map: each string to its length.
  Dataset<Integer> lengths =
      dataset.map((MapFunction<String, Integer>) String::length, Encoders.INT());
  Assert.assertEquals(Arrays.asList(5, 5), lengths.collectAsList());

  // mapPartitions: upper-case every element of each partition.
  Dataset<String> upperCased = dataset.mapPartitions(
      (MapPartitionsFunction<String, String>) partition -> {
        List<String> result = new LinkedList<>();
        while (partition.hasNext()) {
          result.add(partition.next().toUpperCase(Locale.ROOT));
        }
        return result.iterator();
      },
      Encoders.STRING());
  Assert.assertEquals(Arrays.asList("HELLO", "WORLD"), upperCased.collectAsList());

  // flatMap: explode each string into its individual characters.
  Dataset<String> characters = dataset.flatMap(
      (FlatMapFunction<String, String>) word -> {
        List<String> result = new LinkedList<>();
        for (char ch : word.toCharArray()) {
          result.add(String.valueOf(ch));
        }
        return result.iterator();
      },
      Encoders.STRING());
  Assert.assertEquals(
      Arrays.asList("h", "e", "l", "l", "o", "w", "o", "r", "l", "d"),
      characters.collectAsList());
}
@Test
public void testCommonOperation() {
  // Source rows exercised by every transformation below.
  List<String> input = Arrays.asList("hello", "world");
  Dataset<String> dataset = spark.createDataset(input, Encoders.STRING());
  Assert.assertEquals("hello", dataset.first());

  // filter: keep only strings beginning with 'h'.
  Dataset<String> startsWithH =
      dataset.filter((FilterFunction<String>) value -> value.startsWith("h"));
  Assert.assertEquals(Arrays.asList("hello"), startsWithH.collectAsList());

  // map: each string to its length.
  Dataset<Integer> lengths =
      dataset.map((MapFunction<String, Integer>) String::length, Encoders.INT());
  Assert.assertEquals(Arrays.asList(5, 5), lengths.collectAsList());

  // mapPartitions: upper-case every element of each partition.
  Dataset<String> upperCased = dataset.mapPartitions(
      (MapPartitionsFunction<String, String>) partition -> {
        List<String> result = new LinkedList<>();
        while (partition.hasNext()) {
          result.add(partition.next().toUpperCase(Locale.ROOT));
        }
        return result.iterator();
      },
      Encoders.STRING());
  Assert.assertEquals(Arrays.asList("HELLO", "WORLD"), upperCased.collectAsList());

  // flatMap: explode each string into its individual characters.
  Dataset<String> characters = dataset.flatMap(
      (FlatMapFunction<String, String>) word -> {
        List<String> result = new LinkedList<>();
        for (char ch : word.toCharArray()) {
          result.add(String.valueOf(ch));
        }
        return result.iterator();
      },
      Encoders.STRING());
  Assert.assertEquals(
      Arrays.asList("h", "e", "l", "l", "o", "w", "o", "r", "l", "d"),
      characters.collectAsList());
}
@Test
public void testCommonOperation() {
  // Source rows exercised by every transformation below.
  List<String> input = Arrays.asList("hello", "world");
  Dataset<String> dataset = spark.createDataset(input, Encoders.STRING());
  Assert.assertEquals("hello", dataset.first());

  // filter: keep only strings beginning with 'h'.
  Dataset<String> startsWithH =
      dataset.filter((FilterFunction<String>) value -> value.startsWith("h"));
  Assert.assertEquals(Arrays.asList("hello"), startsWithH.collectAsList());

  // map: each string to its length.
  Dataset<Integer> lengths =
      dataset.map((MapFunction<String, Integer>) String::length, Encoders.INT());
  Assert.assertEquals(Arrays.asList(5, 5), lengths.collectAsList());

  // mapPartitions: upper-case every element of each partition.
  Dataset<String> upperCased = dataset.mapPartitions(
      (MapPartitionsFunction<String, String>) partition -> {
        List<String> result = new LinkedList<>();
        while (partition.hasNext()) {
          result.add(partition.next().toUpperCase(Locale.ROOT));
        }
        return result.iterator();
      },
      Encoders.STRING());
  Assert.assertEquals(Arrays.asList("HELLO", "WORLD"), upperCased.collectAsList());

  // flatMap: explode each string into its individual characters.
  Dataset<String> characters = dataset.flatMap(
      (FlatMapFunction<String, String>) word -> {
        List<String> result = new LinkedList<>();
        for (char ch : word.toCharArray()) {
          result.add(String.valueOf(ch));
        }
        return result.iterator();
      },
      Encoders.STRING());
  Assert.assertEquals(
      Arrays.asList("h", "e", "l", "l", "o", "w", "o", "r", "l", "d"),
      characters.collectAsList());
}
// NOTE(review): fragment — the receiver of this .flatMap chain and the `routes`
// variable are declared above this view; presumably `routes` is the result of
// this chain — confirm against the full file.
// cache() before count() so the routed dataset is not recomputed by downstream
// consumers after this debug-count action — TODO confirm that is the intent.
.flatMap(messageRouterFunction(profilerProps, profiles, globals), Encoders.bean(MessageRoute.class)); LOG.debug("Generated {} message route(s)", routes.cache().count());
@Override public ConceptMaps withConceptMaps(Dataset<ConceptMap> conceptMaps) { Dataset<UrlAndVersion> newMembers = getUrlAndVersions(conceptMaps); if (hasDuplicateUrlAndVersions(newMembers) || conceptMaps.count() != newMembers.count()) { throw new IllegalArgumentException( "Cannot add concept maps having duplicate conceptMapUri and conceptMapVersion"); } // Remove the concept contents for persistence. This is most easily done in the ConceptMap // object by setting the group to an empty list. Dataset<ConceptMap> withoutConcepts = conceptMaps .map((MapFunction<ConceptMap,ConceptMap>) conceptMap -> { // Remove the elements rather than the groups to preserved the // "unmapped" structure in a group that can refer to other // concept maps. ConceptMap withoutElements = conceptMap.copy(); List<ConceptMapGroupComponent> updatedGroups = new ArrayList<>(); for (ConceptMapGroupComponent group: withoutElements.getGroup()) { group.setElement(new ArrayList<>()); updatedGroups.add(group); } withoutElements.setGroup(updatedGroups); return withoutElements; }, CONCEPT_MAP_ENCODER); Dataset<Mapping> newMappings = conceptMaps.flatMap(ConceptMaps::expandMappingsIterator, MAPPING_ENCODER); return withConceptMaps(withoutConcepts, newMappings); }
@Override public ConceptMaps withConceptMaps(Dataset<ConceptMap> conceptMaps) { Dataset<UrlAndVersion> newMembers = getUrlAndVersions(conceptMaps); if (hasDuplicateUrlAndVersions(newMembers) || conceptMaps.count() != newMembers.count()) { throw new IllegalArgumentException( "Cannot add concept maps having duplicate conceptMapUri and conceptMapVersion"); } // Remove the concept contents for persistence. This is most easily done in the ConceptMap // object by setting the group to an empty list. Dataset<ConceptMap> withoutConcepts = conceptMaps .map((MapFunction<ConceptMap,ConceptMap>) conceptMap -> { // Remove the elements rather than the groups to preserved the // "unmapped" structure in a group that can refer to other // concept maps. ConceptMap withoutElements = conceptMap.copy(); List<ConceptMapGroupComponent> updatedGroups = new ArrayList<>(); for (ConceptMapGroupComponent group: withoutElements.getGroup()) { group.setElement(new ArrayList<>()); updatedGroups.add(group); } withoutElements.setGroup(updatedGroups); return withoutElements; }, CONCEPT_MAP_ENCODER); Dataset<Mapping> newMappings = conceptMaps.flatMap(ConceptMaps::expandMappingsIterator, MAPPING_ENCODER); return withConceptMaps(withoutConcepts, newMappings); }
@Override public ConceptMaps withConceptMaps(Dataset<ConceptMap> conceptMaps) { Dataset<UrlAndVersion> newMembers = getUrlAndVersions(conceptMaps); if (hasDuplicateUrlAndVersions(newMembers) || conceptMaps.count() != newMembers.count()) { throw new IllegalArgumentException( "Cannot add concept maps having duplicate conceptMapUri and conceptMapVersion"); } // Remove the concept contents for persistence. This is most easily done in the ConceptMap // object by setting the group to an empty list. Dataset<ConceptMap> withoutConcepts = conceptMaps .map((MapFunction<ConceptMap,ConceptMap>) conceptMap -> { // Remove the elements rather than the groups to preserved the // "unmapped" structure in a group that can refer to other // concept maps. ConceptMap withoutElements = conceptMap.copy(); List<ConceptMapGroupComponent> updatedGroups = new ArrayList<>(); for (ConceptMapGroupComponent group: withoutElements.getGroup()) { group.setElement(new ArrayList<>()); updatedGroups.add(group); } withoutElements.setGroup(updatedGroups); return withoutElements; }, CONCEPT_MAP_ENCODER); Dataset<Mapping> newMappings = conceptMaps.flatMap(ConceptMaps::expandMappingsIterator, MAPPING_ENCODER); return withConceptMaps(withoutConcepts, newMappings); }
// NOTE(review): fragment — the `},` here closes a map(...) lambda that begins
// above this view, encoded with VALUE_SET_ENCODER. The flatMap then expands
// each ValueSet into its individual Value rows — confirm against the full file.
}, VALUE_SET_ENCODER); Dataset<Value> newValues = valueSets.flatMap(ValueSets::expandValuesIterator, getValueEncoder());
// NOTE(review): fragment — the `},` here closes a map(...) lambda that begins
// above this view, encoded with VALUE_SET_ENCODER. The flatMap then expands
// each ValueSet into its individual Value rows — confirm against the full file.
}, VALUE_SET_ENCODER); Dataset<Value> newValues = valueSets.flatMap(ValueSets::expandValuesIterator, getValueEncoder());
// NOTE(review): fragment — the `},` here closes a map(...) lambda that begins
// above this view, encoded with VALUE_SET_ENCODER. The flatMap then expands
// each ValueSet into its individual Value rows — confirm against the full file.
}, VALUE_SET_ENCODER); Dataset<Value> newValues = valueSets.flatMap(ValueSets::expandValuesIterator, getValueEncoder());
/**
 * Reads the text file(s) at the given path as a dataset of rows, running each
 * line through the configured translator when one is present.
 *
 * @param path file or directory path readable by Spark's text source
 * @return the raw lines, or the translated rows when a translator is configured
 * @throws Exception propagated from translator construction
 */
private Dataset<Row> readText(String path) throws Exception {
  Dataset<Row> lines = Contexts.getSparkSession().read().text(path);

  if (translatorConfig == null) {
    // No translator configured: hand back the raw lines unchanged.
    return lines;
  }

  // Pair each line with a key in the shape the translate function expects.
  Dataset<Tuple2<String, String>> keyedLines = lines.map(
      new PrepareLineForTranslationFunction(),
      Encoders.tuple(Encoders.STRING(), Encoders.STRING()));
  TranslateFunction<String, String> translate = getTranslateFunction(translatorConfig);
  return keyedLines.flatMap(translate, RowEncoder.apply(translate.getSchema()));
}
// Word count: split each line on single spaces, flatten into one word per row,
// then group on the default string column ("value") and count occurrences.
// NOTE(review): `lines` is declared above this view — presumably a
// Dataset<String>, given the FlatMapFunction<String, String> cast; confirm.
Dataset<Row> wordCounts = lines.flatMap( (FlatMapFunction<String, String>) x -> Arrays.asList(x.split(" ")).iterator(), Encoders.STRING()).groupBy("value").count();