private static Map<String, Integer> buildIDIndexMapping(JavaRDD<String[]> parsedRDD, boolean user) {
    int offset = user ? 0 : 1;
    Map<String, Integer> reverseIDLookup = parsedRDD.map(tokens -> tokens[offset])
            .distinct().sortBy(s -> s, true, parsedRDD.getNumPartitions())
            .zipWithIndex().mapValues(Long::intValue)
            .collectAsMap();
    // Clone, due to some serialization problems with the result of collectAsMap?
    return new HashMap<>(reverseIDLookup);
}
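This method maps each distinct ID to a dense, 0-based index: it extracts the ID column, deduplicates, sorts, and lets zipWithIndex assign contiguous positions. A minimal usage sketch follows; the sample token arrays, the assumption that column 0 holds user IDs and column 1 holds item IDs, and the existing JavaSparkContext sc are all illustrative, not from the original source.

// Hypothetical input: each record is [userID, itemID]
JavaRDD<String[]> parsedRDD = sc.parallelize(Arrays.asList(
        new String[]{"u2", "i9"},
        new String[]{"u1", "i9"},
        new String[]{"u1", "i3"}));

Map<String, Integer> userIndex = buildIDIndexMapping(parsedRDD, true);   // {u1=0, u2=1}
Map<String, Integer> itemIndex = buildIDIndexMapping(parsedRDD, false);  // {i3=0, i9=1}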
@Test
public void zipWithIndex() {
    List<Integer> dataArray = Arrays.asList(1, 2, 3, 4);
    JavaPairRDD<Integer, Long> zip = sc.parallelize(dataArray).zipWithIndex();
    JavaRDD<Long> indexes = zip.values();
    List<Long> correctIndexes = Arrays.asList(0L, 1L, 2L, 3L);
    assertEquals(correctIndexes, indexes.collect());
}
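A complementary check, sketched here rather than taken from the original test suite: zipWithIndex keeps each original element as the key of its pair, so both the elements and their contiguous indices can be asserted (sc is assumed to be the same test JavaSparkContext as above).

@Test
public void zipWithIndexKeepsElements() {
    List<Integer> dataArray = Arrays.asList(1, 2, 3, 4);
    JavaPairRDD<Integer, Long> zip = sc.parallelize(dataArray).zipWithIndex();
    // Keys are the original elements, in their original order
    assertEquals(dataArray, zip.keys().collect());
    // Values are the contiguous 0-based indices
    assertEquals(Arrays.asList(0L, 1L, 2L, 3L), zip.values().collect());
}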
@Override
public SparkPairStream<T, Long> zipWithIndex() {
    return new SparkPairStream<>(rdd.zipWithIndex());
}
/**
 * Save a {@code JavaRDD<List<Writable>>} to a Hadoop {@link org.apache.hadoop.io.MapFile}. Each record is
 * given a <i>unique and contiguous</i> {@link LongWritable} key, and values are stored as
 * {@link RecordWritable} instances.<br>
 * <b>Note</b>: If contiguous keys are not required, using a sequence file instead is preferable from a performance
 * point of view. Contiguous keys are often only required for non-Spark use cases, such as with
 * {@link org.datavec.hadoop.records.reader.mapfile.MapFileRecordReader}
 * <p>
 * Use {@link #restoreMapFile(String, JavaSparkContext)} to restore values saved with this method.
 *
 * @param path           Path to save the MapFile
 * @param rdd            RDD to save
 * @param c              Configuration object, used to customise options for the map file
 * @param maxOutputFiles Nullable. If non-null: first coalesce the RDD to the specified size (number of partitions)
 *                       to limit the maximum number of output map files
 * @see #saveMapFileSequences(String, JavaRDD)
 * @see #saveSequenceFile(String, JavaRDD)
 */
public static void saveMapFile(String path, JavaRDD<List<Writable>> rdd, Configuration c,
                @Nullable Integer maxOutputFiles) {
    path = FilenameUtils.normalize(path, true);
    if (maxOutputFiles != null) {
        rdd = rdd.coalesce(maxOutputFiles);
    }
    JavaPairRDD<List<Writable>, Long> dataIndexPairs = rdd.zipWithIndex(); //Note: Long values are unique + contiguous, but requires a count
    JavaPairRDD<LongWritable, RecordWritable> keyedByIndex =
                    dataIndexPairs.mapToPair(new RecordSavePrepPairFunction());

    keyedByIndex.saveAsNewAPIHadoopFile(path, LongWritable.class, RecordWritable.class, MapFileOutputFormat.class, c);
}
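A hedged usage sketch, not from the original source: it assumes the method above is reachable as a static method of the DataVec Spark storage utilities (referred to here as SparkStorageUtils), and the local Spark context, toy records, output path, and partition cap are all illustrative.

import java.util.Arrays;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.datavec.api.writable.IntWritable;
import org.datavec.api.writable.Text;
import org.datavec.api.writable.Writable;

public class SaveMapFileExample {
    public static void main(String[] args) {
        JavaSparkContext sc = new JavaSparkContext("local[*]", "saveMapFileExample");

        // Two toy records; the values are purely illustrative
        List<List<Writable>> data = Arrays.asList(
                Arrays.<Writable>asList(new Text("alice"), new IntWritable(1)),
                Arrays.<Writable>asList(new Text("bob"), new IntWritable(2)));
        JavaRDD<List<Writable>> rdd = sc.parallelize(data);

        // maxOutputFiles = 1 coalesces the RDD so a single map file is written
        // (SparkStorageUtils is an assumed class name for the method shown above)
        Configuration conf = new Configuration();
        SparkStorageUtils.saveMapFile("/tmp/example-mapfile", rdd, conf, 1);

        sc.stop();
    }
}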
/**
 * Save a {@code JavaRDD<List<List<Writable>>>} to a Hadoop {@link org.apache.hadoop.io.MapFile}. Each record is
 * given a <i>unique and contiguous</i> {@link LongWritable} key, and values are stored as
 * {@link SequenceRecordWritable} instances.<br>
 * <b>Note</b>: If contiguous keys are not required, using a sequence file instead is preferable from a performance
 * point of view. Contiguous keys are often only required for non-Spark use cases, such as with
 * {@link org.datavec.hadoop.records.reader.mapfile.MapFileSequenceRecordReader}<br>
 * <p>
 * Use {@link #restoreMapFileSequences(String, JavaSparkContext)} to restore values saved with this method.
 *
 * @param path           Path to save the MapFile
 * @param rdd            RDD to save
 * @param c              Configuration object, used to customise options for the map file
 * @param maxOutputFiles Nullable. If non-null: first coalesce the RDD to the specified size (number of partitions)
 *                       to limit the maximum number of output map files
 * @see #saveMapFile(String, JavaRDD)
 * @see #saveSequenceFile(String, JavaRDD)
 */
public static void saveMapFileSequences(String path, JavaRDD<List<List<Writable>>> rdd, Configuration c,
                @Nullable Integer maxOutputFiles) {
    path = FilenameUtils.normalize(path, true);
    if (maxOutputFiles != null) {
        rdd = rdd.coalesce(maxOutputFiles);
    }
    JavaPairRDD<List<List<Writable>>, Long> dataIndexPairs = rdd.zipWithIndex();
    JavaPairRDD<LongWritable, SequenceRecordWritable> keyedByIndex =
                    dataIndexPairs.mapToPair(new SequenceRecordSavePrepPairFunction());

    keyedByIndex.saveAsNewAPIHadoopFile(path, LongWritable.class, SequenceRecordWritable.class, MapFileOutputFormat.class, c);
}
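The sequence variant is used the same way, with one extra level of nesting: each record is a list of time steps, and each time step is a list of Writables. A minimal illustrative fragment, under the same assumptions as the sketch above (sc, the SparkStorageUtils class name, and the path are hypothetical):

// One sequence = a list of time steps, each time step a list of Writables
List<List<Writable>> sequence = Arrays.asList(
        Arrays.<Writable>asList(new IntWritable(0), new Text("start")),
        Arrays.<Writable>asList(new IntWritable(1), new Text("end")));
JavaRDD<List<List<Writable>>> seqRdd = sc.parallelize(Arrays.asList(sequence));

// Illustrative path; maxOutputFiles = 1 writes a single map file
SparkStorageUtils.saveMapFileSequences("/tmp/example-seq-mapfile", seqRdd, new Configuration(), 1);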
@Override
public SparkStream<T> limit(long number) {
    Preconditions.checkArgument(number >= 0, "Limit number must be non-negative.");
    if (number == 0) {
        return StreamingContext.distributed().empty();
    }
    return new SparkStream<>(rdd.zipWithIndex().filter(p -> p._2() < number).map(Tuple2::_1));
}
@Override
public SparkStream<T> skip(long n) {
    if (n > count()) {
        return getContext().empty();
    } else if (n <= 0) {
        return this;
    }
    return new SparkStream<>(rdd.zipWithIndex().filter(p -> p._2() > n - 1).map(Tuple2::_1));
}
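Both limit and skip rely on the same idiom: zipWithIndex attaches a 0-based position to every element, which is then compared against the cut-off before the index is dropped again. A standalone sketch of that idiom in plain Spark; the data, cut-offs, and local context are hypothetical:

import java.util.Arrays;
import java.util.List;

import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class LimitSkipSketch {
    public static void main(String[] args) {
        JavaSparkContext sc = new JavaSparkContext("local[*]", "limitSkipSketch");
        JavaRDD<String> rdd = sc.parallelize(Arrays.asList("a", "b", "c", "d", "e"));

        long number = 2; // limit cut-off
        long n = 3;      // skip cut-off

        // limit(2): keep elements whose index is < 2  -> [a, b]
        List<String> limited = rdd.zipWithIndex()
                .filter(p -> p._2() < number)
                .map(p -> p._1())
                .collect();

        // skip(3): keep elements whose index is >= 3 -> [d, e]
        List<String> skipped = rdd.zipWithIndex()
                .filter(p -> p._2() >= n)
                .map(p -> p._1())
                .collect();

        System.out.println(limited + " " + skipped);
        sc.stop();
    }
}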
// Attach a row index to each parsed record, then convert the pairs to input rows
final JavaPairRDD<Object[], Long> zipWithIndex = parsedInput.zipWithIndex();
return zipWithIndex.map(new ValuesToInputRowFunction(_sparkJobContext));
} else if (datastore instanceof FixedWidthDatastore) {
    final JavaPairRDD<Object[], Long> zipWithIndex = parsedInput.zipWithIndex();