private static Map<String, Integer> buildIDIndexMapping(JavaRDD<String[]> parsedRDD, boolean user) {
    int offset = user ? 0 : 1;
    Map<String, Integer> reverseIDLookup = parsedRDD.map(tokens -> tokens[offset])
            .distinct().sortBy(s -> s, true, parsedRDD.getNumPartitions())
            .zipWithIndex().mapValues(Long::intValue)
            .collectAsMap();
    // Clone, due to some serialization problems with the result of collectAsMap?
    return new HashMap<>(reverseIDLookup);
}
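This method maps each distinct ID to a dense, 0-based index: it extracts the ID column, deduplicates, sorts, and lets zipWithIndex assign contiguous positions. A minimal usage sketch follows; the sample token arrays, the assumption that column 0 holds user IDs and column 1 holds item IDs, and the existing JavaSparkContext sc are all illustrative, not from the original source.

// Hypothetical input: each record is [userID, itemID]
JavaRDD<String[]> parsedRDD = sc.parallelize(Arrays.asList(
        new String[]{"u2", "i9"},
        new String[]{"u1", "i9"},
        new String[]{"u1", "i3"}));

Map<String, Integer> userIndex = buildIDIndexMapping(parsedRDD, true);   // {u1=0, u2=1}
Map<String, Integer> itemIndex = buildIDIndexMapping(parsedRDD, false);  // {i3=0, i9=1}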
@Test
public void zipWithIndex() {
    List<Integer> dataArray = Arrays.asList(1, 2, 3, 4);
    JavaPairRDD<Integer, Long> zip = sc.parallelize(dataArray).zipWithIndex();
    JavaRDD<Long> indexes = zip.values();
    List<Long> correctIndexes = Arrays.asList(0L, 1L, 2L, 3L);
    assertEquals(correctIndexes, indexes.collect());
}
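A complementary check, sketched here rather than taken from the original test suite: zipWithIndex keeps each original element as the key of its pair, so both the elements and their contiguous indices can be asserted (sc is assumed to be the same test JavaSparkContext as above).

@Test
public void zipWithIndexKeepsElements() {
    List<Integer> dataArray = Arrays.asList(1, 2, 3, 4);
    JavaPairRDD<Integer, Long> zip = sc.parallelize(dataArray).zipWithIndex();
    // Keys are the original elements, in their original order
    assertEquals(dataArray, zip.keys().collect());
    // Values are the contiguous 0-based indices
    assertEquals(Arrays.asList(0L, 1L, 2L, 3L), zip.values().collect());
}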
@Override
public SparkPairStream<T, Long> zipWithIndex() {
    return new SparkPairStream<>(rdd.zipWithIndex());
}
/**
 * Save a {@code JavaRDD<List<Writable>>} to a Hadoop {@link org.apache.hadoop.io.MapFile}. Each record is
 * given a <i>unique and contiguous</i> {@link LongWritable} key, and values are stored as
 * {@link RecordWritable} instances.<br>
 * <b>Note</b>: If contiguous keys are not required, using a sequence file instead is preferable from a performance
 * point of view. Contiguous keys are often only required for non-Spark use cases, such as with
 * {@link org.datavec.hadoop.records.reader.mapfile.MapFileRecordReader}
 * <p>
 * Use {@link #restoreMapFile(String, JavaSparkContext)} to restore values saved with this method.
 *
 * @param path           Path to save the MapFile
 * @param rdd            RDD to save
 * @param c              Configuration object, used to customise options for the map file
 * @param maxOutputFiles Nullable. If non-null: first coalesce the RDD to the specified size (number of partitions)
 *                       to limit the maximum number of output map files
 * @see #saveMapFileSequences(String, JavaRDD)
 * @see #saveSequenceFile(String, JavaRDD)
 */
public static void saveMapFile(String path, JavaRDD<List<Writable>> rdd, Configuration c,
                @Nullable Integer maxOutputFiles) {
    path = FilenameUtils.normalize(path, true);
    if (maxOutputFiles != null) {
        rdd = rdd.coalesce(maxOutputFiles);
    }
    JavaPairRDD<List<Writable>, Long> dataIndexPairs = rdd.zipWithIndex(); //Note: Long values are unique + contiguous, but requires a count
    JavaPairRDD<LongWritable, RecordWritable> keyedByIndex =
                    dataIndexPairs.mapToPair(new RecordSavePrepPairFunction());

    keyedByIndex.saveAsNewAPIHadoopFile(path, LongWritable.class, RecordWritable.class, MapFileOutputFormat.class, c);
}
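A hedged usage sketch, not from the original source: it assumes the method above is reachable as a static method of the DataVec Spark storage utilities (referred to here as SparkStorageUtils), and the local Spark context, toy records, output path, and partition cap are all illustrative.

import java.util.Arrays;
import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;
import org.datavec.api.writable.IntWritable;
import org.datavec.api.writable.Text;
import org.datavec.api.writable.Writable;

public class SaveMapFileExample {
    public static void main(String[] args) {
        JavaSparkContext sc = new JavaSparkContext("local[*]", "saveMapFileExample");

        // Two toy records; the values are purely illustrative
        List<List<Writable>> data = Arrays.asList(
                Arrays.<Writable>asList(new Text("alice"), new IntWritable(1)),
                Arrays.<Writable>asList(new Text("bob"), new IntWritable(2)));
        JavaRDD<List<Writable>> rdd = sc.parallelize(data);

        // maxOutputFiles = 1 coalesces the RDD so a single map file is written
        // (SparkStorageUtils is an assumed class name for the method shown above)
        Configuration conf = new Configuration();
        SparkStorageUtils.saveMapFile("/tmp/example-mapfile", rdd, conf, 1);

        sc.stop();
    }
}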
/**
 * Save a {@code JavaRDD<List<List<Writable>>>} to a Hadoop {@link org.apache.hadoop.io.MapFile}. Each record is
 * given a <i>unique and contiguous</i> {@link LongWritable} key, and values are stored as
 * {@link SequenceRecordWritable} instances.<br>
 * <b>Note</b>: If contiguous keys are not required, using a sequence file instead is preferable from a performance
 * point of view. Contiguous keys are often only required for non-Spark use cases, such as with
 * {@link org.datavec.hadoop.records.reader.mapfile.MapFileSequenceRecordReader}<br>
 * <p>
 * Use {@link #restoreMapFileSequences(String, JavaSparkContext)} to restore values saved with this method.
 *
 * @param path           Path to save the MapFile
 * @param rdd            RDD to save
 * @param c              Configuration object, used to customise options for the map file
 * @param maxOutputFiles Nullable. If non-null: first coalesce the RDD to the specified size (number of partitions)
 *                       to limit the maximum number of output map files
 * @see #saveMapFile(String, JavaRDD)
 * @see #saveSequenceFile(String, JavaRDD)
 */
public static void saveMapFileSequences(String path, JavaRDD<List<List<Writable>>> rdd, Configuration c,
                @Nullable Integer maxOutputFiles) {
    path = FilenameUtils.normalize(path, true);
    if (maxOutputFiles != null) {
        rdd = rdd.coalesce(maxOutputFiles);
    }
    JavaPairRDD<List<List<Writable>>, Long> dataIndexPairs = rdd.zipWithIndex();
    JavaPairRDD<LongWritable, SequenceRecordWritable> keyedByIndex =
                    dataIndexPairs.mapToPair(new SequenceRecordSavePrepPairFunction());

    keyedByIndex.saveAsNewAPIHadoopFile(path, LongWritable.class, SequenceRecordWritable.class, MapFileOutputFormat.class, c);
}
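The sequence variant is used the same way, with one extra level of nesting: each record is a list of time steps, and each time step is a list of Writables. A minimal illustrative fragment, under the same assumptions as the sketch above (sc, the SparkStorageUtils class name, and the path are hypothetical):

// One sequence = a list of time steps, each time step a list of Writables
List<List<Writable>> sequence = Arrays.asList(
        Arrays.<Writable>asList(new IntWritable(0), new Text("start")),
        Arrays.<Writable>asList(new IntWritable(1), new Text("end")));
JavaRDD<List<List<Writable>>> seqRdd = sc.parallelize(Arrays.asList(sequence));

// Illustrative path; maxOutputFiles = 1 writes a single map file
SparkStorageUtils.saveMapFileSequences("/tmp/example-seq-mapfile", seqRdd, new Configuration(), 1);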
@Override
public SparkStream<T> limit(long number) {
    Preconditions.checkArgument(number >= 0, "Limit number must be non-negative.");
    if (number == 0) {
        return StreamingContext.distributed().empty();
    }
    return new SparkStream<>(rdd.zipWithIndex().filter(p -> p._2() < number).map(Tuple2::_1));
}
@Override
public SparkStream<T> skip(long n) {
    if (n > count()) {
        return getContext().empty();
    } else if (n <= 0) {
        return this;
    }
    return new SparkStream<>(rdd.zipWithIndex().filter(p -> p._2() > n - 1).map(Tuple2::_1));
}
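Both limit and skip rely on the same idiom: zipWithIndex attaches a 0-based position to every element, which is then compared against the cut-off before the index is dropped again. A standalone sketch of that idiom in plain Spark; the data, cut-offs, and local context are hypothetical:

import java.util.Arrays;
import java.util.List;

import org.apache.spark.api.java.JavaRDD;
import org.apache.spark.api.java.JavaSparkContext;

public class LimitSkipSketch {
    public static void main(String[] args) {
        JavaSparkContext sc = new JavaSparkContext("local[*]", "limitSkipSketch");
        JavaRDD<String> rdd = sc.parallelize(Arrays.asList("a", "b", "c", "d", "e"));

        long number = 2; // limit cut-off
        long n = 3;      // skip cut-off

        // limit(2): keep elements whose index is < 2  -> [a, b]
        List<String> limited = rdd.zipWithIndex()
                .filter(p -> p._2() < number)
                .map(p -> p._1())
                .collect();

        // skip(3): keep elements whose index is >= 3 -> [d, e]
        List<String> skipped = rdd.zipWithIndex()
                .filter(p -> p._2() >= n)
                .map(p -> p._1())
                .collect();

        System.out.println(limited + " " + skipped);
        sc.stop();
    }
}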
// Attach a row index to each parsed record, then convert the pairs to input rows
final JavaPairRDD<Object[], Long> zipWithIndex = parsedInput.zipWithIndex();
return zipWithIndex.map(new ValuesToInputRowFunction(_sparkJobContext));
} else if (datastore instanceof FixedWidthDatastore) {
    final JavaPairRDD<Object[], Long> zipWithIndex = parsedInput.zipWithIndex();