@Override
public void call(JavaPairRDD<K, M> rdd, Time time) throws IOException {
    if (rdd.isEmpty()) {
        log.info("RDD was empty, not saving to HDFS");
    } else {
        String file = prefix + "-" + time.milliseconds() + "." + suffix;
        Path path = new Path(file);
        FileSystem fs = FileSystem.get(path.toUri(), hadoopConf);
        if (fs.exists(path)) {
            log.warn("Saved data already existed, possibly from a failed job. Deleting {}", path);
            fs.delete(path, true);
        }
        log.info("Saving RDD to HDFS at {}", file);
        rdd.mapToPair(new ValueToWritableFunction<>(keyClass, messageClass, keyWritableClass, messageWritableClass))
           .saveAsNewAPIHadoopFile(file,
                                   keyWritableClass,
                                   messageWritableClass,
                                   SequenceFileOutputFormat.class,
                                   hadoopConf);
    }
}
}
@SuppressWarnings("unchecked")
@Test
public void writeWithNewAPIHadoopFile() {
    String outputDir = new File(tempDir, "output").getAbsolutePath();
    List<Tuple2<Integer, String>> pairs = Arrays.asList(
            new Tuple2<>(1, "a"),
            new Tuple2<>(2, "aa"),
            new Tuple2<>(3, "aaa"));
    JavaPairRDD<Integer, String> rdd = sc.parallelizePairs(pairs);

    rdd.mapToPair(pair -> new Tuple2<>(new IntWritable(pair._1()), new Text(pair._2())))
       .saveAsNewAPIHadoopFile(outputDir, IntWritable.class, Text.class,
               org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat.class);

    JavaPairRDD<IntWritable, Text> output = sc.sequenceFile(outputDir, IntWritable.class, Text.class);
    assertEquals(pairs.toString(), output.map(Tuple2::toString).collect().toString());
}
metaUrl, segmentId, StringUtil.splitByComma(segmentIds), statOutputPath, tblColRefs, sConf));
colToDictPathRDD.coalesce(1, false)
        .saveAsNewAPIHadoopFile(dictOutputPath, Text.class, Text.class, SequenceFileOutputFormat.class);
@Override
public void writeGraphRDD(final Configuration configuration, final JavaPairRDD<Object, VertexWritable> graphRDD) {
    final org.apache.hadoop.conf.Configuration hadoopConfiguration = ConfUtil.makeHadoopConfiguration(configuration);
    final String outputLocation = hadoopConfiguration.get(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION);
    if (null != outputLocation) {
        // map back to a <NullWritable, VertexWritable> stream for output
        graphRDD.mapToPair(tuple -> new Tuple2<>(NullWritable.get(), tuple._2()))
                .saveAsNewAPIHadoopFile(Constants.getGraphLocation(outputLocation),
                        NullWritable.class,
                        VertexWritable.class,
                        (Class<OutputFormat<NullWritable, VertexWritable>>) hadoopConfiguration.getClass(
                                Constants.GREMLIN_HADOOP_GRAPH_WRITER, OutputFormat.class),
                        hadoopConfiguration);
    }
}
@Override
public <K, V> Iterator<KeyValue<K, V>> writeMemoryRDD(final Configuration configuration, final String memoryKey, JavaPairRDD<K, V> memoryRDD) {
    final org.apache.hadoop.conf.Configuration hadoopConfiguration = ConfUtil.makeHadoopConfiguration(configuration);
    final String outputLocation = hadoopConfiguration.get(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION);
    if (null != outputLocation) {
        // map back to a Hadoop stream for output
        memoryRDD.mapToPair(keyValue -> new Tuple2<>(new ObjectWritable<>(keyValue._1()), new ObjectWritable<>(keyValue._2())))
                 .saveAsNewAPIHadoopFile(Constants.getMemoryLocation(outputLocation, memoryKey),
                         ObjectWritable.class,
                         ObjectWritable.class,
                         SequenceFileOutputFormat.class,
                         hadoopConfiguration);
        try {
            return (Iterator) new ObjectWritableIterator(hadoopConfiguration,
                    new Path(Constants.getMemoryLocation(outputLocation, memoryKey)));
        } catch (final IOException e) {
            throw new IllegalStateException(e.getMessage(), e);
        }
    }
    return Collections.emptyIterator();
}
}
m.getTableName()));
javaRDDPair.saveAsNewAPIHadoopFile("file://dummy", Object.class, BSONObject.class, MongoOutputFormat.class, outputConfig);
return true;
/**
 * Saves the keys from the given javaPairRDD as Avro data with the given schema in a directory or file defined by path.
 */
public <K, V> void saveJavaPairRDDKeys(JavaPairRDD<K, V> javaPairRDD, Schema avroSchema, String path) {
    Preconditions.checkNotNull(javaPairRDD);
    checkSchemaAndPath(avroSchema, path);
    Job job = getJob(avroSchema);
    javaPairRDD.saveAsNewAPIHadoopFile(path, AvroKey.class, NullWritable.class, AvroKeyOutputFormat.class, job.getConfiguration());
}
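// A minimal usage sketch for saveJavaPairRDDKeys above: a hedged illustration only, assuming an existing
// JavaSparkContext `sc` and an instance of the enclosing helper class (called `avroSaver` here, a hypothetical
// name). The schema, record fields and output path are likewise illustrative. The pair keys must be AvroKey
// instances whose datum matches the supplied schema, because AvroKeyOutputFormat writes only the keys.
String schemaJson = "{\"type\":\"record\",\"name\":\"User\",\"fields\":[{\"name\":\"name\",\"type\":\"string\"}]}";
Schema schema = new Schema.Parser().parse(schemaJson);
JavaPairRDD<AvroKey<GenericRecord>, NullWritable> keys =
        sc.parallelize(Arrays.asList("alice", "bob"))
          .mapToPair(name -> {
              // build each record on the executor, parsing the schema from the (serializable) JSON string
              GenericRecord user = new GenericData.Record(new Schema.Parser().parse(schemaJson));
              user.put("name", name);
              return new Tuple2<>(new AvroKey<>(user), NullWritable.get());
          });
avroSaver.saveJavaPairRDDKeys(keys, schema, "/tmp/users-avro");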
@Override
protected void prepareKeyValues(final ImportKeyValueJavaPairRDDToAccumulo operation, final AccumuloKeyRangePartitioner partitioner) throws OperationException {
    final JavaPairRDD<Key, Value> rdd = operation.getInput().repartitionAndSortWithinPartitions(partitioner);
    rdd.saveAsNewAPIHadoopFile(operation.getOutputPath(),
            Key.class,
            Value.class,
            AccumuloFileOutputFormat.class,
            getConfiguration(operation));
}
/**
 * Save a {@code JavaRDD<List<List<Writable>>>} to a Hadoop {@link org.apache.hadoop.io.SequenceFile}. Each record
 * is given a unique (but noncontiguous) {@link LongWritable} key, and values are stored as
 * {@link SequenceRecordWritable} instances.
 * <p>
 * Use {@link #restoreSequenceFileSequences(String, JavaSparkContext)} to restore values saved with this method.
 *
 * @param path           Path to save the sequence file
 * @param rdd            RDD to save
 * @param maxOutputFiles Nullable. If non-null: first coalesce the RDD to the specified size (number of partitions)
 *                       to limit the maximum number of output sequence files
 * @see #saveSequenceFile(String, JavaRDD)
 * @see #saveMapFileSequences(String, JavaRDD)
 */
public static void saveSequenceFileSequences(String path, JavaRDD<List<List<Writable>>> rdd, @Nullable Integer maxOutputFiles) {
    path = FilenameUtils.normalize(path, true);
    if (maxOutputFiles != null) {
        rdd = rdd.coalesce(maxOutputFiles);
    }
    //Note: Long values are unique + NOT contiguous; more efficient than zipWithIndex
    JavaPairRDD<List<List<Writable>>, Long> dataIndexPairs = rdd.zipWithUniqueId();
    JavaPairRDD<LongWritable, SequenceRecordWritable> keyedByIndex =
            dataIndexPairs.mapToPair(new SequenceRecordSavePrepPairFunction());
    keyedByIndex.saveAsNewAPIHadoopFile(path, LongWritable.class, SequenceRecordWritable.class, SequenceFileOutputFormat.class);
}
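// A minimal round-trip sketch for saveSequenceFileSequences, assuming an existing JavaSparkContext `sc`,
// that Writable/IntWritable/Text here are the DataVec writables (org.datavec.api.writable.*), and that the
// companion restoreSequenceFileSequences(String, JavaSparkContext) referenced in the Javadoc returns the
// sequences as a JavaRDD<List<List<Writable>>>. Data and path are illustrative only.
List<List<Writable>> sequence = Arrays.asList(
        Arrays.<Writable>asList(new IntWritable(0), new Text("first step")),
        Arrays.<Writable>asList(new IntWritable(1), new Text("second step")));
JavaRDD<List<List<Writable>>> sequences = sc.parallelize(Collections.singletonList(sequence));
saveSequenceFileSequences("/tmp/sequences-seqfile", sequences, null);    // null: keep the existing partitioning
JavaRDD<List<List<Writable>>> restored = restoreSequenceFileSequences("/tmp/sequences-seqfile", sc);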
/**
 * Save a {@code JavaRDD<List<Writable>>} to a Hadoop {@link org.apache.hadoop.io.SequenceFile}. Each record is
 * given a unique (but noncontiguous) {@link LongWritable} key, and values are stored as {@link RecordWritable} instances.
 * <p>
 * Use {@link #restoreSequenceFile(String, JavaSparkContext)} to restore values saved with this method.
 *
 * @param path           Path to save the sequence file
 * @param rdd            RDD to save
 * @param maxOutputFiles Nullable. If non-null: first coalesce the RDD to the specified size (number of partitions)
 *                       to limit the maximum number of output sequence files
 * @see #saveSequenceFileSequences(String, JavaRDD)
 * @see #saveMapFile(String, JavaRDD)
 */
public static void saveSequenceFile(String path, JavaRDD<List<Writable>> rdd, @Nullable Integer maxOutputFiles) {
    path = FilenameUtils.normalize(path, true);
    if (maxOutputFiles != null) {
        rdd = rdd.coalesce(maxOutputFiles);
    }
    //Note: Long values are unique + NOT contiguous; more efficient than zipWithIndex
    JavaPairRDD<List<Writable>, Long> dataIndexPairs = rdd.zipWithUniqueId();
    JavaPairRDD<LongWritable, RecordWritable> keyedByIndex =
            dataIndexPairs.mapToPair(new RecordSavePrepPairFunction());
    keyedByIndex.saveAsNewAPIHadoopFile(path, LongWritable.class, RecordWritable.class, SequenceFileOutputFormat.class);
}
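// A companion sketch for the single-record saveSequenceFile variant above, under the same assumptions
// (existing JavaSparkContext `sc`, DataVec writables); data and path are illustrative. Passing
// maxOutputFiles = 1 coalesces the RDD to a single partition, so the output directory holds one part file
// instead of one per input partition.
List<List<Writable>> records = Arrays.asList(
        Arrays.<Writable>asList(new IntWritable(1), new Text("a")),
        Arrays.<Writable>asList(new IntWritable(2), new Text("b")),
        Arrays.<Writable>asList(new IntWritable(3), new Text("c")));
JavaRDD<List<Writable>> recordRdd = sc.parallelize(records);
saveSequenceFile("/tmp/records-seqfile", recordRdd, 1);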
/**
 * Save a {@code JavaRDD<List<List<Writable>>>} to a Hadoop {@link org.apache.hadoop.io.MapFile}. Each record is
 * given a <i>unique and contiguous</i> {@link LongWritable} key, and values are stored as
 * {@link SequenceRecordWritable} instances.<br>
 * <b>Note</b>: If contiguous keys are not required, using a sequence file instead is preferable from a performance
 * point of view. Contiguous keys are often only required for non-Spark use cases, such as with
 * {@link org.datavec.hadoop.records.reader.mapfile.MapFileSequenceRecordReader}<br>
 * <p>
 * Use {@link #restoreMapFileSequences(String, JavaSparkContext)} to restore values saved with this method.
 *
 * @param path           Path to save the MapFile
 * @param rdd            RDD to save
 * @param c              Configuration object, used to customise options for the map file
 * @param maxOutputFiles Nullable. If non-null: first coalesce the RDD to the specified size (number of partitions)
 *                       to limit the maximum number of output map files
 * @see #saveMapFile(String, JavaRDD)
 * @see #saveSequenceFile(String, JavaRDD)
 */
public static void saveMapFileSequences(String path, JavaRDD<List<List<Writable>>> rdd, Configuration c,
                                         @Nullable Integer maxOutputFiles) {
    path = FilenameUtils.normalize(path, true);
    if (maxOutputFiles != null) {
        rdd = rdd.coalesce(maxOutputFiles);
    }
    JavaPairRDD<List<List<Writable>>, Long> dataIndexPairs = rdd.zipWithIndex();
    JavaPairRDD<LongWritable, SequenceRecordWritable> keyedByIndex =
            dataIndexPairs.mapToPair(new SequenceRecordSavePrepPairFunction());
    keyedByIndex.saveAsNewAPIHadoopFile(path, LongWritable.class, SequenceRecordWritable.class, MapFileOutputFormat.class, c);
}
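// A minimal sketch for saveMapFileSequences under the same assumptions as above (existing JavaSparkContext
// `sc`, DataVec writables); data, path and the tuning value are illustrative. The Hadoop Configuration can
// carry MapFile options such as io.map.index.interval; zipWithIndex produces the contiguous keys that the
// map-file sequence record reader mentioned in the Javadoc relies on.
org.apache.hadoop.conf.Configuration mapFileConf = new org.apache.hadoop.conf.Configuration();
mapFileConf.setInt("io.map.index.interval", 1);   // index every entry (illustrative tuning only)
JavaRDD<List<List<Writable>>> sequenceRdd = sc.parallelize(Collections.singletonList(
        Arrays.asList(
                Arrays.<Writable>asList(new IntWritable(0), new Text("first step")),
                Arrays.<Writable>asList(new IntWritable(1), new Text("second step")))));
saveMapFileSequences("/tmp/sequences-mapfile", sequenceRdd, mapFileConf, null);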
/**
 * Save a {@code JavaRDD<List<Writable>>} to a Hadoop {@link org.apache.hadoop.io.MapFile}. Each record is
 * given a <i>unique and contiguous</i> {@link LongWritable} key, and values are stored as
 * {@link RecordWritable} instances.<br>
 * <b>Note</b>: If contiguous keys are not required, using a sequence file instead is preferable from a performance
 * point of view. Contiguous keys are often only required for non-Spark use cases, such as with
 * {@link org.datavec.hadoop.records.reader.mapfile.MapFileRecordReader}
 * <p>
 * Use {@link #restoreMapFile(String, JavaSparkContext)} to restore values saved with this method.
 *
 * @param path           Path to save the MapFile
 * @param rdd            RDD to save
 * @param c              Configuration object, used to customise options for the map file
 * @param maxOutputFiles Nullable. If non-null: first coalesce the RDD to the specified size (number of partitions)
 *                       to limit the maximum number of output map files
 * @see #saveMapFileSequences(String, JavaRDD)
 * @see #saveSequenceFile(String, JavaRDD)
 */
public static void saveMapFile(String path, JavaRDD<List<Writable>> rdd, Configuration c,
                               @Nullable Integer maxOutputFiles) {
    path = FilenameUtils.normalize(path, true);
    if (maxOutputFiles != null) {
        rdd = rdd.coalesce(maxOutputFiles);
    }
    //Note: Long values are unique + contiguous, but requires a count
    JavaPairRDD<List<Writable>, Long> dataIndexPairs = rdd.zipWithIndex();
    JavaPairRDD<LongWritable, RecordWritable> keyedByIndex =
            dataIndexPairs.mapToPair(new RecordSavePrepPairFunction());
    keyedByIndex.saveAsNewAPIHadoopFile(path, LongWritable.class, RecordWritable.class, MapFileOutputFormat.class, c);
}
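// A minimal sketch for saveMapFile under the same assumptions (existing JavaSparkContext `sc`, DataVec
// writables), additionally assuming the companion restoreMapFile(String, JavaSparkContext) referenced in
// the Javadoc above exists; data, path and configuration are illustrative. zipWithIndex assigns the
// contiguous keys that the MapFileRecordReader mentioned in the Javadoc relies on.
JavaRDD<List<Writable>> mapFileRecords = sc.parallelize(Arrays.asList(
        Arrays.<Writable>asList(new IntWritable(1), new Text("a")),
        Arrays.<Writable>asList(new IntWritable(2), new Text("b"))));
saveMapFile("/tmp/records-mapfile", mapFileRecords, new org.apache.hadoop.conf.Configuration(), null);
restoreMapFile("/tmp/records-mapfile", sc);   // read back the saved records, keyed by their contiguous index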