@Override
public void call(JavaPairRDD<K, M> rdd, Time time) throws IOException {
    if (rdd.isEmpty()) {
        log.info("RDD was empty, not saving to HDFS");
    } else {
        String file = prefix + "-" + time.milliseconds() + "." + suffix;
        Path path = new Path(file);
        FileSystem fs = FileSystem.get(path.toUri(), hadoopConf);
        if (fs.exists(path)) {
            log.warn("Saved data already existed, possibly from a failed job. Deleting {}", path);
            fs.delete(path, true);
        }
        log.info("Saving RDD to HDFS at {}", file);
        rdd.mapToPair(new ValueToWritableFunction<>(keyClass, messageClass, keyWritableClass, messageWritableClass))
           .saveAsNewAPIHadoopFile(file,
                                   keyWritableClass,
                                   messageWritableClass,
                                   SequenceFileOutputFormat.class,
                                   hadoopConf);
    }
}
}
@SuppressWarnings("unchecked")
@Test
public void writeWithNewAPIHadoopFile() {
    String outputDir = new File(tempDir, "output").getAbsolutePath();
    List<Tuple2<Integer, String>> pairs = Arrays.asList(
            new Tuple2<>(1, "a"),
            new Tuple2<>(2, "aa"),
            new Tuple2<>(3, "aaa"));
    JavaPairRDD<Integer, String> rdd = sc.parallelizePairs(pairs);

    rdd.mapToPair(pair -> new Tuple2<>(new IntWritable(pair._1()), new Text(pair._2())))
       .saveAsNewAPIHadoopFile(outputDir, IntWritable.class, Text.class,
               org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat.class);

    JavaPairRDD<IntWritable, Text> output = sc.sequenceFile(outputDir, IntWritable.class, Text.class);
    assertEquals(pairs.toString(), output.map(Tuple2::toString).collect().toString());
}
metaUrl, segmentId, StringUtil.splitByComma(segmentIds), statOutputPath, tblColRefs, sConf));
colToDictPathRDD.coalesce(1, false)
        .saveAsNewAPIHadoopFile(dictOutputPath, Text.class, Text.class, SequenceFileOutputFormat.class);
@Override
public void writeGraphRDD(final Configuration configuration, final JavaPairRDD<Object, VertexWritable> graphRDD) {
    final org.apache.hadoop.conf.Configuration hadoopConfiguration = ConfUtil.makeHadoopConfiguration(configuration);
    final String outputLocation = hadoopConfiguration.get(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION);
    if (null != outputLocation) {
        // map back to a <NullWritable, VertexWritable> stream for output
        graphRDD.mapToPair(tuple -> new Tuple2<>(NullWritable.get(), tuple._2()))
                .saveAsNewAPIHadoopFile(Constants.getGraphLocation(outputLocation),
                        NullWritable.class,
                        VertexWritable.class,
                        (Class<OutputFormat<NullWritable, VertexWritable>>) hadoopConfiguration.getClass(
                                Constants.GREMLIN_HADOOP_GRAPH_WRITER, OutputFormat.class),
                        hadoopConfiguration);
    }
}
@Override
public <K, V> Iterator<KeyValue<K, V>> writeMemoryRDD(final Configuration configuration, final String memoryKey, JavaPairRDD<K, V> memoryRDD) {
    final org.apache.hadoop.conf.Configuration hadoopConfiguration = ConfUtil.makeHadoopConfiguration(configuration);
    final String outputLocation = hadoopConfiguration.get(Constants.GREMLIN_HADOOP_OUTPUT_LOCATION);
    if (null != outputLocation) {
        // map back to a Hadoop stream for output
        memoryRDD.mapToPair(keyValue -> new Tuple2<>(new ObjectWritable<>(keyValue._1()), new ObjectWritable<>(keyValue._2())))
                 .saveAsNewAPIHadoopFile(Constants.getMemoryLocation(outputLocation, memoryKey),
                         ObjectWritable.class,
                         ObjectWritable.class,
                         SequenceFileOutputFormat.class,
                         hadoopConfiguration);
        try {
            return (Iterator) new ObjectWritableIterator(hadoopConfiguration,
                    new Path(Constants.getMemoryLocation(outputLocation, memoryKey)));
        } catch (final IOException e) {
            throw new IllegalStateException(e.getMessage(), e);
        }
    }
    return Collections.emptyIterator();
}
}
m.getTableName()));
javaRDDPair.saveAsNewAPIHadoopFile("file://dummy", Object.class, BSONObject.class, MongoOutputFormat.class, outputConfig);
return true;
/**
 * Saves the keys from the given javaPairRDD as Avro data with the given schema in a directory or file defined by path.
 */
public <K, V> void saveJavaPairRDDKeys(JavaPairRDD<K, V> javaPairRDD, Schema avroSchema, String path) {
    Preconditions.checkNotNull(javaPairRDD);
    checkSchemaAndPath(avroSchema, path);
    Job job = getJob(avroSchema);
    javaPairRDD.saveAsNewAPIHadoopFile(path, AvroKey.class, NullWritable.class, AvroKeyOutputFormat.class, job.getConfiguration());
}
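// A minimal usage sketch for saveJavaPairRDDKeys above: a hedged illustration only, assuming an existing
// JavaSparkContext `sc` and an instance of the enclosing helper class (called `avroSaver` here, a hypothetical
// name). The schema, record fields and output path are likewise illustrative. The pair keys must be AvroKey
// instances whose datum matches the supplied schema, because AvroKeyOutputFormat writes only the keys.
String schemaJson = "{\"type\":\"record\",\"name\":\"User\",\"fields\":[{\"name\":\"name\",\"type\":\"string\"}]}";
Schema schema = new Schema.Parser().parse(schemaJson);
JavaPairRDD<AvroKey<GenericRecord>, NullWritable> keys =
        sc.parallelize(Arrays.asList("alice", "bob"))
          .mapToPair(name -> {
              // build each record on the executor, parsing the schema from the (serializable) JSON string
              GenericRecord user = new GenericData.Record(new Schema.Parser().parse(schemaJson));
              user.put("name", name);
              return new Tuple2<>(new AvroKey<>(user), NullWritable.get());
          });
avroSaver.saveJavaPairRDDKeys(keys, schema, "/tmp/users-avro");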
@Override
protected void prepareKeyValues(final ImportKeyValueJavaPairRDDToAccumulo operation, final AccumuloKeyRangePartitioner partitioner) throws OperationException {
    final JavaPairRDD<Key, Value> rdd = operation.getInput().repartitionAndSortWithinPartitions(partitioner);
    rdd.saveAsNewAPIHadoopFile(operation.getOutputPath(),
            Key.class,
            Value.class,
            AccumuloFileOutputFormat.class,
            getConfiguration(operation));
}
/**
 * Save a {@code JavaRDD<List<List<Writable>>>} to a Hadoop {@link org.apache.hadoop.io.SequenceFile}. Each record
 * is given a unique (but noncontiguous) {@link LongWritable} key, and values are stored as
 * {@link SequenceRecordWritable} instances.
 * <p>
 * Use {@link #restoreSequenceFileSequences(String, JavaSparkContext)} to restore values saved with this method.
 *
 * @param path           Path to save the sequence file
 * @param rdd            RDD to save
 * @param maxOutputFiles Nullable. If non-null: first coalesce the RDD to the specified size (number of partitions)
 *                       to limit the maximum number of output sequence files
 * @see #saveSequenceFile(String, JavaRDD)
 * @see #saveMapFileSequences(String, JavaRDD)
 */
public static void saveSequenceFileSequences(String path, JavaRDD<List<List<Writable>>> rdd, @Nullable Integer maxOutputFiles) {
    path = FilenameUtils.normalize(path, true);
    if (maxOutputFiles != null) {
        rdd = rdd.coalesce(maxOutputFiles);
    }
    //Note: Long values are unique + NOT contiguous; more efficient than zipWithIndex
    JavaPairRDD<List<List<Writable>>, Long> dataIndexPairs = rdd.zipWithUniqueId();
    JavaPairRDD<LongWritable, SequenceRecordWritable> keyedByIndex =
            dataIndexPairs.mapToPair(new SequenceRecordSavePrepPairFunction());
    keyedByIndex.saveAsNewAPIHadoopFile(path, LongWritable.class, SequenceRecordWritable.class, SequenceFileOutputFormat.class);
}
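// A minimal round-trip sketch for saveSequenceFileSequences, assuming an existing JavaSparkContext `sc`,
// that Writable/IntWritable/Text here are the DataVec writables (org.datavec.api.writable.*), and that the
// companion restoreSequenceFileSequences(String, JavaSparkContext) referenced in the Javadoc returns the
// sequences as a JavaRDD<List<List<Writable>>>. Data and path are illustrative only.
List<List<Writable>> sequence = Arrays.asList(
        Arrays.<Writable>asList(new IntWritable(0), new Text("first step")),
        Arrays.<Writable>asList(new IntWritable(1), new Text("second step")));
JavaRDD<List<List<Writable>>> sequences = sc.parallelize(Collections.singletonList(sequence));
saveSequenceFileSequences("/tmp/sequences-seqfile", sequences, null);    // null: keep the existing partitioning
JavaRDD<List<List<Writable>>> restored = restoreSequenceFileSequences("/tmp/sequences-seqfile", sc);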
/**
 * Save a {@code JavaRDD<List<Writable>>} to a Hadoop {@link org.apache.hadoop.io.SequenceFile}. Each record is
 * given a unique (but noncontiguous) {@link LongWritable} key, and values are stored as {@link RecordWritable} instances.
 * <p>
 * Use {@link #restoreSequenceFile(String, JavaSparkContext)} to restore values saved with this method.
 *
 * @param path           Path to save the sequence file
 * @param rdd            RDD to save
 * @param maxOutputFiles Nullable. If non-null: first coalesce the RDD to the specified size (number of partitions)
 *                       to limit the maximum number of output sequence files
 * @see #saveSequenceFileSequences(String, JavaRDD)
 * @see #saveMapFile(String, JavaRDD)
 */
public static void saveSequenceFile(String path, JavaRDD<List<Writable>> rdd, @Nullable Integer maxOutputFiles) {
    path = FilenameUtils.normalize(path, true);
    if (maxOutputFiles != null) {
        rdd = rdd.coalesce(maxOutputFiles);
    }
    //Note: Long values are unique + NOT contiguous; more efficient than zipWithIndex
    JavaPairRDD<List<Writable>, Long> dataIndexPairs = rdd.zipWithUniqueId();
    JavaPairRDD<LongWritable, RecordWritable> keyedByIndex =
            dataIndexPairs.mapToPair(new RecordSavePrepPairFunction());
    keyedByIndex.saveAsNewAPIHadoopFile(path, LongWritable.class, RecordWritable.class, SequenceFileOutputFormat.class);
}
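// A companion sketch for the single-record saveSequenceFile variant above, under the same assumptions
// (existing JavaSparkContext `sc`, DataVec writables); data and path are illustrative. Passing
// maxOutputFiles = 1 coalesces the RDD to a single partition, so the output directory holds one part file
// instead of one per input partition.
List<List<Writable>> records = Arrays.asList(
        Arrays.<Writable>asList(new IntWritable(1), new Text("a")),
        Arrays.<Writable>asList(new IntWritable(2), new Text("b")),
        Arrays.<Writable>asList(new IntWritable(3), new Text("c")));
JavaRDD<List<Writable>> recordRdd = sc.parallelize(records);
saveSequenceFile("/tmp/records-seqfile", recordRdd, 1);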
/**
 * Save a {@code JavaRDD<List<List<Writable>>>} to a Hadoop {@link org.apache.hadoop.io.MapFile}. Each record is
 * given a <i>unique and contiguous</i> {@link LongWritable} key, and values are stored as
 * {@link SequenceRecordWritable} instances.<br>
 * <b>Note</b>: If contiguous keys are not required, using a sequence file instead is preferable from a performance
 * point of view. Contiguous keys are often only required for non-Spark use cases, such as with
 * {@link org.datavec.hadoop.records.reader.mapfile.MapFileSequenceRecordReader}<br>
 * <p>
 * Use {@link #restoreMapFileSequences(String, JavaSparkContext)} to restore values saved with this method.
 *
 * @param path           Path to save the MapFile
 * @param rdd            RDD to save
 * @param c              Configuration object, used to customise options for the map file
 * @param maxOutputFiles Nullable. If non-null: first coalesce the RDD to the specified size (number of partitions)
 *                       to limit the maximum number of output map files
 * @see #saveMapFile(String, JavaRDD)
 * @see #saveSequenceFile(String, JavaRDD)
 */
public static void saveMapFileSequences(String path, JavaRDD<List<List<Writable>>> rdd, Configuration c,
                                         @Nullable Integer maxOutputFiles) {
    path = FilenameUtils.normalize(path, true);
    if (maxOutputFiles != null) {
        rdd = rdd.coalesce(maxOutputFiles);
    }
    JavaPairRDD<List<List<Writable>>, Long> dataIndexPairs = rdd.zipWithIndex();
    JavaPairRDD<LongWritable, SequenceRecordWritable> keyedByIndex =
            dataIndexPairs.mapToPair(new SequenceRecordSavePrepPairFunction());
    keyedByIndex.saveAsNewAPIHadoopFile(path, LongWritable.class, SequenceRecordWritable.class, MapFileOutputFormat.class, c);
}
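// A minimal sketch for saveMapFileSequences under the same assumptions as above (existing JavaSparkContext
// `sc`, DataVec writables); data, path and the tuning value are illustrative. The Hadoop Configuration can
// carry MapFile options such as io.map.index.interval; zipWithIndex produces the contiguous keys that the
// map-file sequence record reader mentioned in the Javadoc relies on.
org.apache.hadoop.conf.Configuration mapFileConf = new org.apache.hadoop.conf.Configuration();
mapFileConf.setInt("io.map.index.interval", 1);   // index every entry (illustrative tuning only)
JavaRDD<List<List<Writable>>> sequenceRdd = sc.parallelize(Collections.singletonList(
        Arrays.asList(
                Arrays.<Writable>asList(new IntWritable(0), new Text("first step")),
                Arrays.<Writable>asList(new IntWritable(1), new Text("second step")))));
saveMapFileSequences("/tmp/sequences-mapfile", sequenceRdd, mapFileConf, null);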
/**
 * Save a {@code JavaRDD<List<Writable>>} to a Hadoop {@link org.apache.hadoop.io.MapFile}. Each record is
 * given a <i>unique and contiguous</i> {@link LongWritable} key, and values are stored as
 * {@link RecordWritable} instances.<br>
 * <b>Note</b>: If contiguous keys are not required, using a sequence file instead is preferable from a performance
 * point of view. Contiguous keys are often only required for non-Spark use cases, such as with
 * {@link org.datavec.hadoop.records.reader.mapfile.MapFileRecordReader}
 * <p>
 * Use {@link #restoreMapFile(String, JavaSparkContext)} to restore values saved with this method.
 *
 * @param path           Path to save the MapFile
 * @param rdd            RDD to save
 * @param c              Configuration object, used to customise options for the map file
 * @param maxOutputFiles Nullable. If non-null: first coalesce the RDD to the specified size (number of partitions)
 *                       to limit the maximum number of output map files
 * @see #saveMapFileSequences(String, JavaRDD)
 * @see #saveSequenceFile(String, JavaRDD)
 */
public static void saveMapFile(String path, JavaRDD<List<Writable>> rdd, Configuration c,
                               @Nullable Integer maxOutputFiles) {
    path = FilenameUtils.normalize(path, true);
    if (maxOutputFiles != null) {
        rdd = rdd.coalesce(maxOutputFiles);
    }
    //Note: Long values are unique + contiguous, but requires a count
    JavaPairRDD<List<Writable>, Long> dataIndexPairs = rdd.zipWithIndex();
    JavaPairRDD<LongWritable, RecordWritable> keyedByIndex =
            dataIndexPairs.mapToPair(new RecordSavePrepPairFunction());
    keyedByIndex.saveAsNewAPIHadoopFile(path, LongWritable.class, RecordWritable.class, MapFileOutputFormat.class, c);
}
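// A minimal sketch for saveMapFile under the same assumptions (existing JavaSparkContext `sc`, DataVec
// writables), additionally assuming the companion restoreMapFile(String, JavaSparkContext) referenced in
// the Javadoc above exists; data, path and configuration are illustrative. zipWithIndex assigns the
// contiguous keys that the MapFileRecordReader mentioned in the Javadoc relies on.
JavaRDD<List<Writable>> mapFileRecords = sc.parallelize(Arrays.asList(
        Arrays.<Writable>asList(new IntWritable(1), new Text("a")),
        Arrays.<Writable>asList(new IntWritable(2), new Text("b"))));
saveMapFile("/tmp/records-mapfile", mapFileRecords, new org.apache.hadoop.conf.Configuration(), null);
restoreMapFile("/tmp/records-mapfile", sc);   // read back the saved records, keyed by their contiguous index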