@Override
public <K, V> JavaPairRDD<K, V> readMemoryRDD(final Configuration configuration, final String memoryKey,
                                              final JavaSparkContext sparkContext) {
    return sparkContext.newAPIHadoopRDD(ConfUtil.makeHadoopConfiguration(configuration),
            SequenceFileInputFormat.class,
            ObjectWritable.class,
            ObjectWritable.class)
            .mapToPair(tuple -> new Tuple2<>(
                    (K) ((Tuple2<ObjectWritable, ObjectWritable>) tuple)._1().get(),
                    (V) ((Tuple2<ObjectWritable, ObjectWritable>) tuple)._2().get()));
}
@Override
public JavaPairRDD<Object, VertexWritable> readGraphRDD(final Configuration configuration, final JavaSparkContext sparkContext) {
    final org.apache.hadoop.conf.Configuration hadoopConfiguration = ConfUtil.makeHadoopConfiguration(configuration);
    return sparkContext.newAPIHadoopRDD(hadoopConfiguration,
            (Class<InputFormat<NullWritable, VertexWritable>>) hadoopConfiguration.getClass(Constants.GREMLIN_HADOOP_GRAPH_READER, InputFormat.class),
            NullWritable.class,
            VertexWritable.class)
            .mapToPair(tuple -> new Tuple2<>(tuple._2().get().id(), new VertexWritable(tuple._2().get())));
}
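For context, a caller of this TinkerPop reader supplies an Apache Commons Configuration naming the Hadoop InputFormat and input location. The sketch below is a minimal, hypothetical driver: it assumes the method above belongs to an InputRDD implementation such as TinkerPop's InputFormatRDD, that the input is Gryo-encoded, and that the path is a placeholder.

// Minimal usage sketch (assumptions noted above; imports and error handling omitted).
final org.apache.commons.configuration.BaseConfiguration readerConf = new org.apache.commons.configuration.BaseConfiguration();
readerConf.setProperty(Constants.GREMLIN_HADOOP_GRAPH_READER, GryoInputFormat.class.getCanonicalName());
readerConf.setProperty(Constants.GREMLIN_HADOOP_INPUT_LOCATION, "/tmp/tinkerpop/graph-input"); // hypothetical path

final JavaSparkContext sc = new JavaSparkContext(new SparkConf().setAppName("read-graph-rdd").setMaster("local[*]"));
final JavaPairRDD<Object, VertexWritable> graphRDD = new InputFormatRDD().readGraphRDD(readerConf, sc);
System.out.println("vertices: " + graphRDD.count());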
public static JavaPairRDD<Object, BSONObject> mongoRdd(JavaSparkContext sparkContext, String mongoHost,
                                                       long mongoPort, String db, String collection) {
    Configuration mongodbConfig = new Configuration();
    mongodbConfig.set("mongo.job.input.format", "com.mongodb.hadoop.MongoInputFormat");
    mongodbConfig.set("mongo.input.uri",
            String.format("mongodb://%s:%d/%s.%s", mongoHost, mongoPort, db, collection));
    return sparkContext.newAPIHadoopRDD(mongodbConfig, MongoInputFormat.class, Object.class, BSONObject.class);
}
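A short usage sketch for the helper above; the host, database and collection names are placeholders.

// Hypothetical invocation of the mongoRdd(...) helper defined above.
JavaSparkContext sc = new JavaSparkContext(new SparkConf().setAppName("mongo-example").setMaster("local[*]"));
JavaPairRDD<Object, BSONObject> docs = mongoRdd(sc, "localhost", 27017, "testdb", "events");
System.out.println("documents read: " + docs.count());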
/**
 * Creates a JavaPairRDD over a Hive table via HCatalog.
 *
 * @param javaSparkContext the Java Spark context
 * @param conf             the Hadoop configuration
 * @param db               the database name
 * @param table            the table name
 * @param partitionFilter  the partition filter
 * @return the JavaPairRDD of HCatalog records
 * @throws IOException if the HCatalog input cannot be configured
 */
public static JavaPairRDD<WritableComparable, HCatRecord> createHiveTableRDD(JavaSparkContext javaSparkContext,
        Configuration conf, String db, String table, String partitionFilter) throws IOException {
    HCatInputFormat.setInput(conf, db, table, partitionFilter);
    return javaSparkContext.newAPIHadoopRDD(conf,
            HCatInputFormat.class,      // input format
            WritableComparable.class,   // input key class
            HCatRecord.class);          // input value class
}
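A hypothetical usage of createHiveTableRDD(...); the database and table names are placeholders, and a null partition filter is assumed to select all partitions.

// Sketch only: reads every row of default.my_table via HCatalog and counts them.
JavaSparkContext jsc = new JavaSparkContext(new SparkConf().setAppName("hcat-example"));
Configuration hcatConf = new Configuration();
JavaPairRDD<WritableComparable, HCatRecord> records =
        createHiveTableRDD(jsc, hcatConf, "default", "my_table", null);
System.out.println("rows read: " + records.count());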
/**
 * Restore a {@code JavaPairRDD<Long,List<Writable>>} previously saved with {@link #saveMapFile(String, JavaRDD)}<br>
 * Note that if the keys are not required, simply use {@code restoreMapFile(...).values()}
 *
 * @param path Path of the MapFile
 * @param sc   Spark context
 * @return The restored RDD, with the unique record indices as keys
 */
public static JavaPairRDD<Long, List<Writable>> restoreMapFile(String path, JavaSparkContext sc) {
    Configuration c = new Configuration();
    c.set(FileInputFormat.INPUT_DIR, FilenameUtils.normalize(path, true));

    JavaPairRDD<LongWritable, RecordWritable> pairRDD = sc.newAPIHadoopRDD(c, SequenceFileInputFormat.class,
            LongWritable.class, RecordWritable.class);

    return pairRDD.mapToPair(new RecordLoadPairFunction());
}
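If only the records are needed, the keys can be dropped immediately, as the javadoc above suggests; the path below is a placeholder and sc is an existing JavaSparkContext.

// Sketch: restore a previously saved MapFile and discard the Long indices.
JavaRDD<List<Writable>> records = restoreMapFile("/data/myMapFile", sc).values();
long n = records.count();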
/**
 * Restore a {@code JavaPairRDD<Long,List<List<Writable>>>} previously saved with {@link #saveMapFileSequences(String, JavaRDD)}<br>
 * Note that if the keys are not required, simply use {@code restoreMapFileSequences(...).values()}
 *
 * @param path Path of the MapFile
 * @param sc   Spark context
 * @return The restored RDD, with the unique sequence indices as keys
 */
public static JavaPairRDD<Long, List<List<Writable>>> restoreMapFileSequences(String path, JavaSparkContext sc) {
    Configuration c = new Configuration();
    c.set(FileInputFormat.INPUT_DIR, FilenameUtils.normalize(path, true));

    JavaPairRDD<LongWritable, SequenceRecordWritable> pairRDD = sc.newAPIHadoopRDD(c, SequenceFileInputFormat.class,
            LongWritable.class, SequenceRecordWritable.class);

    return pairRDD.mapToPair(new SequenceRecordLoadPairFunction());
}
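The sequence variant works the same way but yields one List<List<Writable>> (a sequence of timesteps) per key; the path below is a placeholder.

// Sketch: restore sequence records with their original indices preserved.
JavaPairRDD<Long, List<List<Writable>>> sequences = restoreMapFileSequences("/data/mySequenceMapFile", sc);
sequences.foreach(pair ->
        System.out.println("index " + pair._1() + " has " + pair._2().size() + " steps"));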
public static void usingNewAPIHadoopRDD(String[] args) {
    String loaderJsonFile = args[0];
    String queryJsonFile = args[1];
    String hostfile = args[2];

    SparkConf conf = new SparkConf();
    conf.setAppName("GenomicsDBTest using newAPIHadoopRDD");

    JavaSparkContext sc = new JavaSparkContext(conf);
    Configuration hadoopConf = sc.hadoopConfiguration();
    hadoopConf.set(GenomicsDBConfiguration.LOADERJSON, loaderJsonFile);
    hadoopConf.set(GenomicsDBConfiguration.QUERYJSON, queryJsonFile);
    hadoopConf.set(GenomicsDBConfiguration.MPIHOSTFILE, hostfile);

    JavaPairRDD variants;
    Class gformatClazz = GenomicsDBInputFormat.class;
    variants = sc.newAPIHadoopRDD(hadoopConf, gformatClazz, String.class, VariantContext.class);
    System.out.println("Number of variants " + variants.count());

    List variantList = variants.collect();
    for (Object variantObj : variantList) {
        System.out.println(variantObj);
    }
}
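A minimal entry point that delegates to the method above; the three arguments are expected in the order loader JSON, query JSON and MPI host file. The file names in the comment are placeholders.

// Hypothetical driver, e.g. invoked as: spark-submit ... app.jar loader.json query.json hostfile
public static void main(String[] args) {
    usingNewAPIHadoopRDD(args);
}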
/**
 * Initializes a {@link JavaPairRDD} from the given Spark context, Hadoop
 * configuration and data store.
 *
 * @param sparkContext Spark context
 * @param conf         Hadoop configuration
 * @param dataStore    Data store
 * @return the initialized RDD
 */
public JavaPairRDD<K, V> initialize(JavaSparkContext sparkContext, Configuration conf, DataStore<K, V> dataStore) {
    GoraMapReduceUtils.setIOSerializations(conf, true);

    try {
        IOUtils.storeToConf(dataStore.newQuery(), conf, GoraInputFormat.QUERY_KEY);
    } catch (IOException ioex) {
        throw new RuntimeException(ioex.getMessage());
    }

    return sparkContext.newAPIHadoopRDD(conf, GoraInputFormat.class, clazzK, clazzV);
}
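A rough usage sketch, assuming this method lives on Gora's GoraSparkEngine<K, V> parameterised with the key and persistent classes. The Employee bean and the store construction are hypothetical, and exception handling is omitted.

// Sketch only: Employee is a hypothetical Gora persistent bean.
JavaSparkContext sparkContext = new JavaSparkContext(new SparkConf().setAppName("gora-spark-example"));
Configuration hadoopConf = sparkContext.hadoopConfiguration();
GoraSparkEngine<Long, Employee> engine = new GoraSparkEngine<>(Long.class, Employee.class);
DataStore<Long, Employee> store = DataStoreFactory.getDataStore(Long.class, Employee.class, hadoopConf);
JavaPairRDD<Long, Employee> employees = engine.initialize(sparkContext, hadoopConf, store);
System.out.println("records: " + employees.count());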
@Override
public JavaRDDLike<?, ?> getJavaRDDLike(SparkRuntime runtime) {
    try {
        Job job = new Job(runtime.getConfiguration());
        source.configureSource(job, 0); // TODO: a custom input format for crunch-spark
        Converter converter = source.getConverter();
        JavaPairRDD<?, ?> input = runtime.getSparkContext().newAPIHadoopRDD(
                job.getConfiguration(),
                CrunchInputFormat.class,
                converter.getKeyClass(),
                converter.getValueClass());
        input.rdd().setName(getName());
        MapFn mapFn = converter.applyPTypeTransforms() ? source.getType().getInputMapFn() : IdentityFn.getInstance();
        return input
                .map(new InputConverterFunction(source.getConverter()))
                .mapToPair(new Tuple2MapFunction(mapFn, runtime.getRuntimeContext()));
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
}
public JavaRDDLike<?, ?> getJavaRDDLike(SparkRuntime runtime) {
    try {
        Job job = new Job(runtime.getConfiguration());
        FileInputFormat.addInputPaths(job, "/tmp"); // placeholder
        source.configureSource(job, 0);
        Converter converter = source.getConverter();
        JavaPairRDD<?, ?> input = runtime.getSparkContext().newAPIHadoopRDD(
                job.getConfiguration(),
                CrunchInputFormat.class,
                converter.getKeyClass(),
                converter.getValueClass());
        input.rdd().setName(getName());
        MapFn mapFn = converter.applyPTypeTransforms() ? source.getType().getInputMapFn() : IdentityFn.getInstance();
        return input
                .map(new InputConverterFunction(source.getConverter()))
                .map(new MapFunction(mapFn, runtime.getRuntimeContext()));
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
}
private JavaRDD<Element> doOperation(final GetJavaRDDOfElementsInRanges operation, final Context context,
                                     final AccumuloStore accumuloStore) throws OperationException {
    final JavaSparkContext sparkContext = JavaSparkContext.fromSparkContext(
            SparkContextUtil.getSparkSession(context, accumuloStore.getProperties()).sparkContext());
    final Configuration conf = getConfiguration(operation);
    // Use batch scan option when performing seeded operation
    InputConfigurator.setBatchScan(AccumuloInputFormat.class, conf, true);
    addIterators(accumuloStore, conf, context.getUser(), operation);
    addRangesFromPairs(accumuloStore, conf, operation);
    final JavaPairRDD<Element, NullWritable> pairRDD = sparkContext.newAPIHadoopRDD(conf,
            ElementInputFormat.class, Element.class, NullWritable.class);
    return pairRDD.map(new FirstElement());
}
private JavaRDD<Element> doOperation(final GetJavaRDDOfElements operation, final Context context,
                                     final AccumuloStore accumuloStore) throws OperationException {
    final JavaSparkContext sparkContext = JavaSparkContext.fromSparkContext(
            SparkContextUtil.getSparkSession(context, accumuloStore.getProperties()).sparkContext());
    final Configuration conf = getConfiguration(operation);
    // Use batch scan option when performing seeded operation
    InputConfigurator.setBatchScan(AccumuloInputFormat.class, conf, true);
    addIterators(accumuloStore, conf, context.getUser(), operation);
    addRanges(accumuloStore, conf, operation);
    final JavaPairRDD<Element, NullWritable> pairRDD = sparkContext.newAPIHadoopRDD(conf,
            ElementInputFormat.class, Element.class, NullWritable.class);
    return pairRDD.map(new FirstElement());
}