@Override
public <K, V> JavaPairRDD<K, V> readMemoryRDD(final Configuration configuration, final String memoryKey,
                                              final JavaSparkContext sparkContext) {
    return sparkContext.newAPIHadoopRDD(ConfUtil.makeHadoopConfiguration(configuration),
            SequenceFileInputFormat.class,
            ObjectWritable.class,
            ObjectWritable.class)
            .mapToPair(tuple -> new Tuple2<>(
                    (K) ((Tuple2<ObjectWritable, ObjectWritable>) tuple)._1().get(),
                    (V) ((Tuple2<ObjectWritable, ObjectWritable>) tuple)._2().get()));
}
@Override
public JavaPairRDD<Object, VertexWritable> readGraphRDD(final Configuration configuration, final JavaSparkContext sparkContext) {
    final org.apache.hadoop.conf.Configuration hadoopConfiguration = ConfUtil.makeHadoopConfiguration(configuration);
    return sparkContext.newAPIHadoopRDD(hadoopConfiguration,
            (Class<InputFormat<NullWritable, VertexWritable>>) hadoopConfiguration.getClass(Constants.GREMLIN_HADOOP_GRAPH_READER, InputFormat.class),
            NullWritable.class,
            VertexWritable.class)
            .mapToPair(tuple -> new Tuple2<>(tuple._2().get().id(), new VertexWritable(tuple._2().get())));
}
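For context, a caller of this TinkerPop reader supplies an Apache Commons Configuration naming the Hadoop InputFormat and input location. The sketch below is a minimal, hypothetical driver: it assumes the method above belongs to an InputRDD implementation such as TinkerPop's InputFormatRDD, that the input is Gryo-encoded, and that the path is a placeholder.

// Minimal usage sketch (assumptions noted above; imports and error handling omitted).
final org.apache.commons.configuration.BaseConfiguration readerConf = new org.apache.commons.configuration.BaseConfiguration();
readerConf.setProperty(Constants.GREMLIN_HADOOP_GRAPH_READER, GryoInputFormat.class.getCanonicalName());
readerConf.setProperty(Constants.GREMLIN_HADOOP_INPUT_LOCATION, "/tmp/tinkerpop/graph-input"); // hypothetical path

final JavaSparkContext sc = new JavaSparkContext(new SparkConf().setAppName("read-graph-rdd").setMaster("local[*]"));
final JavaPairRDD<Object, VertexWritable> graphRDD = new InputFormatRDD().readGraphRDD(readerConf, sc);
System.out.println("vertices: " + graphRDD.count());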
public static JavaPairRDD<Object, BSONObject> mongoRdd(JavaSparkContext sparkContext, String mongoHost,
                                                       long mongoPort, String db, String collection) {
    Configuration mongodbConfig = new Configuration();
    mongodbConfig.set("mongo.job.input.format", "com.mongodb.hadoop.MongoInputFormat");
    mongodbConfig.set("mongo.input.uri",
            String.format("mongodb://%s:%d/%s.%s", mongoHost, mongoPort, db, collection));
    return sparkContext.newAPIHadoopRDD(mongodbConfig, MongoInputFormat.class, Object.class, BSONObject.class);
}
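A short usage sketch for the helper above; the host, database and collection names are placeholders.

// Hypothetical invocation of the mongoRdd(...) helper defined above.
JavaSparkContext sc = new JavaSparkContext(new SparkConf().setAppName("mongo-example").setMaster("local[*]"));
JavaPairRDD<Object, BSONObject> docs = mongoRdd(sc, "localhost", 27017, "testdb", "events");
System.out.println("documents read: " + docs.count());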
/**
 * Creates a JavaPairRDD over a Hive table via HCatalog.
 *
 * @param javaSparkContext the Java Spark context
 * @param conf             the Hadoop configuration
 * @param db               the database name
 * @param table            the table name
 * @param partitionFilter  the partition filter
 * @return the JavaPairRDD of HCatalog records
 * @throws IOException if the HCatalog input cannot be configured
 */
public static JavaPairRDD<WritableComparable, HCatRecord> createHiveTableRDD(JavaSparkContext javaSparkContext,
        Configuration conf, String db, String table, String partitionFilter) throws IOException {
    HCatInputFormat.setInput(conf, db, table, partitionFilter);
    return javaSparkContext.newAPIHadoopRDD(conf,
            HCatInputFormat.class,      // input format
            WritableComparable.class,   // input key class
            HCatRecord.class);          // input value class
}
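A hypothetical usage of createHiveTableRDD(...); the database and table names are placeholders, and a null partition filter is assumed to select all partitions.

// Sketch only: reads every row of default.my_table via HCatalog and counts them.
JavaSparkContext jsc = new JavaSparkContext(new SparkConf().setAppName("hcat-example"));
Configuration hcatConf = new Configuration();
JavaPairRDD<WritableComparable, HCatRecord> records =
        createHiveTableRDD(jsc, hcatConf, "default", "my_table", null);
System.out.println("rows read: " + records.count());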
/**
 * Restore a {@code JavaPairRDD<Long,List<Writable>>} previously saved with {@link #saveMapFile(String, JavaRDD)}<br>
 * Note that if the keys are not required, simply use {@code restoreMapFile(...).values()}
 *
 * @param path Path of the MapFile
 * @param sc   Spark context
 * @return The restored RDD, with the unique record indices as keys
 */
public static JavaPairRDD<Long, List<Writable>> restoreMapFile(String path, JavaSparkContext sc) {
    Configuration c = new Configuration();
    c.set(FileInputFormat.INPUT_DIR, FilenameUtils.normalize(path, true));

    JavaPairRDD<LongWritable, RecordWritable> pairRDD = sc.newAPIHadoopRDD(c, SequenceFileInputFormat.class,
            LongWritable.class, RecordWritable.class);

    return pairRDD.mapToPair(new RecordLoadPairFunction());
}
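If only the records are needed, the keys can be dropped immediately, as the javadoc above suggests; the path below is a placeholder and sc is an existing JavaSparkContext.

// Sketch: restore a previously saved MapFile and discard the Long indices.
JavaRDD<List<Writable>> records = restoreMapFile("/data/myMapFile", sc).values();
long n = records.count();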
/**
 * Restore a {@code JavaPairRDD<Long,List<List<Writable>>>} previously saved with {@link #saveMapFileSequences(String, JavaRDD)}<br>
 * Note that if the keys are not required, simply use {@code restoreMapFileSequences(...).values()}
 *
 * @param path Path of the MapFile
 * @param sc   Spark context
 * @return The restored RDD, with the unique sequence indices as keys
 */
public static JavaPairRDD<Long, List<List<Writable>>> restoreMapFileSequences(String path, JavaSparkContext sc) {
    Configuration c = new Configuration();
    c.set(FileInputFormat.INPUT_DIR, FilenameUtils.normalize(path, true));

    JavaPairRDD<LongWritable, SequenceRecordWritable> pairRDD = sc.newAPIHadoopRDD(c, SequenceFileInputFormat.class,
            LongWritable.class, SequenceRecordWritable.class);

    return pairRDD.mapToPair(new SequenceRecordLoadPairFunction());
}
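The sequence variant works the same way but yields one List<List<Writable>> (a sequence of timesteps) per key; the path below is a placeholder.

// Sketch: restore sequence records with their original indices preserved.
JavaPairRDD<Long, List<List<Writable>>> sequences = restoreMapFileSequences("/data/mySequenceMapFile", sc);
sequences.foreach(pair ->
        System.out.println("index " + pair._1() + " has " + pair._2().size() + " steps"));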
public static void usingNewAPIHadoopRDD(String[] args) {
    String loaderJsonFile = args[0];
    String queryJsonFile = args[1];
    String hostfile = args[2];

    SparkConf conf = new SparkConf();
    conf.setAppName("GenomicsDBTest using newAPIHadoopRDD");

    JavaSparkContext sc = new JavaSparkContext(conf);
    Configuration hadoopConf = sc.hadoopConfiguration();
    hadoopConf.set(GenomicsDBConfiguration.LOADERJSON, loaderJsonFile);
    hadoopConf.set(GenomicsDBConfiguration.QUERYJSON, queryJsonFile);
    hadoopConf.set(GenomicsDBConfiguration.MPIHOSTFILE, hostfile);

    JavaPairRDD variants;
    Class gformatClazz = GenomicsDBInputFormat.class;
    variants = sc.newAPIHadoopRDD(hadoopConf, gformatClazz, String.class, VariantContext.class);
    System.out.println("Number of variants " + variants.count());

    List variantList = variants.collect();
    for (Object variantObj : variantList) {
        System.out.println(variantObj);
    }
}
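A minimal entry point that delegates to the method above; the three arguments are expected in the order loader JSON, query JSON and MPI host file. The file names in the comment are placeholders.

// Hypothetical driver, e.g. invoked as: spark-submit ... app.jar loader.json query.json hostfile
public static void main(String[] args) {
    usingNewAPIHadoopRDD(args);
}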
/**
 * Initializes a {@link JavaPairRDD} from the given Spark context, Hadoop
 * configuration and data store.
 *
 * @param sparkContext Spark context
 * @param conf         Hadoop configuration
 * @param dataStore    Data store
 * @return the initialized RDD
 */
public JavaPairRDD<K, V> initialize(JavaSparkContext sparkContext, Configuration conf, DataStore<K, V> dataStore) {
    GoraMapReduceUtils.setIOSerializations(conf, true);

    try {
        IOUtils.storeToConf(dataStore.newQuery(), conf, GoraInputFormat.QUERY_KEY);
    } catch (IOException ioex) {
        throw new RuntimeException(ioex.getMessage());
    }

    return sparkContext.newAPIHadoopRDD(conf, GoraInputFormat.class, clazzK, clazzV);
}
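A rough usage sketch, assuming this method lives on Gora's GoraSparkEngine<K, V> parameterised with the key and persistent classes. The Employee bean and the store construction are hypothetical, and exception handling is omitted.

// Sketch only: Employee is a hypothetical Gora persistent bean.
JavaSparkContext sparkContext = new JavaSparkContext(new SparkConf().setAppName("gora-spark-example"));
Configuration hadoopConf = sparkContext.hadoopConfiguration();
GoraSparkEngine<Long, Employee> engine = new GoraSparkEngine<>(Long.class, Employee.class);
DataStore<Long, Employee> store = DataStoreFactory.getDataStore(Long.class, Employee.class, hadoopConf);
JavaPairRDD<Long, Employee> employees = engine.initialize(sparkContext, hadoopConf, store);
System.out.println("records: " + employees.count());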
@Override
public JavaRDDLike<?, ?> getJavaRDDLike(SparkRuntime runtime) {
    try {
        Job job = new Job(runtime.getConfiguration());
        source.configureSource(job, 0); // TODO: a custom input format for crunch-spark
        Converter converter = source.getConverter();
        JavaPairRDD<?, ?> input = runtime.getSparkContext().newAPIHadoopRDD(
                job.getConfiguration(),
                CrunchInputFormat.class,
                converter.getKeyClass(),
                converter.getValueClass());
        input.rdd().setName(getName());
        MapFn mapFn = converter.applyPTypeTransforms() ? source.getType().getInputMapFn() : IdentityFn.getInstance();
        return input
                .map(new InputConverterFunction(source.getConverter()))
                .mapToPair(new Tuple2MapFunction(mapFn, runtime.getRuntimeContext()));
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
}
public JavaRDDLike<?, ?> getJavaRDDLike(SparkRuntime runtime) {
    try {
        Job job = new Job(runtime.getConfiguration());
        FileInputFormat.addInputPaths(job, "/tmp"); // placeholder
        source.configureSource(job, 0);
        Converter converter = source.getConverter();
        JavaPairRDD<?, ?> input = runtime.getSparkContext().newAPIHadoopRDD(
                job.getConfiguration(),
                CrunchInputFormat.class,
                converter.getKeyClass(),
                converter.getValueClass());
        input.rdd().setName(getName());
        MapFn mapFn = converter.applyPTypeTransforms() ? source.getType().getInputMapFn() : IdentityFn.getInstance();
        return input
                .map(new InputConverterFunction(source.getConverter()))
                .map(new MapFunction(mapFn, runtime.getRuntimeContext()));
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
}
private JavaRDD<Element> doOperation(final GetJavaRDDOfElementsInRanges operation, final Context context,
                                     final AccumuloStore accumuloStore) throws OperationException {
    final JavaSparkContext sparkContext = JavaSparkContext.fromSparkContext(
            SparkContextUtil.getSparkSession(context, accumuloStore.getProperties()).sparkContext());
    final Configuration conf = getConfiguration(operation);
    // Use batch scan option when performing seeded operation
    InputConfigurator.setBatchScan(AccumuloInputFormat.class, conf, true);
    addIterators(accumuloStore, conf, context.getUser(), operation);
    addRangesFromPairs(accumuloStore, conf, operation);
    final JavaPairRDD<Element, NullWritable> pairRDD = sparkContext.newAPIHadoopRDD(conf,
            ElementInputFormat.class, Element.class, NullWritable.class);
    return pairRDD.map(new FirstElement());
}
private JavaRDD<Element> doOperation(final GetJavaRDDOfElements operation, final Context context,
                                     final AccumuloStore accumuloStore) throws OperationException {
    final JavaSparkContext sparkContext = JavaSparkContext.fromSparkContext(
            SparkContextUtil.getSparkSession(context, accumuloStore.getProperties()).sparkContext());
    final Configuration conf = getConfiguration(operation);
    // Use batch scan option when performing seeded operation
    InputConfigurator.setBatchScan(AccumuloInputFormat.class, conf, true);
    addIterators(accumuloStore, conf, context.getUser(), operation);
    addRanges(accumuloStore, conf, operation);
    final JavaPairRDD<Element, NullWritable> pairRDD = sparkContext.newAPIHadoopRDD(conf,
            ElementInputFormat.class, Element.class, NullWritable.class);
    return pairRDD.map(new FirstElement());
}