/**
 * Read the given path as a Java pair RDD of sequence files; the path may contain
 * second-level subfolders, in which case all non-hidden subfolders are read.
 * @param inputPath the input path on the file system
 * @param fs the file system the path lives on
 * @param sc the Spark context used to create the RDD
 * @param keyClass the Writable key class of the sequence files
 * @param valueClass the Writable value class of the sequence files
 * @return a pair RDD over all sequence files found under the path
 * @throws IOException if the path cannot be listed
 */
public static <K, V> JavaPairRDD<K, V> parseInputPath(String inputPath, FileSystem fs, JavaSparkContext sc,
        Class<K> keyClass, Class<V> valueClass) throws IOException {
    List<String> inputFolders = Lists.newArrayList();
    Path inputHDFSPath = new Path(inputPath);
    FileStatus[] fileStatuses = fs.listStatus(inputHDFSPath);
    boolean hasDir = false;
    for (FileStatus stat : fileStatuses) {
        // Collect subfolders, skipping hidden ones such as "_SUCCESS" markers.
        if (stat.isDirectory() && !stat.getPath().getName().startsWith("_")) {
            hasDir = true;
            inputFolders.add(stat.getPath().toString());
        }
    }
    if (!hasDir) {
        return sc.sequenceFile(inputHDFSPath.toString(), keyClass, valueClass);
    }
    // sequenceFile accepts a comma-separated list of input paths.
    return sc.sequenceFile(StringUtil.join(inputFolders, ","), keyClass, valueClass);
}
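A usage sketch only: the path, app name, and Text/Text types below are hypothetical stand-ins to show how the helper above might be invoked.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.io.Text;
import org.apache.spark.SparkConf;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaSparkContext;

// Hypothetical caller: reads Text/Text sequence files that may sit one folder level deep.
Configuration hadoopConf = new Configuration();
FileSystem fs = FileSystem.get(hadoopConf);
JavaSparkContext sc = new JavaSparkContext(new SparkConf().setAppName("parseInputPathDemo"));
JavaPairRDD<Text, Text> rdd = parseInputPath("/data/cuboid", fs, sc, Text.class, Text.class);
System.out.println("records: " + rdd.count());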
private static JavaRDD<String[]> getSequenceFormatHiveInput(JavaSparkContext sc, String inputPath) {
    return sc.sequenceFile(inputPath, BytesWritable.class, Text.class).values()
            .map(new Function<Text, String[]>() {
                @Override
                public String[] call(Text text) throws Exception {
                    String s = Bytes.toString(text.getBytes(), 0, text.getLength());
                    // The -1 limit keeps trailing empty fields, preserving the column count.
                    return s.split(BatchConstants.SEQUENCE_FILE_DEFAULT_DELIMITER, -1);
                }
            });
}
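The -1 limit passed to split above is load-bearing: without it, Java drops trailing empty fields and silently changes the column count of Hive rows. A self-contained demonstration; the \u0001 delimiter here is an arbitrary stand-in for BatchConstants.SEQUENCE_FILE_DEFAULT_DELIMITER.

public class SplitLimitDemo {
    public static void main(String[] args) {
        String delim = "\u0001";                         // stand-in delimiter
        String row = "a" + delim + "b" + delim + delim;  // row with two trailing empty columns
        System.out.println(row.split(delim).length);     // 2 -- trailing empties dropped
        System.out.println(row.split(delim, -1).length); // 4 -- column count preserved
    }
}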
public static void main(String[] args) throws Exception {
    if (args.length != 2) {
        throw new Exception("Usage: BasicLoadSequenceFile [sparkMaster] [input]");
    }
    String master = args[0];
    String fileName = args[1];

    JavaSparkContext sc = new JavaSparkContext(
        master, "basicloadsequencefile", System.getenv("SPARK_HOME"), System.getenv("JARS"));
    JavaPairRDD<Text, IntWritable> input = sc.sequenceFile(fileName, Text.class, IntWritable.class);
    JavaPairRDD<String, Integer> result = input.mapToPair(new ConvertToNativeTypes());
    List<Tuple2<String, Integer>> resultList = result.collect();
    for (Tuple2<String, Integer> record : resultList) {
      System.out.println(record);
    }
  }
}
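ConvertToNativeTypes is referenced above but not shown; this is a plausible reconstruction inferred from the surrounding types, not the original class.

import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.spark.api.java.function.PairFunction;
import scala.Tuple2;

// Hypothetical reconstruction: unwraps Hadoop Writables into plain Java types.
static class ConvertToNativeTypes implements PairFunction<Tuple2<Text, IntWritable>, String, Integer> {
    @Override
    public Tuple2<String, Integer> call(Tuple2<Text, IntWritable> record) {
        return new Tuple2<>(record._1().toString(), record._2().get());
    }
}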
@SuppressWarnings("unchecked") @Test public void writeWithNewAPIHadoopFile() { String outputDir = new File(tempDir, "output").getAbsolutePath(); List<Tuple2<Integer, String>> pairs = Arrays.asList( new Tuple2<>(1, "a"), new Tuple2<>(2, "aa"), new Tuple2<>(3, "aaa") ); JavaPairRDD<Integer, String> rdd = sc.parallelizePairs(pairs); rdd.mapToPair(pair -> new Tuple2<>(new IntWritable(pair._1()), new Text(pair._2()))) .saveAsNewAPIHadoopFile(outputDir, IntWritable.class, Text.class, org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat.class); JavaPairRDD<IntWritable, Text> output = sc.sequenceFile(outputDir, IntWritable.class, Text.class); assertEquals(pairs.toString(), output.map(Tuple2::toString).collect().toString()); }
@SuppressWarnings("unchecked") @Test public void sequenceFile() { String outputDir = new File(tempDir, "output").getAbsolutePath(); List<Tuple2<Integer, String>> pairs = Arrays.asList( new Tuple2<>(1, "a"), new Tuple2<>(2, "aa"), new Tuple2<>(3, "aaa") ); JavaPairRDD<Integer, String> rdd = sc.parallelizePairs(pairs); rdd.mapToPair(pair -> new Tuple2<>(new IntWritable(pair._1()), new Text(pair._2()))) .saveAsHadoopFile(outputDir, IntWritable.class, Text.class, SequenceFileOutputFormat.class); // Try reading the output back as an object file JavaPairRDD<Integer, String> readRDD = sc.sequenceFile(outputDir, IntWritable.class, Text.class).mapToPair(pair -> new Tuple2<>(pair._1().get(), pair._2().toString())); assertEquals(pairs, readRDD.collect()); }
@SuppressWarnings("unchecked") @Test public void writeWithNewAPIHadoopFile() { String outputDir = new File(tempDir, "output").getAbsolutePath(); List<Tuple2<Integer, String>> pairs = Arrays.asList( new Tuple2<>(1, "a"), new Tuple2<>(2, "aa"), new Tuple2<>(3, "aaa") ); JavaPairRDD<Integer, String> rdd = sc.parallelizePairs(pairs); rdd.mapToPair(pair -> new Tuple2<>(new IntWritable(pair._1()), new Text(pair._2()))) .saveAsNewAPIHadoopFile(outputDir, IntWritable.class, Text.class, org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat.class); JavaPairRDD<IntWritable, Text> output = sc.sequenceFile(outputDir, IntWritable.class, Text.class); assertEquals(pairs.toString(), output.map(Tuple2::toString).collect().toString()); }
@SuppressWarnings("unchecked") @Test public void writeWithNewAPIHadoopFile() { String outputDir = new File(tempDir, "output").getAbsolutePath(); List<Tuple2<Integer, String>> pairs = Arrays.asList( new Tuple2<>(1, "a"), new Tuple2<>(2, "aa"), new Tuple2<>(3, "aaa") ); JavaPairRDD<Integer, String> rdd = sc.parallelizePairs(pairs); rdd.mapToPair(pair -> new Tuple2<>(new IntWritable(pair._1()), new Text(pair._2()))) .saveAsNewAPIHadoopFile(outputDir, IntWritable.class, Text.class, org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat.class); JavaPairRDD<IntWritable, Text> output = sc.sequenceFile(outputDir, IntWritable.class, Text.class); assertEquals(pairs.toString(), output.map(Tuple2::toString).collect().toString()); }
@Test
public void sequenceFile() {
    // Variant of the test above that manages its own temporary directory.
    File tempDir = Files.createTempDir();
    tempDir.deleteOnExit();
    String outputDir = new File(tempDir, "output").getAbsolutePath();
    List<Tuple2<Integer, String>> pairs = Arrays.asList(
        new Tuple2<>(1, "a"),
        new Tuple2<>(2, "aa"),
        new Tuple2<>(3, "aaa")
    );
    JavaPairRDD<Integer, String> rdd = sc.parallelizePairs(pairs);

    rdd.mapToPair(pair -> new Tuple2<>(new IntWritable(pair._1()), new Text(pair._2())))
        .saveAsHadoopFile(outputDir, IntWritable.class, Text.class, SequenceFileOutputFormat.class);

    // Read the output back as a sequence file and convert to native Java types
    JavaPairRDD<Integer, String> readRDD = sc.sequenceFile(outputDir, IntWritable.class, Text.class)
        .mapToPair(pair -> new Tuple2<>(pair._1().get(), pair._2().toString()));
    Assert.assertEquals(pairs, readRDD.collect());

    Utils.deleteRecursively(tempDir);
}
CubeSegment sourceSegment = findSourceSegment(path, cubeInstance);
final String cuboidInputPath = BatchCubingJobBuilder2.getCuboidOutputPathsByLevel(path, level);
JavaPairRDD<Text, Text> segRdd = sc.sequenceFile(cuboidInputPath, Text.class, Text.class);
JavaSparkContext sc = new JavaSparkContext(conf);
JavaPairRDD<LongWritable, Vector> input = sc.sequenceFile(fileName, LongWritable.class, Vector.class);
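sequenceFile requires key and value classes that implement Hadoop's Writable, so the Vector type above is presumably a Writable implementation. When the on-disk values are Mahout VectorWritables, a common pattern is to read the wrapper type and unwrap it; a sketch under that assumption, with fileName as a placeholder:

import org.apache.hadoop.io.LongWritable;
import org.apache.mahout.math.Vector;
import org.apache.mahout.math.VectorWritable;
import org.apache.spark.api.java.JavaPairRDD;
import org.apache.spark.api.java.JavaRDD;

// Read VectorWritable values and unwrap them into Mahout Vectors.
JavaPairRDD<LongWritable, VectorWritable> raw =
    sc.sequenceFile(fileName, LongWritable.class, VectorWritable.class);
// clone() guards against Hadoop's record-object reuse across iterations.
JavaRDD<Vector> vectors = raw.values().map(vw -> vw.get().clone());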
/**
 * Read a Hadoop sequence file into String / byte[] pairs.
 * @param filePath the input path of the Hadoop file
 * @return the {@link JavaPairRDD} of the PDB code and the byte array of the data.
 */
public static JavaPairRDD<String, byte[]> readHadoopFile(String filePath) {
    return getSparkContext()
        .sequenceFile(filePath, Text.class, BytesWritable.class, 8)
        // copyBytes() truncates to getLength(); getBytes() alone may return a padded backing array.
        .mapToPair(t -> new Tuple2<String, byte[]>(t._1.toString(), t._2.copyBytes()));
}
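A hypothetical caller for the helper above; the path is a made-up stand-in, and getSparkContext() is assumed to return the class's shared JavaSparkContext.

// Illustrative only: count entries and peek at the first PDB id.
JavaPairRDD<String, byte[]> data = readHadoopFile("/data/full.hsf");
System.out.println("entries: " + data.count());
Tuple2<String, byte[]> first = data.first();
System.out.println(first._1() + " -> " + first._2().length + " bytes");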
public static void main(String[] args) {
    SparkConf conf = new SparkConf().setAppName("ImagenetSampler")
        .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer");
    JavaSparkContext sc = new JavaSparkContext(conf);
    int numExecutors = conf.getInt("spark.executor.instances", -1);
    System.out.println("number of executors = " + numExecutors);

    System.out.println("Data Loading...");
    JavaPairRDD<FloatWritable, ArrayPrimitiveWritable> train_seq =
        sc.sequenceFile("imagenet_sampled.hsf", FloatWritable.class, ArrayPrimitiveWritable.class);
    train_seq.foreach(new VoidFunction<Tuple2<FloatWritable, ArrayPrimitiveWritable>>() {
        @Override
        public void call(Tuple2<FloatWritable, ArrayPrimitiveWritable> arg0) throws Exception {
            System.out.println(arg0._1.get() + " " + ((float[]) arg0._2.get()).length);
        }
    });
    sc.close();
}
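For completeness, a sketch of how a label/features sequence file like imagenet_sampled.hsf could be produced with the same Writable types; the records here are fabricated for illustration.

import java.util.Arrays;
import org.apache.hadoop.io.ArrayPrimitiveWritable;
import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.spark.api.java.JavaPairRDD;
import scala.Tuple2;

// Write two dummy (label, float[]) records as a sequence file.
JavaPairRDD<FloatWritable, ArrayPrimitiveWritable> out = sc.parallelize(Arrays.asList(0f, 1f))
    .mapToPair(label -> new Tuple2<>(
        new FloatWritable(label),
        new ArrayPrimitiveWritable(new float[] { label, label + 0.5f })));
out.saveAsHadoopFile("imagenet_sampled.hsf", FloatWritable.class,
    ArrayPrimitiveWritable.class, SequenceFileOutputFormat.class);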
sc.sequenceFile("/imagenet_train.hsf", FloatWritable.class, ArrayPrimitiveWritable.class);
JavaPairRDD<Writable, Text> input = sc.sequenceFile(filePath, Writable.class, Text.class, partitions);
pairs = input.mapToPair(new PairFunction<Tuple2<Writable, Text>, String, String>() {
    private static final long serialVersionUID = -9042224661662821670L;

    @Override
    public Tuple2<String, String> call(Tuple2<Writable, Text> record) throws Exception {
        // Completion implied by the declared type parameters: convert both sides to Strings.
        return new Tuple2<>(record._1().toString(), record._2().toString());
    }
});