/**
 * Small Spark example: squares a list of integers and drops the value 1,
 * then prints the surviving elements as a comma-separated string.
 *
 * @param args optional; args[0] is the Spark master URL (defaults to "local")
 */
public static void main(String[] args) throws Exception {
  // Fall back to a local master when none is given on the command line.
  String master = (args.length > 0) ? args[0] : "local";

  JavaSparkContext sc = new JavaSparkContext(
      master, "basicmapfilter", System.getenv("SPARK_HOME"), System.getenv("JARS"));

  JavaRDD<Integer> input = sc.parallelize(Arrays.asList(1, 2, 3, 4));

  // Square every element.
  JavaRDD<Integer> squares = input.map(
      new Function<Integer, Integer>() {
        public Integer call(Integer value) { return value * value; }
      });

  // Keep everything except the value 1.
  JavaRDD<Integer> kept = squares.filter(
      new Function<Integer, Boolean>() {
        public Boolean call(Integer value) { return value != 1; }
      });

  System.out.println(StringUtils.join(kept.collect(), ","));
}
}
/**
 * Loads whole CSV files, parses each line, and writes out only the rows
 * whose first field equals the given key.
 *
 * @param args sparkMaster, csvInputFile, csvOutputFile, key
 * @throws Exception if the wrong number of arguments is supplied
 */
public static void main(String[] args) throws Exception {
  // BUG FIX: the usage string lists four arguments and args[3] is read below,
  // but the original guard only required three, so a three-argument invocation
  // passed the check and then threw ArrayIndexOutOfBoundsException.
  if (args.length != 4) {
    throw new Exception("Usage BasicLoadCsv sparkMaster csvInputFile csvOutputFile key");
  }
  String master = args[0];
  String csvInput = args[1];
  String outputFile = args[2];
  final String key = args[3];

  JavaSparkContext sc = new JavaSparkContext(
      master, "loadwholecsv", System.getenv("SPARK_HOME"), System.getenv("JARS"));

  // wholeTextFiles yields (filename, fileContents) pairs; ParseLine splits
  // each file's contents into CSV rows.
  JavaPairRDD<String, String> csvData = sc.wholeTextFiles(csvInput);
  JavaRDD<String[]> keyedRDD = csvData.flatMap(new ParseLine());

  // Keep only rows whose first column matches the requested key.
  JavaRDD<String[]> result = keyedRDD.filter(new Function<String[], Boolean>() {
    public Boolean call(String[] input) {
      return input[0].equals(key);
    }
  });

  result.saveAsTextFile(outputFile);
}
}
/**
 * Loads newline-delimited JSON, keeps only people who like pandas, and
 * writes the surviving records back out as JSON.
 *
 * @param args sparkMaster, jsoninput, jsonoutput
 * @throws Exception if the wrong number of arguments is supplied
 */
public static void main(String[] args) throws Exception {
  if (args.length != 3) {
    throw new Exception("Usage BasicLoadJson [sparkMaster] [jsoninput] [jsonoutput]");
  }
  String master = args[0];
  String fileName = args[1];
  String outfile = args[2];

  JavaSparkContext sc = new JavaSparkContext(
      master, "basicloadjson", System.getenv("SPARK_HOME"), System.getenv("JARS"));

  JavaRDD<String> input = sc.textFile(fileName);

  // Parse each partition's lines into Person objects, keep panda fans,
  // then serialize the survivors back to JSON text.
  JavaRDD<Person> pandaFans = input
      .mapPartitions(new ParseJson())
      .filter(new LikesPandas());
  JavaRDD<String> formatted = pandaFans.mapPartitions(new WriteJson());

  formatted.saveAsTextFile(outfile);
}
}
/**
 * Implementation which splits based solely on time. It returns approximately the most
 * recent {@link #getTestFraction()} of the input, ordered by timestamp, as test data,
 * and the remaining (earlier) portion as new training data.
 *
 * <p>The split point is a linear interpolation over the observed timestamp range, so it
 * is only approximate; it assumes timestamps are fairly evenly distributed over that
 * range. Lines are timestamped via {@code MLFunctions.TO_TIMESTAMP_FN}.</p>
 *
 * @param newData input lines, each carrying a parseable timestamp
 * @return pair of (training data: timestamps before the boundary,
 *         test data: timestamps at or after the boundary)
 */
@Override
protected Pair<JavaRDD<String>,JavaRDD<String>> splitNewDataToTrainTest(JavaRDD<String> newData) {
  // Rough approximation; assumes timestamps are fairly evenly distributed
  StatCounter maxMin = newData.mapToDouble(line -> MLFunctions.TO_TIMESTAMP_FN.call(line).doubleValue()).stats();
  long minTime = (long) maxMin.min();
  long maxTime = (long) maxMin.max();
  log.info("New data timestamp range: {} - {}", minTime, maxTime);
  // Boundary sits testFraction of the way back from maxTime, so the latest
  // testFraction of the time range lands in the test set.
  long approxTestTrainBoundary = (long) (maxTime - getTestFraction() * (maxTime - minTime));
  log.info("Splitting at timestamp {}", approxTestTrainBoundary);
  JavaRDD<String> newTrainData = newData.filter(
      line -> MLFunctions.TO_TIMESTAMP_FN.call(line) < approxTestTrainBoundary);
  JavaRDD<String> testData = newData.filter(
      line -> MLFunctions.TO_TIMESTAMP_FN.call(line) >= approxTestTrainBoundary);
  return new Pair<>(newTrainData, testData);
}
/**
 * Computes the classification accuracy of the forest over the given examples:
 * the fraction whose most probable predicted category encoding matches the
 * example's target encoding.
 *
 * @param forest trained decision forest used to predict each example
 * @param examples labeled examples to score
 * @return fraction of correctly classified examples, or 0.0 if there are none
 */
static double accuracy(DecisionForest forest, JavaRDD<Example> examples) {
  long numExamples = examples.count();
  // Guard against division by zero on an empty RDD.
  if (numExamples == 0) {
    return 0.0;
  }
  long numCorrect = examples.filter(example -> {
    CategoricalPrediction predicted = (CategoricalPrediction) forest.predict(example);
    CategoricalFeature actual = (CategoricalFeature) example.getTarget();
    return predicted.getMostProbableCategoryEncoding() == actual.getEncoding();
  }).count();
  return (double) numCorrect / numExamples;
}
// Drop empty lines, then convert each remaining line into a list of DataVec
// Writables via the record reader.
// NOTE(review): `stringData` and `rr` are defined earlier in the enclosing
// method, which is not visible here — confirm `rr` is serializable for Spark.
JavaRDD<List<Writable>> parsedInputData = stringData.filter((x) -> !x.isEmpty()).map(new StringToWritablesFunction(rr));
/** Checks isEmpty() on empty, non-empty, fully-filtered, and partly-filtered RDDs. */
@Test
public void isEmpty() {
  List<Integer> oneTwoThree = Arrays.asList(1, 2, 3);
  assertTrue(sc.emptyRDD().isEmpty());
  assertTrue(sc.parallelize(new ArrayList<Integer>()).isEmpty());
  assertFalse(sc.parallelize(Arrays.asList(1)).isEmpty());
  // A predicate that removes every element yields an empty RDD, even across 3 partitions.
  assertTrue(sc.parallelize(oneTwoThree, 3).filter(i -> i < 0).isEmpty());
  // A predicate that keeps at least one element yields a non-empty RDD.
  assertFalse(sc.parallelize(oneTwoThree).filter(i -> i > 1).isEmpty());
}
/** Checks isEmpty() on empty, non-empty, fully-filtered, and partly-filtered RDDs. */
@Test
public void isEmpty() {
  List<Integer> oneTwoThree = Arrays.asList(1, 2, 3);
  assertTrue(sc.emptyRDD().isEmpty());
  assertTrue(sc.parallelize(new ArrayList<Integer>()).isEmpty());
  assertFalse(sc.parallelize(Arrays.asList(1)).isEmpty());
  // A predicate that removes every element yields an empty RDD, even across 3 partitions.
  assertTrue(sc.parallelize(oneTwoThree, 3).filter(i -> i < 0).isEmpty());
  // A predicate that keeps at least one element yields a non-empty RDD.
  assertFalse(sc.parallelize(oneTwoThree).filter(i -> i > 1).isEmpty());
}
/** Checks isEmpty() on empty, non-empty, fully-filtered, and partly-filtered RDDs. */
@Test
public void isEmpty() {
  List<Integer> oneTwoThree = Arrays.asList(1, 2, 3);
  assertTrue(sc.emptyRDD().isEmpty());
  assertTrue(sc.parallelize(new ArrayList<Integer>()).isEmpty());
  assertFalse(sc.parallelize(Arrays.asList(1)).isEmpty());
  // A predicate that removes every element yields an empty RDD, even across 3 partitions.
  assertTrue(sc.parallelize(oneTwoThree, 3).filter(i -> i < 0).isEmpty());
  // A predicate that keeps at least one element yields a non-empty RDD.
  assertFalse(sc.parallelize(oneTwoThree).filter(i -> i > 1).isEmpty());
}
JavaRDD<String> validCallSigns = callSigns.filter( new Function<String, Boolean>(){ public Boolean call(String callSign) { Pattern p = Pattern.compile("\\A\\d?\\p{Alpha}{1,2}\\d{1,4}\\p{Alpha}{1,3}\\Z");
logLines = logLines.filter(new Function<String,Boolean>() { @Override public Boolean call(String s) throws Exception {
@Override public boolean cp(final String sourceLocation, final String targetLocation) { final List<String> rdds = Spark.getRDDs().stream().filter(r -> r.name().startsWith(sourceLocation)).map(RDD::name).collect(Collectors.toList()); if (rdds.size() == 0) return false; for (final String rdd : rdds) { Spark.getRDD(rdd).toJavaRDD().filter(a -> true).setName(rdd.equals(sourceLocation) ? targetLocation : rdd.replace(sourceLocation, targetLocation)).cache().count(); // TODO: this should use the original storage level } return true; }
JavaRDD<String> filteredRDD = records.filter((String record) -> { String firstChar = record.substring(0,1); if ( firstChar.equals("@") ||
JavaRDD<String> filteredRDD = records.filter(new Function<String,Boolean>() { @Override public Boolean call(String record) {
JavaRDD<String> filtered = biosets.filter((String record) -> { String ref = REF.value(); String[] tokens = record.split(",");
JavaRDD<String> filtered = biosets.filter(new Function<String,Boolean>() { @Override public Boolean call(String record) {
/**
 * Determines the parallelism to use for HBase puts: the number of write
 * statuses that contain at least one insert, with a floor of 1.
 *
 * @param writeStatusRDD statuses of the write operations to inspect
 * @return count of statuses with inserts, but never less than 1
 */
@VisibleForTesting
public int getHBasePutAccessParallelism(final JavaRDD<WriteStatus> writeStatusRDD) {
  final long statusesWithInserts =
      writeStatusRDD.filter(status -> status.getStat().getNumInserts() > 0).count();
  // toIntExact surfaces (rather than truncates) a count that overflows int.
  return Math.toIntExact(Math.max(statusesWithInserts, 1));
}
/**
 * Filters out HoodieRecords that already exist in the output folder. This is
 * useful in deduplication.
 *
 * @param hoodieRecords Input RDD of Hoodie records.
 * @return A subset of hoodieRecords RDD, with existing records filtered out.
 */
public JavaRDD<HoodieRecord<T>> filterExists(JavaRDD<HoodieRecord<T>> hoodieRecords) {
  // tagLocation marks each record with its current storage location when one
  // exists; records with a known location are therefore already persisted.
  JavaRDD<HoodieRecord<T>> taggedRecords = tagLocation(hoodieRecords);
  return taggedRecords.filter(record -> !record.isCurrentLocationKnown());
}
.filter(vertexWritable -> ElementHelper.idExists(vertexWritable.get().id(), graphStepIds)) // ensure vertex ids are in V(x) .flatMap(vertexWritable -> { if (identityTraversal) // g.V.count()-style (identity)