static JavaRDD<Vector> fetchSampleData(JavaRDD<Vector> evalData) {
    long count = evalData.count();
    if (count > MAX_SAMPLE_SIZE) {
        return evalData.sample(false, (double) MAX_SAMPLE_SIZE / count);
    }
    return evalData;
}
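// A minimal usage sketch, not part of the original example: MAX_SAMPLE_SIZE is given
// a hypothetical value here, and a JavaSparkContext named sc plus the usual Spark/MLlib
// imports are assumed. Note that sample(false, fraction) draws each element independently
// with probability `fraction`, so the resulting count is only approximately MAX_SAMPLE_SIZE.
private static final int MAX_SAMPLE_SIZE = 100_000; // assumed value

static void printSampleSize(JavaSparkContext sc, List<Vector> vectors) {
    JavaRDD<Vector> evalData = sc.parallelize(vectors);
    JavaRDD<Vector> sample = fetchSampleData(evalData);
    System.out.println("Sampled " + sample.count() + " of " + evalData.count() + " vectors");
}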
@Test
public void sample() {
    List<Integer> ints = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10);
    JavaRDD<Integer> rdd = sc.parallelize(ints);
    // the seeds here are "magic" to make this work out nicely
    JavaRDD<Integer> sample20 = rdd.sample(true, 0.2, 8);
    assertEquals(2, sample20.count());
    JavaRDD<Integer> sample20WithoutReplacement = rdd.sample(false, 0.2, 2);
    assertEquals(2, sample20WithoutReplacement.count());
}
private Map<Object, Integer> calculateSplitsForColumn(final JavaRDD<Element> data, final IdentifierType colName) {
    final List<Object> splits = data.sample(false, 1.0 / sampleRate)
            .map(element -> element.getIdentifier(colName))
            .sortBy(obj -> obj, true, numOfSplits)
            .mapPartitions(objectIterator -> {
                final List<Object> list = new ArrayList<>(1);
                if (objectIterator.hasNext()) {
                    list.add(objectIterator.next());
                }
                return list.iterator();
            })
            .collect();

    final Map<Object, Integer> splitPoints = new TreeMap<>(COMPARATOR);
    int i = 0;
    for (final Object split : splits) {
        if (null != split) {
            splitPoints.put(split, i);
        }
        i++;
    }
    return splitPoints;
}
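// A simplified sketch of the same technique on a plain JavaRDD<Integer>, added for
// illustration (the 10% sample fraction and the helper name are assumptions, not part
// of the original code): sample the data, sort it into numSplits partitions, and keep
// the first element of each partition as an approximate split point for range partitioning.
static List<Integer> approximateSplitPoints(JavaRDD<Integer> data, int numSplits) {
    return data.sample(false, 0.1)
            .sortBy(x -> x, true, numSplits)
            .mapPartitions(iterator -> {
                List<Integer> first = new ArrayList<>(1);
                if (iterator.hasNext()) {
                    first.add(iterator.next());
                }
                return first.iterator();
            })
            .collect();
}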
/**
 * Calculates the data size in megabytes.
 * If the total row count > ROW_SAMPLING_THRESHOLD:
 *   - sample the data down to roughly ROW_SAMPLING_THRESHOLD rows,
 *   - calculate the sample size in bytes via {@link FileSink#getSampleSizeInBytes(JavaRDD)},
 *   - scale the sample size by the sampling fraction and convert to megabytes.
 * Otherwise, calculate the total data size directly via {@link FileSink#getSampleSizeInBytes(JavaRDD)}.
 *
 * @param data data whose size is to be estimated in megabytes
 * @return estimated data size in megabytes
 */
protected double getRddSizeInMegaByte(@NonNull final JavaRDD<String> data) {
    final RDDWrapper<String> dataWrapper = new RDDWrapper<>(data);
    final long totalRows = dataWrapper.getCount();
    final double totalSize;
    if (totalRows > ROW_SAMPLING_THRESHOLD) {
        log.debug("Start sampling on Write Data.");
        final double fraction = (double) ROW_SAMPLING_THRESHOLD / (double) totalRows;
        log.debug("Sample fraction: {}", fraction);
        final JavaRDD<String> sampleRdd = data.sample(false, fraction);
        final long sampleSizeInBytes = getSampleSizeInBytes(sampleRdd);
        final double sampleSizeInMB = (double) sampleSizeInBytes / FileUtils.ONE_MB;
        totalSize = sampleSizeInMB / fraction;
    } else {
        totalSize = (double) getSampleSizeInBytes(data) / FileUtils.ONE_MB;
    }
    return totalSize;
}
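// The core of the estimate above, shown in isolation as a sketch (the helper names and the
// byte-counting strategy are assumptions, not the original API): measure the bytes in a
// sample and scale by the inverse of the sampling fraction to estimate the full size.
static double estimateSizeInMegabytes(JavaRDD<String> data, long rowThreshold) {
    long totalRows = data.count();
    if (totalRows <= rowThreshold) {
        return bytesOf(data) / (double) FileUtils.ONE_MB;
    }
    double fraction = (double) rowThreshold / (double) totalRows;
    double sampleMegabytes = bytesOf(data.sample(false, fraction)) / (double) FileUtils.ONE_MB;
    return sampleMegabytes / fraction; // scale the sample up to the full data set
}

// Assumed helper: sums UTF-8 encoded line lengths as a rough byte count.
static long bytesOf(JavaRDD<String> rdd) {
    return rdd.map(line -> (long) line.getBytes(java.nio.charset.StandardCharsets.UTF_8).length)
            .fold(0L, Long::sum);
}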
public static void main(String[] args) throws Exception {
    SparkConf conf = new SparkConf().setAppName("sampling").setMaster("local").set("spark.cores.max", "10");
    JavaSparkContext sc = new JavaSparkContext(conf);
    File outputFile = downloadFile("http://kdd.ics.uci.edu/databases/kddcup99/kddcup.data.gz");
    JavaRDD<String> rawData = sc.textFile(outputFile.getAbsolutePath());

    /**
     * Sampling RDDs: in Spark there are two sampling operations, the transformation sample and the action
     * takeSample. With the transformation we can tell Spark to apply successive transformations to a sample
     * of a given RDD. With the action we retrieve a given sample and can hold it in local memory, where it
     * can be used by any other standard library. The sample transformation takes up to three parameters:
     * first, whether the sampling is done with replacement or not; second, the sample size as a fraction;
     * finally, an optional random seed.
     */
    JavaRDD<String> sampledData = rawData.sample(false, 0.1, 1234);
    long sampleDataSize = sampledData.count();
    long rawDataSize = rawData.count();
    System.out.println(rawDataSize + " and after the sampling: " + sampleDataSize);

    /**
     * takeSample(withReplacement, num, [seed]) returns an array with a random sample of num elements of
     * the dataset, with or without replacement, optionally pre-specifying a random number generator seed.
     */
    List<String> sampledDataList = rawData.takeSample(false, 100, 20);
    System.out.println(rawDataSize + " and after the sampling: " + sampledDataList.size());
}
@Override
public SparkStream<T> sample(boolean withReplacement, int number) {
    Preconditions.checkArgument(number >= 0, "Sample size must be non-negative.");
    if (number == 0) {
        return StreamingContext.distributed().empty();
    }
    if (withReplacement) {
        // sample(true, 0.5) returns an RDD of roughly half the size; keep unioning
        // fresh samples until at least `number` elements have been collected, then trim.
        SparkStream<T> sample = new SparkStream<>(rdd.sample(true, 0.5));
        while (sample.count() < number) {
            sample = sample.union(new SparkStream<>(rdd.sample(true, 0.5)));
        }
        if (sample.count() > number) {
            sample = sample.limit(number);
        }
        return sample;
    }
    return shuffle().limit(number);
}
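// A hedged aside, not from the original code: when an exact-size sample is needed and it
// fits in driver memory, takeSample(withReplacement, num) returns exactly `num` elements
// in a single pass, avoiding the union-until-big-enough loop above at the cost of
// collecting the sample to the driver.
List<T> exactSample = rdd.takeSample(true, number);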
@Override
public Tuple<Collection<ExecutionLineageNode>, Collection<ChannelInstance>> evaluate(
        ChannelInstance[] inputs,
        ChannelInstance[] outputs,
        SparkExecutor sparkExecutor,
        OptimizationContext.OperatorContext operatorContext) {
    assert inputs.length == this.getNumInputs();
    assert outputs.length == this.getNumOutputs();

    final RddChannel.Instance input = (RddChannel.Instance) inputs[0];
    final RddChannel.Instance output = (RddChannel.Instance) outputs[0];

    final JavaRDD<Type> inputRdd = input.provideRdd();
    long datasetSize = this.isDataSetSizeKnown() ? this.getDatasetSize() : inputRdd.count();
    int sampleSize = this.getSampleSize(operatorContext);
    long seed = this.getSeed(operatorContext);
    double sampleFraction = ((double) sampleSize) / datasetSize;
    final JavaRDD<Type> outputRdd = inputRdd.sample(false, sampleFraction, seed);
    this.name(outputRdd);
    output.accept(outputRdd, sparkExecutor);

    return ExecutionOperator.modelLazyExecution(inputs, outputs, operatorContext);
}
List<Envelope> samples = this.rawSpatialRDD.sample(false, fraction) .map(new Function<T, Envelope>()