static JavaRDD<Vector> fetchSampleData(JavaRDD<Vector> evalData) {
    long count = evalData.count();
    if (count > MAX_SAMPLE_SIZE) {
        return evalData.sample(false, (double) MAX_SAMPLE_SIZE / count);
    }
    return evalData;
}
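// A minimal usage sketch, not part of the original example: MAX_SAMPLE_SIZE is given
// a hypothetical value here, and a JavaSparkContext named sc plus the usual Spark/MLlib
// imports are assumed. Note that sample(false, fraction) draws each element independently
// with probability `fraction`, so the resulting count is only approximately MAX_SAMPLE_SIZE.
private static final int MAX_SAMPLE_SIZE = 100_000; // assumed value

static void printSampleSize(JavaSparkContext sc, List<Vector> vectors) {
    JavaRDD<Vector> evalData = sc.parallelize(vectors);
    JavaRDD<Vector> sample = fetchSampleData(evalData);
    System.out.println("Sampled " + sample.count() + " of " + evalData.count() + " vectors");
}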
@Test
public void sample() {
    List<Integer> ints = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10);
    JavaRDD<Integer> rdd = sc.parallelize(ints);
    // the seeds here are "magic" to make this work out nicely
    JavaRDD<Integer> sample20 = rdd.sample(true, 0.2, 8);
    assertEquals(2, sample20.count());
    JavaRDD<Integer> sample20WithoutReplacement = rdd.sample(false, 0.2, 2);
    assertEquals(2, sample20WithoutReplacement.count());
}
private Map<Object, Integer> calculateSplitsForColumn(final JavaRDD<Element> data, final IdentifierType colName) {
    final List<Object> splits = data.sample(false, 1.0 / sampleRate)
            .map(element -> element.getIdentifier(colName))
            .sortBy(obj -> obj, true, numOfSplits)
            .mapPartitions(objectIterator -> {
                final List<Object> list = new ArrayList<>(1);
                if (objectIterator.hasNext()) {
                    list.add(objectIterator.next());
                }
                return list.iterator();
            })
            .collect();

    final Map<Object, Integer> splitPoints = new TreeMap<>(COMPARATOR);
    int i = 0;
    for (final Object split : splits) {
        if (null != split) {
            splitPoints.put(split, i);
        }
        i++;
    }
    return splitPoints;
}
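// A simplified sketch of the same technique on a plain JavaRDD<Integer>, added for
// illustration (the 10% sample fraction and the helper name are assumptions, not part
// of the original code): sample the data, sort it into numSplits partitions, and keep
// the first element of each partition as an approximate split point for range partitioning.
static List<Integer> approximateSplitPoints(JavaRDD<Integer> data, int numSplits) {
    return data.sample(false, 0.1)
            .sortBy(x -> x, true, numSplits)
            .mapPartitions(iterator -> {
                List<Integer> first = new ArrayList<>(1);
                if (iterator.hasNext()) {
                    first.add(iterator.next());
                }
                return first.iterator();
            })
            .collect();
}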
/**
 * Calculates the data size in megabytes.
 * If the total row count > ROW_SAMPLING_THRESHOLD:
 *   - sample the data down to roughly ROW_SAMPLING_THRESHOLD rows,
 *   - calculate the sample size in bytes via {@link FileSink#getSampleSizeInBytes(JavaRDD)},
 *   - scale the sample size by the sampling fraction and convert to megabytes.
 * Otherwise, calculate the total data size directly via {@link FileSink#getSampleSizeInBytes(JavaRDD)}.
 *
 * @param data data whose size is to be estimated in megabytes
 * @return estimated data size in megabytes
 */
protected double getRddSizeInMegaByte(@NonNull final JavaRDD<String> data) {
    final RDDWrapper<String> dataWrapper = new RDDWrapper<>(data);
    final long totalRows = dataWrapper.getCount();
    final double totalSize;
    if (totalRows > ROW_SAMPLING_THRESHOLD) {
        log.debug("Start sampling on Write Data.");
        final double fraction = (double) ROW_SAMPLING_THRESHOLD / (double) totalRows;
        log.debug("Sample fraction: {}", fraction);
        final JavaRDD<String> sampleRdd = data.sample(false, fraction);
        final long sampleSizeInBytes = getSampleSizeInBytes(sampleRdd);
        final double sampleSizeInMB = (double) sampleSizeInBytes / FileUtils.ONE_MB;
        totalSize = sampleSizeInMB / fraction;
    } else {
        totalSize = (double) getSampleSizeInBytes(data) / FileUtils.ONE_MB;
    }
    return totalSize;
}
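// The core of the estimate above, shown in isolation as a sketch (the helper names and the
// byte-counting strategy are assumptions, not the original API): measure the bytes in a
// sample and scale by the inverse of the sampling fraction to estimate the full size.
static double estimateSizeInMegabytes(JavaRDD<String> data, long rowThreshold) {
    long totalRows = data.count();
    if (totalRows <= rowThreshold) {
        return bytesOf(data) / (double) FileUtils.ONE_MB;
    }
    double fraction = (double) rowThreshold / (double) totalRows;
    double sampleMegabytes = bytesOf(data.sample(false, fraction)) / (double) FileUtils.ONE_MB;
    return sampleMegabytes / fraction; // scale the sample up to the full data set
}

// Assumed helper: sums UTF-8 encoded line lengths as a rough byte count.
static long bytesOf(JavaRDD<String> rdd) {
    return rdd.map(line -> (long) line.getBytes(java.nio.charset.StandardCharsets.UTF_8).length)
            .fold(0L, Long::sum);
}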
public static void main(String[] args) throws Exception {
    SparkConf conf = new SparkConf().setAppName("sampling").setMaster("local").set("spark.cores.max", "10");
    JavaSparkContext sc = new JavaSparkContext(conf);
    File outputFile = downloadFile("http://kdd.ics.uci.edu/databases/kddcup99/kddcup.data.gz");
    JavaRDD<String> rawData = sc.textFile(outputFile.getAbsolutePath());

    /**
     * Sampling RDDs: in Spark there are two sampling operations, the transformation sample and the action
     * takeSample. With the transformation we can tell Spark to apply successive transformations to a sample
     * of a given RDD. With the action we retrieve a given sample and can hold it in local memory, where it
     * can be used by any other standard library. The sample transformation takes up to three parameters:
     * first, whether the sampling is done with replacement or not; second, the sample size as a fraction;
     * finally, an optional random seed.
     */
    JavaRDD<String> sampledData = rawData.sample(false, 0.1, 1234);
    long sampleDataSize = sampledData.count();
    long rawDataSize = rawData.count();
    System.out.println(rawDataSize + " and after the sampling: " + sampleDataSize);

    /**
     * takeSample(withReplacement, num, [seed]) returns an array with a random sample of num elements of
     * the dataset, with or without replacement, optionally pre-specifying a random number generator seed.
     */
    List<String> sampledDataList = rawData.takeSample(false, 100, 20);
    System.out.println(rawDataSize + " and after the sampling: " + sampledDataList.size());
}
@Override
public SparkStream<T> sample(boolean withReplacement, int number) {
    Preconditions.checkArgument(number >= 0, "Sample size must be non-negative.");
    if (number == 0) {
        return StreamingContext.distributed().empty();
    }
    if (withReplacement) {
        // sample(true, 0.5) returns an RDD of roughly half the size; keep unioning
        // fresh samples until at least `number` elements have been collected, then trim.
        SparkStream<T> sample = new SparkStream<>(rdd.sample(true, 0.5));
        while (sample.count() < number) {
            sample = sample.union(new SparkStream<>(rdd.sample(true, 0.5)));
        }
        if (sample.count() > number) {
            sample = sample.limit(number);
        }
        return sample;
    }
    return shuffle().limit(number);
}
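// A hedged aside, not from the original code: when an exact-size sample is needed and it
// fits in driver memory, takeSample(withReplacement, num) returns exactly `num` elements
// in a single pass, avoiding the union-until-big-enough loop above at the cost of
// collecting the sample to the driver.
List<T> exactSample = rdd.takeSample(true, number);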
@Override
public Tuple<Collection<ExecutionLineageNode>, Collection<ChannelInstance>> evaluate(
        ChannelInstance[] inputs,
        ChannelInstance[] outputs,
        SparkExecutor sparkExecutor,
        OptimizationContext.OperatorContext operatorContext) {
    assert inputs.length == this.getNumInputs();
    assert outputs.length == this.getNumOutputs();

    final RddChannel.Instance input = (RddChannel.Instance) inputs[0];
    final RddChannel.Instance output = (RddChannel.Instance) outputs[0];

    final JavaRDD<Type> inputRdd = input.provideRdd();
    long datasetSize = this.isDataSetSizeKnown() ? this.getDatasetSize() : inputRdd.count();
    int sampleSize = this.getSampleSize(operatorContext);
    long seed = this.getSeed(operatorContext);
    double sampleFraction = ((double) sampleSize) / datasetSize;
    final JavaRDD<Type> outputRdd = inputRdd.sample(false, sampleFraction, seed);
    this.name(outputRdd);
    output.accept(outputRdd, sparkExecutor);

    return ExecutionOperator.modelLazyExecution(inputs, outputs, operatorContext);
}
List<Envelope> samples = this.rawSpatialRDD.sample(false, fraction) .map(new Function<T, Envelope>()