/** * Generate a sample of DataSet which contains fixed size elements. * * <p><strong>NOTE:</strong> Sample with fixed size is not as efficient as sample with fraction, use sample with * fraction unless you need exact precision. * * @param withReplacement Whether element can be selected more than once. * @param numSamples The expected sample size. * @param seed Random number generator seed. * @return The sampled DataSet */ public static <T> DataSet<T> sampleWithSize( DataSet <T> input, final boolean withReplacement, final int numSamples, final long seed) { SampleInPartition<T> sampleInPartition = new SampleInPartition<>(withReplacement, numSamples, seed); MapPartitionOperator mapPartitionOperator = input.mapPartition(sampleInPartition); // There is no previous group, so the parallelism of GroupReduceOperator is always 1. String callLocation = Utils.getCallLocationName(); SampleInCoordinator<T> sampleInCoordinator = new SampleInCoordinator<>(withReplacement, numSamples, seed); return new GroupReduceOperator<>(mapPartitionOperator, input.getType(), sampleInCoordinator, callLocation); }
/**
 * Samples the elements of one partition and forwards every intermediate sample to the
 * collector.
 *
 * <p>The sampler is seeded with the configured seed plus the index of this subtask, so
 * different subtasks use different seeds.
 *
 * @param values the elements of the partition to sample from
 * @param out collector receiving the intermediate samples
 * @throws Exception propagated from the sampler or the collector
 */
@Override
public void mapPartition(Iterable<T> values, Collector<IntermediateSampleData<T>> out) throws Exception {
    final long subtaskSeed = seed + getRuntimeContext().getIndexOfThisSubtask();

    // Choose the reservoir sampler variant that matches the replacement mode.
    final DistributedRandomSampler<T> partitionSampler = withReplacement
            ? new ReservoirSamplerWithReplacement<T>(numSample, subtaskSeed)
            : new ReservoirSamplerWithoutReplacement<T>(numSample, subtaskSeed);

    for (Iterator<IntermediateSampleData<T>> it = partitionSampler.sampleInPartition(values.iterator());
            it.hasNext(); ) {
        out.collect(it.next());
    }
}
} // closes the enclosing declaration, whose header lies outside this excerpt
// Partition-level sampling function built with `false` as its first constructor argument,
// followed by the sample size and the fixed SEED.
// NOTE(review): the first argument is presumably the "with replacement" flag - confirm against
// the SampleInPartition constructor. The type is used raw here (no element type argument).
final SampleInPartition sampleInPartition = new SampleInPartition(false, sampleSize, SEED);
// Output type reported by the operator that backs sourceNode's optimizer node.
final TypeInformation<?> sourceOutputType = sourceNode.getOptimizerNode().getOperator().getOperatorInfo().getOutputType();
// Type information derived from the IntermediateSampleData class object alone, i.e. without an
// element type argument.
final TypeInformation<IntermediateSampleData> isdTypeInformation = TypeExtractor.getForClass(IntermediateSampleData.class);
/**
 * Samples the elements of one partition and forwards every intermediate sample to the
 * collector.
 *
 * <p>The sampler is seeded with the configured seed plus the index of this subtask, so
 * different subtasks use different seeds.
 *
 * @param values the elements of the partition to sample from
 * @param out collector receiving the intermediate samples
 * @throws Exception propagated from the sampler or the collector
 */
@Override
public void mapPartition(Iterable<T> values, Collector<IntermediateSampleData<T>> out) throws Exception {
    final long subtaskSeed = seed + getRuntimeContext().getIndexOfThisSubtask();

    // Choose the reservoir sampler variant that matches the replacement mode.
    final DistributedRandomSampler<T> partitionSampler = withReplacement
            ? new ReservoirSamplerWithReplacement<T>(numSample, subtaskSeed)
            : new ReservoirSamplerWithoutReplacement<T>(numSample, subtaskSeed);

    for (Iterator<IntermediateSampleData<T>> it = partitionSampler.sampleInPartition(values.iterator());
            it.hasNext(); ) {
        out.collect(it.next());
    }
}
} // closes the enclosing declaration, whose header lies outside this excerpt
/** * Generate a sample of DataSet which contains fixed size elements. * * <p><strong>NOTE:</strong> Sample with fixed size is not as efficient as sample with fraction, use sample with * fraction unless you need exact precision. * * @param withReplacement Whether element can be selected more than once. * @param numSamples The expected sample size. * @param seed Random number generator seed. * @return The sampled DataSet */ public static <T> DataSet<T> sampleWithSize( DataSet <T> input, final boolean withReplacement, final int numSamples, final long seed) { SampleInPartition<T> sampleInPartition = new SampleInPartition<>(withReplacement, numSamples, seed); MapPartitionOperator mapPartitionOperator = input.mapPartition(sampleInPartition); // There is no previous group, so the parallelism of GroupReduceOperator is always 1. String callLocation = Utils.getCallLocationName(); SampleInCoordinator<T> sampleInCoordinator = new SampleInCoordinator<>(withReplacement, numSamples, seed); return new GroupReduceOperator<>(mapPartitionOperator, input.getType(), sampleInCoordinator, callLocation); }
/**
 * Samples the elements of one partition and forwards every intermediate sample to the
 * collector.
 *
 * <p>The sampler is seeded with the configured seed plus the index of this subtask, so
 * different subtasks use different seeds.
 *
 * @param values the elements of the partition to sample from
 * @param out collector receiving the intermediate samples
 * @throws Exception propagated from the sampler or the collector
 */
@Override
public void mapPartition(Iterable<T> values, Collector<IntermediateSampleData<T>> out) throws Exception {
    final long subtaskSeed = seed + getRuntimeContext().getIndexOfThisSubtask();

    // Choose the reservoir sampler variant that matches the replacement mode.
    final DistributedRandomSampler<T> partitionSampler = withReplacement
            ? new ReservoirSamplerWithReplacement<T>(numSample, subtaskSeed)
            : new ReservoirSamplerWithoutReplacement<T>(numSample, subtaskSeed);

    for (Iterator<IntermediateSampleData<T>> it = partitionSampler.sampleInPartition(values.iterator());
            it.hasNext(); ) {
        out.collect(it.next());
    }
}
} // closes the enclosing declaration, whose header lies outside this excerpt
/** * Generate a sample of DataSet which contains fixed size elements. * * <p><strong>NOTE:</strong> Sample with fixed size is not as efficient as sample with fraction, use sample with * fraction unless you need exact precision. * * @param withReplacement Whether element can be selected more than once. * @param numSamples The expected sample size. * @param seed Random number generator seed. * @return The sampled DataSet */ public static <T> DataSet<T> sampleWithSize( DataSet <T> input, final boolean withReplacement, final int numSamples, final long seed) { SampleInPartition<T> sampleInPartition = new SampleInPartition<>(withReplacement, numSamples, seed); MapPartitionOperator mapPartitionOperator = input.mapPartition(sampleInPartition); // There is no previous group, so the parallelism of GroupReduceOperator is always 1. String callLocation = Utils.getCallLocationName(); SampleInCoordinator<T> sampleInCoordinator = new SampleInCoordinator<>(withReplacement, numSamples, seed); return new GroupReduceOperator<>(mapPartitionOperator, input.getType(), sampleInCoordinator, callLocation); }
// Partition-level sampling function built with `false` as its first constructor argument,
// followed by the sample size and the fixed SEED.
// NOTE(review): the first argument is presumably the "with replacement" flag - confirm against
// the SampleInPartition constructor. The type is used raw here (no element type argument).
final SampleInPartition sampleInPartition = new SampleInPartition(false, sampleSize, SEED);
// Output type reported by the operator that backs sourceNode's optimizer node.
final TypeInformation<?> sourceOutputType = sourceNode.getOptimizerNode().getOperator().getOperatorInfo().getOutputType();
// Type information derived from the IntermediateSampleData class object alone, i.e. without an
// element type argument.
final TypeInformation<IntermediateSampleData> isdTypeInformation = TypeExtractor.getForClass(IntermediateSampleData.class);
// Partition-level sampling function built with `false` as its first constructor argument,
// followed by the sample size and the fixed SEED.
// NOTE(review): the first argument is presumably the "with replacement" flag - confirm against
// the SampleInPartition constructor. The type is used raw here (no element type argument).
final SampleInPartition sampleInPartition = new SampleInPartition(false, sampleSize, SEED);
// Output type reported by the operator that backs sourceNode's optimizer node.
final TypeInformation<?> sourceOutputType = sourceNode.getOptimizerNode().getOperator().getOperatorInfo().getOutputType();
// Type information derived from the IntermediateSampleData class object alone, i.e. without an
// element type argument.
final TypeInformation<IntermediateSampleData> isdTypeInformation = TypeExtractor.getForClass(IntermediateSampleData.class);
// Partition-level sampling function built with `false` as its first constructor argument,
// followed by the sample size and the fixed SEED.
// NOTE(review): the first argument is presumably the "with replacement" flag - confirm against
// the SampleInPartition constructor. The type is used raw here (no element type argument).
final SampleInPartition sampleInPartition = new SampleInPartition(false, sampleSize, SEED);
// Output type reported by the operator that backs sourceNode's optimizer node.
final TypeInformation<?> sourceOutputType = sourceNode.getOptimizerNode().getOperator().getOperatorInfo().getOutputType();
// Type information derived from the IntermediateSampleData class object alone, i.e. without an
// element type argument.
final TypeInformation<IntermediateSampleData> isdTypeInformation = TypeExtractor.getForClass(IntermediateSampleData.class);
// Partition-level sampling function built with `false` as its first constructor argument,
// followed by the sample size and the fixed SEED.
// NOTE(review): the first argument is presumably the "with replacement" flag - confirm against
// the SampleInPartition constructor. The type is used raw here (no element type argument).
final SampleInPartition sampleInPartition = new SampleInPartition(false, sampleSize, SEED);
// Output type reported by the operator that backs sourceNode's optimizer node.
final TypeInformation<?> sourceOutputType = sourceNode.getOptimizerNode().getOperator().getOperatorInfo().getOutputType();
// Type information derived from the IntermediateSampleData class object alone, i.e. without an
// element type argument.
final TypeInformation<IntermediateSampleData> isdTypeInformation = TypeExtractor.getForClass(IntermediateSampleData.class);