/**
 * Caps the amount of evaluation data that is processed.
 *
 * <p>When {@code evalData} holds no more than {@code MAX_SAMPLE_SIZE} elements it is returned
 * unchanged. Otherwise a sample is taken without replacement, using the fraction
 * {@code MAX_SAMPLE_SIZE / count}.
 *
 * @param evalData vectors to evaluate; counted once by this method
 * @return {@code evalData} itself, or a sampled RDD when it exceeds {@code MAX_SAMPLE_SIZE}
 */
static JavaRDD<Vector> fetchSampleData(JavaRDD<Vector> evalData) {
  long totalVectors = evalData.count();
  if (totalVectors <= MAX_SAMPLE_SIZE) {
    return evalData;
  }
  // Cast before dividing so the fraction is not truncated by integer division.
  double sampleFraction = (double) MAX_SAMPLE_SIZE / totalVectors;
  return evalData.sample(false, sampleFraction);
}
/**
 * Records the size of the test set and returns that size as the evaluation result.
 *
 * <p>The count of {@code testData} is appended to {@code testCounts} (narrowed to
 * {@code int}), logged, and returned. The other arguments are not used.
 *
 * @param sparkContext unused
 * @param model unused
 * @param modelParentPath unused
 * @param testData test records; only their count is read
 * @param trainData unused
 * @return the number of elements in {@code testData}
 */
@Override
public double evaluate(JavaSparkContext sparkContext,
                       PMML model,
                       Path modelParentPath,
                       JavaRDD<String> testData,
                       JavaRDD<String> trainData) {
  long testSize = testData.count();
  int recordedSize = (int) testSize;
  testCounts.add(recordedSize);
  log.info("Returning eval {}", testSize);
  return testSize;
}
/**
 * Computes the fraction of examples whose most probable predicted category encoding equals
 * the encoding of the example's target.
 *
 * @param forest forest used to predict each example; its prediction is cast to
 *     {@code CategoricalPrediction}
 * @param examples examples to score; each target is cast to {@code CategoricalFeature}
 * @return correct predictions divided by total examples, or {@code 0.0} when there are no
 *     examples
 */
static double accuracy(DecisionForest forest, JavaRDD<Example> examples) {
  long exampleCount = examples.count();
  if (exampleCount == 0) {
    // Avoid dividing by zero on an empty data set.
    return 0.0;
  }

  JavaRDD<Example> correctlyPredicted = examples.filter(example -> {
    CategoricalPrediction predicted = (CategoricalPrediction) forest.predict(example);
    CategoricalFeature actual = (CategoricalFeature) example.getTarget();
    return predicted.getMostProbableCategoryEncoding() == actual.getEncoding();
  });
  long correctCount = correctlyPredicted.count();

  return (double) correctCount / exampleCount;
}
@Override public PMML buildModel(JavaSparkContext sparkContext, JavaRDD<String> trainData, List<?> hyperParameters, Path candidatePath) { // If lists are unequal at this point, there must have been an empty test set // which yielded no call to evaluate(). Fill in the blank while (trainCounts.size() > testCounts.size()) { testCounts.add(0); } trainCounts.add((int) trainData.count()); return PMMLUtilsTest.buildDummyModel(); }
@Test public void sample() { List<Integer> ints = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10); JavaRDD<Integer> rdd = sc.parallelize(ints); // the seeds here are "magic" to make this work out nicely JavaRDD<Integer> sample20 = rdd.sample(true, 0.2, 8); assertEquals(2, sample20.count()); JavaRDD<Integer> sample20WithoutReplacement = rdd.sample(false, 0.2, 2); assertEquals(2, sample20WithoutReplacement.count()); }
@Test public void sample() { List<Integer> ints = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10); JavaRDD<Integer> rdd = sc.parallelize(ints); // the seeds here are "magic" to make this work out nicely JavaRDD<Integer> sample20 = rdd.sample(true, 0.2, 8); assertEquals(2, sample20.count()); JavaRDD<Integer> sample20WithoutReplacement = rdd.sample(false, 0.2, 2); assertEquals(2, sample20WithoutReplacement.count()); }
@Test public void sample() { List<Integer> ints = Arrays.asList(1, 2, 3, 4, 5, 6, 7, 8, 9, 10); JavaRDD<Integer> rdd = sc.parallelize(ints); // the seeds here are "magic" to make this work out nicely JavaRDD<Integer> sample20 = rdd.sample(true, 0.2, 8); assertEquals(2, sample20.count()); JavaRDD<Integer> sample20WithoutReplacement = rdd.sample(false, 0.2, 2); assertEquals(2, sample20WithoutReplacement.count()); }
/** Verifies that the RDD returned by {@code sc.emptyRDD()} reports a count of zero. */
@Test
public void emptyRDD() {
  JavaRDD<String> empty = sc.emptyRDD();
  long elementCount = empty.count();
  assertEquals("Empty RDD shouldn't have any values", 0, elementCount);
}
/** Verifies that the RDD returned by {@code sc.emptyRDD()} reports a count of zero. */
@Test
public void emptyRDD() {
  JavaRDD<String> empty = sc.emptyRDD();
  long elementCount = empty.count();
  assertEquals("Empty RDD shouldn't have any values", 0, elementCount);
}
/** Verifies that the RDD returned by {@code sc.emptyRDD()} reports a count of zero. */
@Test
public void emptyRDD() {
  JavaRDD<String> empty = sc.emptyRDD();
  long elementCount = empty.count();
  assertEquals("Empty RDD shouldn't have any values", 0, elementCount);
}
/**
 * Builds an RDD from the {@code DATA} column of table {@code FOO} through
 * {@code JdbcRDD.create}, caches it, and checks that it holds 100 values summing to 10100.
 *
 * @throws Exception declared by the test signature
 */
@Test
public void testJavaJdbcRDD() throws Exception {
  // Per the JdbcRDD.create API, the numeric arguments 1, 100, 1 are the lower bound,
  // the upper bound and the number of partitions.
  JavaRDD<Integer> loaded = JdbcRDD.create(
      sc,
      () -> DriverManager.getConnection("jdbc:derby:target/JavaJdbcRDDSuiteDb"),
      "SELECT DATA FROM FOO WHERE ? <= ID AND ID <= ?",
      1,
      100,
      1,
      resultSet -> resultSet.getInt(1));
  JavaRDD<Integer> rdd = loaded.cache();

  Assert.assertEquals(100, rdd.count());

  Integer total = rdd.reduce(Integer::sum);
  Assert.assertEquals(Integer.valueOf(10100), total);
}
}
/**
 * Builds an RDD from the {@code DATA} column of table {@code FOO} through
 * {@code JdbcRDD.create}, caches it, and checks that it holds 100 values summing to 10100.
 *
 * @throws Exception declared by the test signature
 */
@Test
public void testJavaJdbcRDD() throws Exception {
  // Per the JdbcRDD.create API, the numeric arguments 1, 100, 1 are the lower bound,
  // the upper bound and the number of partitions.
  JavaRDD<Integer> loaded = JdbcRDD.create(
      sc,
      () -> DriverManager.getConnection("jdbc:derby:target/JavaJdbcRDDSuiteDb"),
      "SELECT DATA FROM FOO WHERE ? <= ID AND ID <= ?",
      1,
      100,
      1,
      resultSet -> resultSet.getInt(1));
  JavaRDD<Integer> rdd = loaded.cache();

  Assert.assertEquals(100, rdd.count());

  Integer total = rdd.reduce(Integer::sum);
  Assert.assertEquals(Integer.valueOf(10100), total);
}
}
/**
 * Builds an RDD from the {@code DATA} column of table {@code FOO} through
 * {@code JdbcRDD.create}, caches it, and checks that it holds 100 values summing to 10100.
 *
 * @throws Exception declared by the test signature
 */
@Test
public void testJavaJdbcRDD() throws Exception {
  // Per the JdbcRDD.create API, the numeric arguments 1, 100, 1 are the lower bound,
  // the upper bound and the number of partitions.
  JavaRDD<Integer> loaded = JdbcRDD.create(
      sc,
      () -> DriverManager.getConnection("jdbc:derby:target/JavaJdbcRDDSuiteDb"),
      "SELECT DATA FROM FOO WHERE ? <= ID AND ID <= ?",
      1,
      100,
      1,
      resultSet -> resultSet.getInt(1));
  JavaRDD<Integer> rdd = loaded.cache();

  Assert.assertEquals(100, rdd.count());

  Integer total = rdd.reduce(Integer::sum);
  Assert.assertEquals(Integer.valueOf(10100), total);
}
}
/**
 * Passes the RDD from {@code getRddOfVectors()} through
 * {@code SilhouetteCoefficient.fetchSampleData} and expects six elements back.
 */
@Test
public void testFetchSampleEvalData() {
  JavaRDD<Vector> allVectors = getRddOfVectors();
  JavaRDD<Vector> evalData = SilhouetteCoefficient.fetchSampleData(allVectors);
  long evalSize = evalData.count();
  assertEquals(6, evalSize);
}
@Test public void checkpointAndComputation() { JavaRDD<Integer> rdd = sc.parallelize(Arrays.asList(1, 2, 3, 4, 5)); sc.setCheckpointDir(tempDir.getAbsolutePath()); assertFalse(rdd.isCheckpointed()); rdd.checkpoint(); rdd.count(); // Forces the DAG to cause a checkpoint assertTrue(rdd.isCheckpointed()); assertEquals(Arrays.asList(1, 2, 3, 4, 5), rdd.collect()); }
@Test public void checkpointAndComputation() { JavaRDD<Integer> rdd = sc.parallelize(Arrays.asList(1, 2, 3, 4, 5)); sc.setCheckpointDir(tempDir.getAbsolutePath()); assertFalse(rdd.isCheckpointed()); rdd.checkpoint(); rdd.count(); // Forces the DAG to cause a checkpoint assertTrue(rdd.isCheckpointed()); assertEquals(Arrays.asList(1, 2, 3, 4, 5), rdd.collect()); }
@Test public void checkpointAndComputation() { JavaRDD<Integer> rdd = sc.parallelize(Arrays.asList(1, 2, 3, 4, 5)); sc.setCheckpointDir(tempDir.getAbsolutePath()); assertFalse(rdd.isCheckpointed()); rdd.checkpoint(); rdd.count(); // Forces the DAG to cause a checkpoint assertTrue(rdd.isCheckpointed()); assertEquals(Arrays.asList(1, 2, 3, 4, 5), rdd.collect()); }
@Test public void checkpointAndRestore() { JavaRDD<Integer> rdd = sc.parallelize(Arrays.asList(1, 2, 3, 4, 5)); sc.setCheckpointDir(tempDir.getAbsolutePath()); assertFalse(rdd.isCheckpointed()); rdd.checkpoint(); rdd.count(); // Forces the DAG to cause a checkpoint assertTrue(rdd.isCheckpointed()); assertTrue(rdd.getCheckpointFile().isPresent()); JavaRDD<Integer> recovered = sc.checkpointFile(rdd.getCheckpointFile().get()); assertEquals(Arrays.asList(1, 2, 3, 4, 5), recovered.collect()); }
@Test public void checkpointAndRestore() { JavaRDD<Integer> rdd = sc.parallelize(Arrays.asList(1, 2, 3, 4, 5)); sc.setCheckpointDir(tempDir.getAbsolutePath()); assertFalse(rdd.isCheckpointed()); rdd.checkpoint(); rdd.count(); // Forces the DAG to cause a checkpoint assertTrue(rdd.isCheckpointed()); assertTrue(rdd.getCheckpointFile().isPresent()); JavaRDD<Integer> recovered = sc.checkpointFile(rdd.getCheckpointFile().get()); assertEquals(Arrays.asList(1, 2, 3, 4, 5), recovered.collect()); }
@Test public void checkpointAndRestore() { JavaRDD<Integer> rdd = sc.parallelize(Arrays.asList(1, 2, 3, 4, 5)); sc.setCheckpointDir(tempDir.getAbsolutePath()); assertFalse(rdd.isCheckpointed()); rdd.checkpoint(); rdd.count(); // Forces the DAG to cause a checkpoint assertTrue(rdd.isCheckpointed()); assertTrue(rdd.getCheckpointFile().isPresent()); JavaRDD<Integer> recovered = sc.checkpointFile(rdd.getCheckpointFile().get()); assertEquals(Arrays.asList(1, 2, 3, 4, 5), recovered.collect()); }