peopleList.add(new HappyPerson("holden", "coffee")); JavaRDD<HappyPerson> happyPeopleRDD = sc.parallelize(peopleList); DataFrame happyPeopleSchemaRDD = sqlCtx.applySchema(happyPeopleRDD, HappyPerson.class); happyPeopleSchemaRDD.registerTempTable("happy_people"); sqlCtx.udf().register("stringLengthJava", new UDF1<String, Integer>() {
/**
 * Creates a {@link DataFrame} from the Cells extracted with the given configuration.
 * <p>
 * The Cells RDD is converted to an RDD of rows, and the schema applied to those rows is
 * derived from the first element of the Cells RDD. The RDD must therefore contain at least
 * one element.
 *
 * @param config Specific Deep ExtractorConfig.
 * @return A DataFrame built from Cells.
 * @throws UnsupportedDataTypeException propagated unchanged from the row conversion or schema
 *         inference helpers invoked here (NOTE(review): the exact source is not visible in
 *         this method; confirm against those helpers).
 * @throws UnsupportedOperationException if the first element cannot be fetched, which is
 *         reported as "Cannot infer schema from empty data RDD" with the original exception
 *         as its cause.
 */
public DataFrame createJavaSchemaRDD(ExtractorConfig<Cells> config)
        throws UnsupportedDataTypeException, UnsupportedOperationException {
    JavaRDD<Cells> cellsRDD = createJavaRDD(config);
    JavaRDD<Row> rowsRDD = DeepSparkContext.createJavaRowRDD(cellsRDD);

    // Only first() signals an empty RDD. Keeping the try this narrow ensures that an
    // UnsupportedOperationException raised by schema inference or by applySchema is not
    // misreported as "empty data RDD".
    Cells firstCells;
    try {
        firstCells = cellsRDD.first();
    } catch (UnsupportedOperationException e) {
        throw new UnsupportedOperationException("Cannot infer schema from empty data RDD", e);
    }

    StructType schema = CellsUtils.getStructTypeFromCells(firstCells);
    return sqlContext.applySchema(rowsRDD, schema);
}
/**
 * Creates a {@link DataFrame} from the Cells extracted with the given configuration.
 * <p>
 * The Cells RDD is converted to an RDD of rows, and the schema applied to those rows is
 * derived from the first element of the Cells RDD. The RDD must therefore contain at least
 * one element.
 *
 * @param config Specific Deep ExtractorConfig.
 * @return A DataFrame built from Cells.
 * @throws UnsupportedDataTypeException propagated unchanged from the row conversion or schema
 *         inference helpers invoked here (NOTE(review): the exact source is not visible in
 *         this method; confirm against those helpers).
 * @throws UnsupportedOperationException if the first element cannot be fetched, which is
 *         reported as "Cannot infer schema from empty data RDD" with the original exception
 *         as its cause.
 */
public DataFrame createJavaSchemaRDD(ExtractorConfig<Cells> config)
        throws UnsupportedDataTypeException, UnsupportedOperationException {
    JavaRDD<Cells> cellsRDD = createJavaRDD(config);
    JavaRDD<Row> rowsRDD = DeepSparkContext.createJavaRowRDD(cellsRDD);

    // Only first() signals an empty RDD. Keeping the try this narrow ensures that an
    // UnsupportedOperationException raised by schema inference or by applySchema is not
    // misreported as "empty data RDD".
    Cells firstCells;
    try {
        firstCells = cellsRDD.first();
    } catch (UnsupportedOperationException e) {
        throw new UnsupportedOperationException("Cannot infer schema from empty data RDD", e);
    }

    StructType schema = CellsUtils.getStructTypeFromCells(firstCells);
    return sqlContext.applySchema(rowsRDD, schema);
}
/**
 * Checks that createJavaSchemaRDD passes the converted row RDD, together with the schema
 * obtained from the first Cells element, to SQLContext.applySchema.
 */
@Test
public void createJavaSchemaRDDTest() throws Exception {
    // Arrange: a spied context whose internal Spark handles are replaced by test doubles.
    deepSparkContext = createDeepSparkContext();
    DeepSparkContext contextSpy = PowerMockito.spy(deepSparkContext);
    SQLContext sqlContextMock = PowerMockito.mock(SQLContext.class);
    ExtractorConfig config = createDeepJobConfig();
    Whitebox.setInternalState(contextSpy, "sc", sparkContext);
    Whitebox.setInternalState(contextSpy, "sqlContext", sqlContextMock);

    // Arrange: placeholder values flowing through the method under test.
    JavaRDD<Row> convertedRows = mock(JavaRDD.class);
    Cells firstCells = mock(Cells.class);
    StructType inferredSchema = mock(StructType.class);

    // Arrange: stub each collaborator in the order the method under test calls it.
    PowerMockito.doReturn(singleRdd).when(contextSpy).createJavaRDD(config);

    mockStatic(DeepSparkContext.class);
    when(DeepSparkContext.createJavaRowRDD(singleRdd)).thenReturn(convertedRows);

    when(singleRdd.first()).thenReturn(firstCells);

    mockStatic(CellsUtils.class);
    when(CellsUtils.getStructTypeFromCells(firstCells)).thenReturn(inferredSchema);

    // Act
    contextSpy.createJavaSchemaRDD(config);

    // Assert: the schema was applied to exactly the converted rows.
    verify(sqlContextMock).applySchema(convertedRows, inferredSchema);
}