public static String buildFilePath(ExtractorConfig extractorConfig) throws IllegalArgumentException {
    if (ExtractorConstants.HDFS.equals(extractorConfig.getExtractorImplClassName())) {
        String host = extractorConfig.getString(ExtractorConstants.HOST);
        String port = extractorConfig.getString(ExtractorConstants.PORT);
        String path = extractorConfig.getString(ExtractorConstants.FS_FILE_PATH);
        // Drop the file name, keeping the parent directory of the data file.
        path = path.substring(0, path.lastIndexOf("/"));
        return ExtractorConstants.HDFS_PREFIX + host + ":" + port + path + "/" + METADATA_FILE;
    } else if (ExtractorConstants.S3.equals(extractorConfig.getExtractorImplClassName())) {
        String bucket = extractorConfig.getString(ExtractorConstants.S3_BUCKET);
        String path = extractorConfig.getString(ExtractorConstants.FS_FILE_PATH);
        path = path.substring(0, path.lastIndexOf("/"));
        return extractorConfig.getString(ExtractorConstants.FS_PREFIX) + bucket + path + "/" + METADATA_FILE;
    }
    throw new IllegalArgumentException("Configured ExtractorImplClassName must be 'hdfs' or 's3'");
}
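// Usage sketch (illustrative, not from the source): assuming HDFS_PREFIX is "hdfs://"
// and METADATA_FILE names a schema file stored next to the data file, buildFilePath
// resolves the metadata path from the same config used for extraction.
ExtractorConfig<Cells> hdfsConfig = new ExtractorConfig<>(Cells.class);
hdfsConfig.setExtractorImplClassName(ExtractorConstants.HDFS);
hdfsConfig.putValue(ExtractorConstants.HOST, "localhost");
hdfsConfig.putValue(ExtractorConstants.PORT, "9000");
hdfsConfig.putValue(ExtractorConstants.FS_FILE_PATH, "/data/input.csv");
String metadataPath = buildFilePath(hdfsConfig);
// e.g. "hdfs://localhost:9000/data/" + METADATA_FILE, under the assumptions above.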
setExtractorImplClassName(extractorConfig.getExtractorImplClassName());
setEntityClass(extractorConfig.getEntityClass());
setRddId(extractorConfig.getRddId());
setPartitionId(extractorConfig.getPartitionId());
Map<String, Serializable> values = extractorConfig.getValues();
username(extractorConfig.getString(USERNAME));
password(extractorConfig.getString(PASSWORD));
host(extractorConfig.getStringArray(HOST));
port(extractorConfig.getInteger(ES_REST_PORTS));
// Note: this second call overwrites the ES_REST_PORTS value; the generic PORT key wins.
port(extractorConfig.getInteger(PORT));
table(extractorConfig.getString(COLLECTION));
inputColumns(extractorConfig.getStringArray(INPUT_COLUMNS));
catalog(extractorConfig.getString(DATABASE));
filters(extractorConfig.getFilterArray(FILTER_QUERY));
super.initialize(extractorConfig);
Map<String, Serializable> values = extractorConfig.getValues();
batchSize(extractorConfig.getInteger(BATCHSIZE));
cqlPort(extractorConfig.getInteger(CQLPORT));
rpcPort(extractorConfig.getInteger(RPCPORT));
createTableOnWrite(extractorConfig.getBoolean(CREATE_ON_WRITE));
pageSize(extractorConfig.getInteger(PAGE_SIZE));
readConsistencyLevel(extractorConfig.getString(READ_CONSISTENCY_LEVEL));
writeConsistencyLevel(extractorConfig.getString(WRITE_CONSISTENCY_LEVEL));
bisectFactor(extractorConfig.getInteger(BISECT_FACTOR));
filters(extractorConfig.getFilterArray(ExtractorConstants.FILTER_QUERY));
setEqualsInValue((EqualsInValue) extractorConfig.getValue(EqualsInValue.class, ExtractorConstants.EQUALS_IN_FILTER));
@Test
public void cloneObjectWithParentsTest() {
    Map<String, Serializable> map = new HashMap<>();
    map.put("key1", "val1");
    map.put("key2", "val2");
    map.put("key3", "val3");

    ExtractorConfig<Cells> extractorConfig = new ExtractorConfig<>();
    extractorConfig.setExtractorImplClassName("testExtractor");
    extractorConfig.setValues(map);

    ExtractorConfig<Cells> clone = extractorConfig.clone();
    assertEquals(clone.getExtractorImplClassName(), extractorConfig.getExtractorImplClassName());
    assertEquals(clone.getValues(), extractorConfig.getValues());

    // Mutating the original map must not leak into the clone's values.
    map.remove("key1");
    assertEquals(map.size(), 2);
    assertNotEquals(clone.getValues(), extractorConfig.getValues());
}
/**
 * Returns a Cells RDD from the S3 file system.
 *
 * @param config Amazon S3 ExtractorConfig.
 * @return RDD of Cells.
 */
public RDD<Cells> createS3RDD(ExtractorConfig<Cells> config) {
    Serializable bucket = config.getValues().get(ExtractorConstants.S3_BUCKET);
    Serializable path = config.getValues().get(ExtractorConstants.FS_FILE_PATH);
    final TextFileDataTable textFileDataTable = UtilFS.createTextFileMetaDataFromConfig(config, this);
    String filePath = path.toString();
    if (config.getExtractorImplClassName().equals(ExtractorConstants.S3)) {
        filePath = ExtractorConstants.S3_PREFIX + bucket.toString() + path.toString();
    }
    Configuration hadoopConf = this.sc().hadoopConfiguration();
    hadoopConf.set("fs.s3n.impl", "org.apache.hadoop.fs.s3native.NativeS3FileSystem");
    hadoopConf.set("fs.s3n.awsAccessKeyId", config.getString(ExtractorConstants.S3_ACCESS_KEY_ID));
    hadoopConf.set("fs.s3n.awsSecretAccessKey", config.getString(ExtractorConstants.S3_SECRET_ACCESS_KEY));
    return createRDDFromFilePath(filePath, textFileDataTable);
}
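// Usage sketch (illustrative bucket, path, and placeholder credentials): the keys
// createS3RDD reads, wired into a config. FS_FILE_SEPARATOR is included because
// UtilFS.createTextFileMetaDataFromConfig requires it.
ExtractorConfig<Cells> s3Config = new ExtractorConfig<>(Cells.class);
s3Config.setExtractorImplClassName(ExtractorConstants.S3);
s3Config.putValue(ExtractorConstants.S3_BUCKET, "my-bucket");
s3Config.putValue(ExtractorConstants.FS_FILE_PATH, "/data/input.csv");
s3Config.putValue(ExtractorConstants.S3_ACCESS_KEY_ID, "<aws-access-key-id>");
s3Config.putValue(ExtractorConstants.S3_SECRET_ACCESS_KEY, "<aws-secret-access-key>");
s3Config.putValue(ExtractorConstants.FS_FILE_SEPARATOR, ",");
RDD<Cells> s3Rdd = deepSparkContext.createS3RDD(s3Config);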
@Override
public ESDeepJobConfig<T> initialize(ExtractorConfig extractorConfig) {
    super.initialize(extractorConfig);
    Map<String, Serializable> values = extractorConfig.getValues();
    if (values.get(INPUT_COLUMNS) != null) {
        inputColumns(extractorConfig.getStringArray(INPUT_COLUMNS));
    }
    if (values.get(FILTER_QUERY) != null) {
        filterQuery(extractorConfig.getFilterArray(FILTER_QUERY));
    }
    this.initialize();
    return this;
}
/**
 * Returns a Cells RDD from HDFS.
 *
 * @param config HDFS ExtractorConfig.
 * @return Cells RDD.
 */
public RDD<Cells> createHDFSRDD(ExtractorConfig<Cells> config) {
    Serializable host = config.getValues().get(ExtractorConstants.HOST);
    Serializable port = config.getValues().get(ExtractorConstants.PORT);
    Serializable path = config.getValues().get(ExtractorConstants.FS_FILE_PATH);
    final TextFileDataTable textFileDataTable = UtilFS.createTextFileMetaDataFromConfig(config, this);
    String filePath = path.toString();
    if (config.getExtractorImplClassName().equals(ExtractorConstants.HDFS)) {
        filePath = ExtractorConstants.HDFS_PREFIX + host.toString() + ":" + port + path.toString();
    }
    return createRDDFromFilePath(filePath, textFileDataTable);
}
@Test
public void testGetIntegerWorkaroundPorts() throws Exception {
    String value = "1234,1234,1234";
    ExtractorConfig extractorConfig = new ExtractorConfig();
    Map<String, Serializable> values = new HashMap<>();
    values.put(value, value);
    extractorConfig.setValues(values);

    Integer result = extractorConfig.getInteger(value);
    // getInteger() is expected to take the first entry of a comma-separated port list.
    String expectedValue = value.split(",")[0];
    assertEquals("The result must be " + expectedValue, Integer.valueOf(expectedValue), result);
}
/**
 * Instantiates the extractor implementation declared in the given config.
 *
 * @param config the extractor config
 */
@SuppressWarnings("unchecked")
private void initExtractor(ExtractorConfig<T> config) {
    try {
        Class<T> rdd = config.getExtractorImplClass();
        if (rdd == null) {
            rdd = (Class<T>) Class.forName(config.getExtractorImplClassName());
        }
        Constructor<T> c;
        if (config.getEntityClass().isAssignableFrom(Cells.class)) {
            // Cells-based extractors expose a no-arg constructor.
            c = rdd.getConstructor();
            this.extractor = (IExtractor<T, ExtractorConfig<T>>) c.newInstance();
        } else {
            // Entity-based extractors take the entity class as a constructor argument.
            c = rdd.getConstructor(Class.class);
            this.extractor = (IExtractor<T, ExtractorConfig<T>>) c.newInstance(config.getEntityClass());
        }
    } catch (ClassNotFoundException | InstantiationException | IllegalAccessException | IllegalArgumentException
            | InvocationTargetException | NoSuchMethodException | SecurityException e) {
        LOG.error("Impossible to make an extractor instance, check classpath " + e.getMessage());
        throw new DeepInstantiationException(
                "Impossible to make an extractor instance, check classpath " + e.getMessage());
    }
}
/**
 * Gets the filter array stored under the given key.
 *
 * @param key the key
 * @return the filter array
 */
public Filter[] getFilterArray(String key) {
    return getValue(Filter[].class, key);
}
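// Usage sketch (assuming an extractorConfig populated as in the initialize(...)
// methods above): typed retrieval of the filters pushed down to an extractor.
Filter[] filters = extractorConfig.getFilterArray(ExtractorConstants.FILTER_QUERY);
if (filters != null) {
    for (Filter filter : filters) {
        // translate or apply each pushed-down filter here
    }
}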
private ExtractorConfig createS3DeepJobConfig() {
    ExtractorConfig extractorConfig = new ExtractorConfig();
    extractorConfig.setExtractorImplClassName(ExtractorConstants.S3);

    Map<String, Serializable> values = new HashMap<>();
    values.put(ExtractorConstants.FS_FILE_SEPARATOR, ",");
    values.put(ExtractorConstants.S3_BUCKET, "bucket");
    values.put(ExtractorConstants.FS_PREFIX, ExtractorConstants.S3_PREFIX);
    values.put(ExtractorConstants.FS_FILE_PATH, "/s3test.csv");
    values.put(ExtractorConstants.S3_TYPE, ExtractorConstants.S3_TYPE);
    values.put(ExtractorConstants.CATALOG, "");
    values.put(ExtractorConstants.TABLE, "");

    extractorConfig.setValues(values);
    return extractorConfig;
}
@Test
public void initConfigTest() {
    ExtractorConfig extractorConfig = new ExtractorConfig();
    extractorConfig.setExtractorImplClassName("testExtractor");

    DeepJobConfig deepJobConfig = new DeepJobConfig();
    deepJobConfig.setExtractorImplClassName("testDeepJobConfig");

    ChildDeepJobConfig childDeepJobConfig = new ChildDeepJobConfig();
    childDeepJobConfig.setExtractorImplClassName("testChildDeepJobConfig");
    childDeepJobConfig.setTestFiled("testField");

    ChildDeepJobConfig test = initConfig(extractorConfig, new ChildDeepJobConfig());
    assertEquals(test.getExtractorImplClassName(), extractorConfig.getExtractorImplClassName());
    assertNull(test.getTestFiled());

    test = initConfig(deepJobConfig, new ChildDeepJobConfig());
    assertEquals(test.getExtractorImplClassName(), deepJobConfig.getExtractorImplClassName());
    assertNull(test.getTestFiled());

    test = initConfig(childDeepJobConfig, new ChildDeepJobConfig());
    assertEquals(test.getExtractorImplClassName(), childDeepJobConfig.getExtractorImplClassName());
    assertEquals(test.getTestFiled(), childDeepJobConfig.getTestFiled());
}
public static TextFileDataTable createTextFileMetaDataFromConfig(ExtractorConfig<Cells> extractorConfig,
        DeepSparkContext deepSparkContext) {
    Serializable separator = extractorConfig.getValues().get(ExtractorConstants.FS_FILE_SEPARATOR);
    String catalogName = (String) extractorConfig.getValues().get(ExtractorConstants.CATALOG);
    String tableName = (String) extractorConfig.getValues().get(ExtractorConstants.TABLE);
    final String splitSep = separator.toString();

    if (extractorConfig.getValues().get(ExtractorConstants.FS_FILEDATATABLE) != null) {
        // A fully built TextFileDataTable takes precedence over everything else.
        return (TextFileDataTable) extractorConfig.getValues().get(ExtractorConstants.FS_FILEDATATABLE);
    } else if (extractorConfig.getValues().get(ExtractorConstants.FS_SCHEMA) != null) {
        // Next, build the table from an explicit schema supplied in the config.
        final ArrayList<SchemaMap<?>> columns =
                (ArrayList<SchemaMap<?>>) extractorConfig.getValues().get(ExtractorConstants.FS_SCHEMA);
        final TextFileDataTable textFileDataTable =
                new TextFileDataTable(new TableName(catalogName, tableName), columns);
        textFileDataTable.setLineSeparator(splitSep);
        return textFileDataTable;
    } else {
        // Otherwise, read the schema from the metadata file stored next to the data.
        final TextFileDataTable textFileDataTable =
                createTextFileFromSchemaFile(buildFilePath(extractorConfig), deepSparkContext);
        textFileDataTable.setLineSeparator(splitSep);
        return textFileDataTable;
    }
}
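// Usage sketch of the first resolution path above: a caller that already holds the
// table metadata (the schemaColumns list is assumed to be built elsewhere, as in the
// FS_SCHEMA branch) can bypass schema discovery by passing a prebuilt table directly.
// Catalog and table names here are illustrative.
TextFileDataTable prebuilt = new TextFileDataTable(new TableName("mycatalog", "mytable"), schemaColumns);
prebuilt.setLineSeparator(",");
extractorConfig.putValue(ExtractorConstants.FS_FILEDATATABLE, prebuilt);
// createTextFileMetaDataFromConfig(extractorConfig, deepSparkContext) now returns this instance.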
@Test
public void testInitialize() throws Exception {
    ExtractorConfig<Cells> extractorConfig = new ExtractorConfig<>(Cells.class);
    extractorConfig.putValue(ExtractorConstants.HOST, testHost);
    extractorConfig.putValue(ExtractorConstants.PORT, testPort);
    extractorConfig.putValue(ExtractorConstants.CATALOG, testCatalog);
    extractorConfig.putValue(ExtractorConstants.TABLE, testTable);
    extractorConfig.putValue(ExtractorConstants.USERNAME, testUserName);
    extractorConfig.putValue(ExtractorConstants.PASSWORD, testPassword);
    extractorConfig.putValue(ExtractorConstants.INPUT_COLUMNS, testInputColumns);
    extractorConfig.putValue(ExtractorConstants.FILTER_QUERY, testFilter);
@Test
public void createS3RDDTest() throws Exception {
    deepSparkContext = createDeepSparkContext();
    Configuration hadoopConf = mock(Configuration.class);
    when(sparkContext.hadoopConfiguration()).thenReturn(hadoopConf);

    DeepSparkContext deepSparkContextSpy = PowerMockito.spy(deepSparkContext);
    SQLContext sqlContext = mock(SQLContext.class);
    Whitebox.setInternalState(deepSparkContextSpy, "sc", sparkContext);
    Whitebox.setInternalState(deepSparkContextSpy, "sqlContext", sqlContext);

    RDD<String> rdd = mock(RDD.class);
    JavaRDD<String> javaRDD = mock(JavaRDD.class);
    when(deepSparkContextSpy.sc().textFile(anyString(), anyInt())).thenReturn(rdd);
    doReturn(javaRDD).when(deepSparkContextSpy).textFile(anyString());
    when(rdd.toJavaRDD()).thenReturn(javaRDD);
    when(rdd.toJavaRDD().map(any(Function.class))).thenReturn(singleRdd);

    ExtractorConfig<Cells> config = createS3DeepJobConfig();
    deepSparkContextSpy.createS3RDD(config);

    verify(hadoopConf, times(1)).set("fs.s3n.awsAccessKeyId",
            config.getString(ExtractorConstants.S3_ACCESS_KEY_ID));
    verify(hadoopConf, times(1)).set("fs.s3n.awsSecretAccessKey",
            config.getString(ExtractorConstants.S3_SECRET_ACCESS_KEY));
    verify(deepSparkContextSpy.sc(), times(1)).textFile(anyString(), anyInt());
    verify(javaRDD, times(1)).map(any(Function.class));
}
/**
 * Returns a Cells RDD from an HDFS or S3 ExtractorConfig.
 *
 * @param config ExtractorConfig for HDFS or S3.
 * @return RDD of Cells.
 * @throws IllegalArgumentException if the config targets neither HDFS nor S3.
 */
public RDD<Cells> textFile(ExtractorConfig<Cells> config) throws IllegalArgumentException {
    if (ExtractorConstants.HDFS.equals(config.getExtractorImplClassName())) {
        return createHDFSRDD(config);
    } else if (ExtractorConstants.S3.equals(config.getExtractorImplClassName())) {
        return createS3RDD(config);
    }
    throw new IllegalArgumentException("Valid configurations are HDFS or S3 extractor configs.");
}
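// Usage sketch (illustrative host and path): the same entry point serves both back
// ends, dispatching on the configured extractor implementation name. This HDFS
// example complements the S3 config shown after createS3RDD above.
ExtractorConfig<Cells> fsConfig = new ExtractorConfig<>(Cells.class);
fsConfig.setExtractorImplClassName(ExtractorConstants.HDFS);
fsConfig.putValue(ExtractorConstants.HOST, "namenode.local");
fsConfig.putValue(ExtractorConstants.PORT, "9000");
fsConfig.putValue(ExtractorConstants.FS_FILE_PATH, "/data/input.csv");
fsConfig.putValue(ExtractorConstants.FS_FILE_SEPARATOR, ",");
RDD<Cells> cellsRDD = deepSparkContext.textFile(fsConfig);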
/**
 * Checks whether the configured entity class is {@link Cells}.
 *
 * @param extractorConfig the extractor config
 * @return true if the entity class is assignable from Cells
 */
private boolean isEntityClassCells(ExtractorConfig extractorConfig) {
    return extractorConfig.getEntityClass().isAssignableFrom(Cells.class);
}
/**
 * Gets a new extractor config for the given entity class.
 *
 * @param clazz the entity class
 * @return the extractor config
 */
protected <W> ExtractorConfig<W> getExtractorConfig(Class<W> clazz) {
    return new ExtractorConfig<>(clazz);
}
private ExtractorConfig createDeepJobConfig() {
    ExtractorConfig extractorConfig = mock(ExtractorConfig.class);
    when(extractorConfig.getExtractorImplClass()).thenReturn(Object.class);
    // when(extractorConfig.getInputFormatClass()).thenReturn(null);
    return extractorConfig;
}