/**
 * Creates a Cells RDD for the file system selected by the given configuration.
 *
 * <p>The extractor implementation class name of the configuration is compared against
 * {@code ExtractorConstants.HDFS} and {@code ExtractorConstants.S3}; the matching factory
 * method ({@link #createHDFSRDD} or {@link #createS3RDD}) builds the RDD.
 *
 * @param config ExtractorConfig for HDFS or S3.
 * @return RDD of Cells.
 * @throws IllegalArgumentException if the configured implementation name matches neither
 *                                  the HDFS nor the S3 constant.
 */
public RDD textFile(ExtractorConfig<Cells> config) throws IllegalArgumentException {
    final String implName = config.getExtractorImplClassName();

    if (ExtractorConstants.HDFS.equals(implName)) {
        return createHDFSRDD(config);
    }
    if (ExtractorConstants.S3.equals(implName)) {
        return createS3RDD(config);
    }

    // Neither of the supported back ends was selected.
    throw new IllegalArgumentException("Valid configurations are HDFS paths, S3 paths or local file paths.");
}
/**
 * Creates a Cells RDD for the file system selected by the given configuration.
 *
 * <p>The extractor implementation class name of the configuration is compared against
 * {@code ExtractorConstants.HDFS} and {@code ExtractorConstants.S3}; the matching factory
 * method ({@link #createHDFSRDD} or {@link #createS3RDD}) builds the RDD.
 *
 * @param config ExtractorConfig for HDFS or S3.
 * @return RDD of Cells.
 * @throws IllegalArgumentException if the configured implementation name matches neither
 *                                  the HDFS nor the S3 constant.
 */
public RDD textFile(ExtractorConfig<Cells> config) throws IllegalArgumentException {
    final String implName = config.getExtractorImplClassName();

    if (ExtractorConstants.HDFS.equals(implName)) {
        return createHDFSRDD(config);
    }
    if (ExtractorConstants.S3.equals(implName)) {
        return createS3RDD(config);
    }

    // Neither of the supported back ends was selected.
    throw new IllegalArgumentException("Valid configurations are HDFS paths, S3 paths or local file paths.");
}
/**
 * Builds the location of the metadata file that lives in the same directory as the
 * configured data file.
 *
 * <p>For an HDFS configuration the result is
 * {@code HDFS_PREFIX + host + ":" + port + parentDir + "/" + METADATA_FILE}; for an S3
 * configuration it is {@code FS_PREFIX value + bucket + parentDir + "/" + METADATA_FILE},
 * where {@code parentDir} is the configured file path up to (not including) its last
 * {@code '/'}. A file path that contains no {@code '/'} is treated as having an empty
 * parent directory instead of failing with a {@code StringIndexOutOfBoundsException}.
 *
 * @param extractorConfig configuration holding the implementation name and the
 *                        host/port (HDFS) or prefix/bucket (S3) plus the file path.
 * @return the full path of the metadata file.
 * @throws IllegalArgumentException if the configured implementation name matches neither
 *                                  the HDFS nor the S3 constant.
 */
public static String buildFilePath(ExtractorConfig extractorConfig) throws IllegalArgumentException {
    String implName = extractorConfig.getExtractorImplClassName();
    boolean isHdfs = ExtractorConstants.HDFS.equals(implName);
    if (!isHdfs && !ExtractorConstants.S3.equals(implName)) {
        throw new IllegalArgumentException("Configured ExtractorImplClassName must be 'hdfs' or 's3'");
    }

    String root;
    if (isHdfs) {
        String host = extractorConfig.getString(ExtractorConstants.HOST);
        String port = extractorConfig.getString(ExtractorConstants.PORT);
        // host is dereferenced on purpose: a missing host keeps failing fast here
        // rather than silently producing "null" inside the URI.
        root = ExtractorConstants.HDFS_PREFIX + host.toString() + ":" + port;
    } else {
        String bucket = extractorConfig.getString(ExtractorConstants.S3_BUCKET);
        root = extractorConfig.getString(ExtractorConstants.FS_PREFIX) + bucket;
    }

    String path = extractorConfig.getString(ExtractorConstants.FS_FILE_PATH);
    // lastIndexOf returns -1 when there is no separator; substring(0, -1) would throw.
    int lastSlash = path.lastIndexOf('/');
    String parentDir = lastSlash < 0 ? "" : path.substring(0, lastSlash);

    return root + parentDir + "/" + METADATA_FILE;
}
/**
 * Builds the location of the metadata file that lives in the same directory as the
 * configured data file.
 *
 * <p>For an HDFS configuration the result is
 * {@code HDFS_PREFIX + host + ":" + port + parentDir + "/" + METADATA_FILE}; for an S3
 * configuration it is {@code FS_PREFIX value + bucket + parentDir + "/" + METADATA_FILE},
 * where {@code parentDir} is the configured file path up to (not including) its last
 * {@code '/'}. A file path that contains no {@code '/'} is treated as having an empty
 * parent directory instead of failing with a {@code StringIndexOutOfBoundsException}.
 *
 * @param extractorConfig configuration holding the implementation name and the
 *                        host/port (HDFS) or prefix/bucket (S3) plus the file path.
 * @return the full path of the metadata file.
 * @throws IllegalArgumentException if the configured implementation name matches neither
 *                                  the HDFS nor the S3 constant.
 */
public static String buildFilePath(ExtractorConfig extractorConfig) throws IllegalArgumentException {
    String implName = extractorConfig.getExtractorImplClassName();
    boolean isHdfs = ExtractorConstants.HDFS.equals(implName);
    if (!isHdfs && !ExtractorConstants.S3.equals(implName)) {
        throw new IllegalArgumentException("Configured ExtractorImplClassName must be 'hdfs' or 's3'");
    }

    String root;
    if (isHdfs) {
        String host = extractorConfig.getString(ExtractorConstants.HOST);
        String port = extractorConfig.getString(ExtractorConstants.PORT);
        // host is dereferenced on purpose: a missing host keeps failing fast here
        // rather than silently producing "null" inside the URI.
        root = ExtractorConstants.HDFS_PREFIX + host.toString() + ":" + port;
    } else {
        String bucket = extractorConfig.getString(ExtractorConstants.S3_BUCKET);
        root = extractorConfig.getString(ExtractorConstants.FS_PREFIX) + bucket;
    }

    String path = extractorConfig.getString(ExtractorConstants.FS_FILE_PATH);
    // lastIndexOf returns -1 when there is no separator; substring(0, -1) would throw.
    int lastSlash = path.lastIndexOf('/');
    String parentDir = lastSlash < 0 ? "" : path.substring(0, lastSlash);

    return root + parentDir + "/" + METADATA_FILE;
}
/**
 * Builds a Cells RDD backed by a file stored in HDFS.
 *
 * <p>Host, port and file path are read from the configuration values. When the configured
 * implementation name equals {@code ExtractorConstants.HDFS} the location handed to
 * {@code createRDDFromFilePath} is {@code HDFS_PREFIX + host + ":" + port + path};
 * otherwise the bare path is used.
 *
 * @param config HDFS ExtractorConfig.
 * @return Cells RDD.
 */
public RDD<Cells> createHDFSRDD(ExtractorConfig<Cells> config) {
    final Serializable hostValue = config.getValues().get(ExtractorConstants.HOST);
    final Serializable portValue = config.getValues().get(ExtractorConstants.PORT);
    final Serializable pathValue = config.getValues().get(ExtractorConstants.FS_FILE_PATH);

    // Table metadata is derived from the same configuration before the location is resolved.
    final TextFileDataTable tableMetadata = UtilFS.createTextFileMetaDataFromConfig(config, this);

    final String rawPath = pathValue.toString();
    final String location = config.getExtractorImplClassName().equals(ExtractorConstants.HDFS)
            ? ExtractorConstants.HDFS_PREFIX + hostValue.toString() + ":" + portValue + rawPath
            : rawPath;

    return createRDDFromFilePath(location, tableMetadata);
}
/**
 * Builds a Cells RDD backed by a file stored in HDFS.
 *
 * <p>Host, port and file path are read from the configuration values. When the configured
 * implementation name equals {@code ExtractorConstants.HDFS} the location handed to
 * {@code createRDDFromFilePath} is {@code HDFS_PREFIX + host + ":" + port + path};
 * otherwise the bare path is used.
 *
 * @param config HDFS ExtractorConfig.
 * @return Cells RDD.
 */
public RDD<Cells> createHDFSRDD(ExtractorConfig<Cells> config) {
    final Serializable hostValue = config.getValues().get(ExtractorConstants.HOST);
    final Serializable portValue = config.getValues().get(ExtractorConstants.PORT);
    final Serializable pathValue = config.getValues().get(ExtractorConstants.FS_FILE_PATH);

    // Table metadata is derived from the same configuration before the location is resolved.
    final TextFileDataTable tableMetadata = UtilFS.createTextFileMetaDataFromConfig(config, this);

    final String rawPath = pathValue.toString();
    final String location = config.getExtractorImplClassName().equals(ExtractorConstants.HDFS)
            ? ExtractorConstants.HDFS_PREFIX + hostValue.toString() + ":" + portValue + rawPath
            : rawPath;

    return createRDDFromFilePath(location, tableMetadata);
}
/**
 * Instantiates the extractor implementation described by the configuration and stores it
 * in {@code this.extractor}.
 *
 * <p>The implementation class is taken from {@code config.getExtractorImplClass()}; when
 * that is {@code null} it is loaded by name with {@link Class#forName(String)}. If the
 * entity class is assignable from {@code Cells} the no-argument constructor is used,
 * otherwise the constructor taking a {@code Class} is invoked with the entity class.
 *
 * @param config configuration naming the extractor implementation and the entity class.
 * @throws DeepInstantiationException if the class cannot be loaded or instantiated; the
 *                                    original exception is attached as the cause.
 */
@SuppressWarnings("unchecked")
private void initExtractor(ExtractorConfig<T> config) {
    try {
        Class<T> rdd = config.getExtractorImplClass();
        if (rdd == null) {
            rdd = (Class<T>) Class.forName(config.getExtractorImplClassName());
        }

        Constructor<T> c;
        if (config.getEntityClass().isAssignableFrom(Cells.class)) {
            c = rdd.getConstructor();
            this.extractor = (IExtractor<T, ExtractorConfig<T>>) c.newInstance();
        } else {
            c = rdd.getConstructor(Class.class);
            this.extractor = (IExtractor<T, ExtractorConfig<T>>) c.newInstance(config.getEntityClass());
        }
    } catch (ClassNotFoundException | InstantiationException | IllegalAccessException
            | IllegalArgumentException | InvocationTargetException | NoSuchMethodException
            | SecurityException e) {
        String message = "Impossible to make an extractor instance, check classpath " + e.getMessage();
        // Keep the original exception: for InvocationTargetException getMessage() is
        // usually null and the real failure is only reachable through the cause chain.
        // NOTE(review): relies on DeepInstantiationException(String, Throwable) — confirm it exists.
        LOG.error(message, e);
        throw new DeepInstantiationException(message, e);
    }
}
/**
 * Instantiates the extractor implementation described by the configuration and stores it
 * in {@code this.extractor}.
 *
 * <p>The implementation class is taken from {@code config.getExtractorImplClass()}; when
 * that is {@code null} it is loaded by name with {@link Class#forName(String)}. If the
 * entity class is assignable from {@code Cells} the no-argument constructor is used,
 * otherwise the constructor taking a {@code Class} is invoked with the entity class.
 *
 * @param config configuration naming the extractor implementation and the entity class.
 * @throws DeepInstantiationException if the class cannot be loaded or instantiated; the
 *                                    original exception is attached as the cause.
 */
@SuppressWarnings("unchecked")
private void initExtractor(ExtractorConfig<T> config) {
    try {
        Class<T> rdd = config.getExtractorImplClass();
        if (rdd == null) {
            rdd = (Class<T>) Class.forName(config.getExtractorImplClassName());
        }

        Constructor<T> c;
        if (config.getEntityClass().isAssignableFrom(Cells.class)) {
            c = rdd.getConstructor();
            this.extractor = (IExtractor<T, ExtractorConfig<T>>) c.newInstance();
        } else {
            c = rdd.getConstructor(Class.class);
            this.extractor = (IExtractor<T, ExtractorConfig<T>>) c.newInstance(config.getEntityClass());
        }
    } catch (ClassNotFoundException | InstantiationException | IllegalAccessException
            | IllegalArgumentException | InvocationTargetException | NoSuchMethodException
            | SecurityException e) {
        String message = "Impossible to make an extractor instance, check classpath " + e.getMessage();
        // Keep the original exception: for InvocationTargetException getMessage() is
        // usually null and the real failure is only reachable through the cause chain.
        // NOTE(review): relies on DeepInstantiationException(String, Throwable) — confirm it exists.
        LOG.error(message, e);
        throw new DeepInstantiationException(message, e);
    }
}
/**
 * Builds a Cells RDD backed by a file stored in Amazon S3.
 *
 * <p>Bucket and file path are read from the configuration values. When the configured
 * implementation name equals {@code ExtractorConstants.S3} the location is
 * {@code S3_PREFIX + bucket + path}; otherwise the bare path is used. Before the RDD is
 * created, the Hadoop configuration obtained from {@code sc()} is given the s3n file
 * system implementation class and the access key id / secret access key from the config.
 *
 * @param config Amazon S3 ExtractorConfig.
 * @return RDD of Cells.
 */
public RDD<Cells> createS3RDD(ExtractorConfig<Cells> config) {
    final Serializable bucketValue = config.getValues().get(ExtractorConstants.S3_BUCKET);
    final Serializable pathValue = config.getValues().get(ExtractorConstants.FS_FILE_PATH);

    final TextFileDataTable tableMetadata = UtilFS.createTextFileMetaDataFromConfig(config, this);

    final String rawPath = pathValue.toString();
    final String location = config.getExtractorImplClassName().equals(ExtractorConstants.S3)
            ? ExtractorConstants.S3_PREFIX + bucketValue.toString() + rawPath
            : rawPath;

    // Register the s3n file system and its credentials on the context's Hadoop configuration.
    final Configuration s3Settings = this.sc().hadoopConfiguration();
    s3Settings.set("fs.s3n.impl", "org.apache.hadoop.fs.s3native.NativeS3FileSystem");
    s3Settings.set("fs.s3n.awsAccessKeyId", config.getString(ExtractorConstants.S3_ACCESS_KEY_ID));
    s3Settings.set("fs.s3n.awsSecretAccessKey", config.getString(ExtractorConstants.S3_SECRET_ACCESS_KEY));

    return createRDDFromFilePath(location, tableMetadata);
}
/**
 * Builds a Cells RDD backed by a file stored in Amazon S3.
 *
 * <p>Bucket and file path are read from the configuration values. When the configured
 * implementation name equals {@code ExtractorConstants.S3} the location is
 * {@code S3_PREFIX + bucket + path}; otherwise the bare path is used. Before the RDD is
 * created, the Hadoop configuration obtained from {@code sc()} is given the s3n file
 * system implementation class and the access key id / secret access key from the config.
 *
 * @param config Amazon S3 ExtractorConfig.
 * @return RDD of Cells.
 */
public RDD<Cells> createS3RDD(ExtractorConfig<Cells> config) {
    final Serializable bucketValue = config.getValues().get(ExtractorConstants.S3_BUCKET);
    final Serializable pathValue = config.getValues().get(ExtractorConstants.FS_FILE_PATH);

    final TextFileDataTable tableMetadata = UtilFS.createTextFileMetaDataFromConfig(config, this);

    final String rawPath = pathValue.toString();
    final String location = config.getExtractorImplClassName().equals(ExtractorConstants.S3)
            ? ExtractorConstants.S3_PREFIX + bucketValue.toString() + rawPath
            : rawPath;

    // Register the s3n file system and its credentials on the context's Hadoop configuration.
    final Configuration s3Settings = this.sc().hadoopConfiguration();
    s3Settings.set("fs.s3n.impl", "org.apache.hadoop.fs.s3native.NativeS3FileSystem");
    s3Settings.set("fs.s3n.awsAccessKeyId", config.getString(ExtractorConstants.S3_ACCESS_KEY_ID));
    s3Settings.set("fs.s3n.awsSecretAccessKey", config.getString(ExtractorConstants.S3_SECRET_ACCESS_KEY));

    return createRDDFromFilePath(location, tableMetadata);
}
/**
 * Checks that {@code ExtractorConfig.clone()} copies the implementation class name and the
 * values, and that the test's later removal of an entry from the source map makes the
 * original's values differ from the clone's.
 */
@Test
public void cloneObjectWithParentsTest() {
    Map<String, Serializable> sourceValues = new HashMap<>();
    sourceValues.put("key1", "val1");
    sourceValues.put("key2", "val2");
    sourceValues.put("key3", "val3");

    ExtractorConfig<Cells> original = new ExtractorConfig<>();
    original.setExtractorImplClassName("testExtractor");
    original.setValues(sourceValues);

    ExtractorConfig<Cells> copy = original.clone();

    // Right after cloning both configurations must look the same.
    assertEquals(copy.getExtractorImplClassName(), original.getExtractorImplClassName());
    assertEquals(copy.getValues(), original.getValues());

    // Mutate the map that was handed to the original configuration.
    sourceValues.remove("key1");
    assertEquals(sourceValues.size(), 2);

    // The test expects the clone's values to no longer match the original's.
    assertNotEquals(copy.getValues(), original.getValues());
}
// Copy the extractor implementation class name, entity class and RDD id from extractorConfig through the corresponding setters.
setExtractorImplClassName(extractorConfig.getExtractorImplClassName()); setEntityClass(extractorConfig.getEntityClass()); setRddId(extractorConfig.getRddId());
// Copy the extractor implementation class name, entity class and RDD id from extractorConfig through the corresponding setters.
setExtractorImplClassName(extractorConfig.getExtractorImplClassName()); setEntityClass(extractorConfig.getEntityClass()); setRddId(extractorConfig.getRddId());
/**
 * Exercises {@code initConfig} with three kinds of source configuration (plain
 * ExtractorConfig, DeepJobConfig and ChildDeepJobConfig) and a fresh ChildDeepJobConfig
 * target each time. The implementation class name must always be carried over; the test
 * field is expected only when the source is itself a ChildDeepJobConfig.
 */
@Test
public void initConfigTest() {
    ExtractorConfig baseSource = new ExtractorConfig();
    baseSource.setExtractorImplClassName("testExtractor");

    DeepJobConfig jobSource = new DeepJobConfig();
    jobSource.setExtractorImplClassName("testDeepJobConfig");

    ChildDeepJobConfig childSource = new ChildDeepJobConfig();
    childSource.setExtractorImplClassName("testChildDeepJobConfig");
    childSource.setTestFiled("testField");

    // Source: plain ExtractorConfig -> only the class name is available to copy.
    ChildDeepJobConfig fromBase = initConfig(baseSource, new ChildDeepJobConfig());
    assertEquals(fromBase.getExtractorImplClassName(), baseSource.getExtractorImplClassName());
    assertNull(fromBase.getTestFiled());

    // Source: DeepJobConfig -> still no test field on the source side.
    ChildDeepJobConfig fromJob = initConfig(jobSource, new ChildDeepJobConfig());
    assertEquals(fromJob.getExtractorImplClassName(), jobSource.getExtractorImplClassName());
    assertNull(fromJob.getTestFiled());

    // Source: ChildDeepJobConfig -> the test field must be copied as well.
    ChildDeepJobConfig fromChild = initConfig(childSource, new ChildDeepJobConfig());
    assertEquals(fromChild.getExtractorImplClassName(), childSource.getExtractorImplClassName());
    assertEquals(fromChild.getTestFiled(), childSource.getTestFiled());
}