public static String buildFilePath(ExtractorConfig extractorConfig) throws IllegalArgumentException {
    if (ExtractorConstants.HDFS.equals(extractorConfig.getExtractorImplClassName())) {
        String host = extractorConfig.getString(ExtractorConstants.HOST);
        String port = extractorConfig.getString(ExtractorConstants.PORT);
        String path = extractorConfig.getString(ExtractorConstants.FS_FILE_PATH);
        // Strip the file name, keeping only the parent directory.
        path = path.substring(0, path.lastIndexOf("/"));
        return ExtractorConstants.HDFS_PREFIX + host + ":" + port + path + "/" + METADATA_FILE;
    } else if (ExtractorConstants.S3.equals(extractorConfig.getExtractorImplClassName())) {
        String bucket = extractorConfig.getString(ExtractorConstants.S3_BUCKET);
        String path = extractorConfig.getString(ExtractorConstants.FS_FILE_PATH);
        path = path.substring(0, path.lastIndexOf("/"));
        return extractorConfig.getString(ExtractorConstants.FS_PREFIX) + bucket + path + "/" + METADATA_FILE;
    }
    throw new IllegalArgumentException("Configured ExtractorImplClassName must be 'hdfs' or 's3'");
}
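/*
 * A minimal usage sketch (illustrative, not from the source): it shows the shape of the
 * metadata path buildFilePath produces for an HDFS configuration. The no-arg constructor
 * and the setExtractorImplClassName/putValue setters are assumptions; only the getters
 * appear in this section.
 */
public static void buildFilePathExample() {
    ExtractorConfig config = new ExtractorConfig();                            // hypothetical constructor
    config.setExtractorImplClassName(ExtractorConstants.HDFS);                 // assumed setter
    config.putValue(ExtractorConstants.HOST, "localhost");                     // assumed setter
    config.putValue(ExtractorConstants.PORT, "9000");
    config.putValue(ExtractorConstants.FS_FILE_PATH, "/user/data/part-00000");
    // The file name is stripped, so the result is
    // HDFS_PREFIX + "localhost:9000" + "/user/data" + "/" + METADATA_FILE.
    System.out.println(buildFilePath(config));
}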
/**
 * Gets the value stored under the given key as a String array.
 *
 * @param key the key
 * @return the values under the key; a scalar value is wrapped in a one-element array
 */
public String[] getStringArray(String key) {
    try {
        return getValue(String[].class, key);
    } catch (ClassCastException e) {
        // The value is a plain String rather than a String[]: wrap it.
        return new String[] { getString(key) };
    }
}
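/*
 * A short sketch (illustrative) of the fallback branch: a scalar String stored under the
 * key comes back as a one-element array instead of surfacing the ClassCastException.
 * The constructor and putValue setter are assumptions, as above.
 */
public void getStringArrayExample() {
    ExtractorConfig config = new ExtractorConfig();        // hypothetical constructor
    config.putValue("hosts", "node1");                     // stored as a plain String, not a String[]
    String[] hosts = config.getStringArray("hosts");       // -> new String[] { "node1" }
}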
/**
 * Gets the value stored under the given key as an Integer.
 *
 * @param key the key
 * @return the parsed Integer value
 */
public Integer getInteger(String key) {
    try {
        return getValue(Integer.class, key);
    } catch (ClassCastException e) {
        // The value is stored as text, possibly a comma-separated list:
        // parse its first element.
        String value = getString(key);
        return Integer.parseInt(value.split(",")[0]);
    }
}
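/*
 * Likewise for getInteger (illustrative sketch, same assumed constructor and setter):
 * a value stored as a comma-separated String falls back to parsing its first element.
 */
public void getIntegerExample() {
    ExtractorConfig config = new ExtractorConfig();        // hypothetical constructor
    config.putValue("ports", "9042,9160");                 // stored as a String
    Integer port = config.getInteger("ports");             // -> 9042 (first element parsed)
}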
/**
 * Returns a Cells RDD read from the S3 file system.
 *
 * @param config Amazon S3 ExtractorConfig.
 * @return RDD of Cells.
 */
public RDD<Cells> createS3RDD(ExtractorConfig<Cells> config) {
    Serializable bucket = config.getValues().get(ExtractorConstants.S3_BUCKET);
    Serializable path = config.getValues().get(ExtractorConstants.FS_FILE_PATH);
    final TextFileDataTable textFileDataTable = UtilFS.createTextFileMetaDataFromConfig(config, this);
    String filePath = path.toString();
    // Constant-first comparison avoids an NPE when no implementation class is configured.
    if (ExtractorConstants.S3.equals(config.getExtractorImplClassName())) {
        filePath = ExtractorConstants.S3_PREFIX + bucket.toString() + path.toString();
    }
    // Register the native S3 file system and its credentials with the Hadoop configuration.
    Configuration hadoopConf = this.sc().hadoopConfiguration();
    hadoopConf.set("fs.s3n.impl", "org.apache.hadoop.fs.s3native.NativeS3FileSystem");
    hadoopConf.set("fs.s3n.awsAccessKeyId", config.getString(ExtractorConstants.S3_ACCESS_KEY_ID));
    hadoopConf.set("fs.s3n.awsSecretAccessKey", config.getString(ExtractorConstants.S3_SECRET_ACCESS_KEY));
    return createRDDFromFilePath(filePath, textFileDataTable);
}
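/*
 * A minimal calling sketch (illustrative): the key names mirror the constants used above,
 * while the constructor and setters are assumptions not shown in this section. Assuming
 * S3_PREFIX is the s3n:// scheme, the resolved path would be s3n://my-bucket/data/input.csv.
 */
public void createS3RDDExample(DeepSparkContext deepContext) {
    ExtractorConfig<Cells> config = new ExtractorConfig<>(Cells.class);        // hypothetical constructor
    config.setExtractorImplClassName(ExtractorConstants.S3);                   // assumed setter
    config.putValue(ExtractorConstants.S3_BUCKET, "my-bucket");
    config.putValue(ExtractorConstants.FS_FILE_PATH, "/data/input.csv");
    config.putValue(ExtractorConstants.S3_ACCESS_KEY_ID, "<access-key>");      // placeholder credentials
    config.putValue(ExtractorConstants.S3_SECRET_ACCESS_KEY, "<secret-key>");
    RDD<Cells> rdd = deepContext.createS3RDD(config);
}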
@Test
public void createS3RDDTest() throws Exception {
    deepSparkContext = createDeepSparkContext();
    Configuration hadoopConf = mock(Configuration.class);
    when(sparkContext.hadoopConfiguration()).thenReturn(hadoopConf);
    DeepSparkContext deepSparkContextSpy = PowerMockito.spy(deepSparkContext);
    SQLContext sqlContext = mock(SQLContext.class);
    Whitebox.setInternalState(deepSparkContextSpy, "sc", sparkContext);
    Whitebox.setInternalState(deepSparkContextSpy, "sqlContext", sqlContext);
    RDD<String> rdd = mock(RDD.class);
    JavaRDD<String> javaRDD = mock(JavaRDD.class);
    when(deepSparkContextSpy.sc().textFile(anyString(), anyInt())).thenReturn(rdd);
    doReturn(javaRDD).when(deepSparkContextSpy).textFile(anyString());
    when(rdd.toJavaRDD()).thenReturn(javaRDD);
    when(rdd.toJavaRDD().map(any(Function.class))).thenReturn(singleRdd);

    ExtractorConfig<Cells> config = createS3DeepJobConfig();
    deepSparkContextSpy.createS3RDD(config);

    verify(hadoopConf, times(1)).set("fs.s3n.awsAccessKeyId", config.getString(ExtractorConstants.S3_ACCESS_KEY_ID));
    verify(hadoopConf, times(1)).set("fs.s3n.awsSecretAccessKey", config.getString(ExtractorConstants.S3_SECRET_ACCESS_KEY));
    verify(deepSparkContextSpy.sc(), times(1)).textFile(anyString(), anyInt());
    verify(javaRDD, times(1)).map(any(Function.class));
}