final JavaRDD<String> convertedData = this.converter.convertAll(data); final int partNum = getRepartitionNum(convertedData); final int desiredDigit = (int) Math.floor(Math.log10(partNum) + 1); final JavaRDD<String> dataToWrite; if (this.conf.isColumnHeader()) { final String header = this.converter.getHeader(data); dataToWrite = addColumnHeader(header, dataRepartitioned); } else {
/** Verifies the converter produces the expected csv column header for generated test data. */
@Test
public void testGetHeaderWithCsv() {
    final String separator = ",";
    final JavaRDD<AvroPayload> payloadData =
            AvroPayloadUtil.generateTestData(this.jsc.get(), 10, StringTypes.EMPTY);
    final Configuration conf = initConf(separator, "csv");
    log.info("Starting to get data header.");
    final FileSinkDataConverter converter = new FileSinkDataConverter(conf, new ErrorExtractor());
    final String header = converter.getHeader(payloadData);
    Assert.assertEquals("int_field,string_field,boolean_field", header);
    log.info("Header: {}", header);
}
/**
 * Converts an RDD of AvroPayload records into an RDD of formatted output lines.
 * Currently supports the csv file type only.
 *
 * @param data payload records to convert; must not be null
 * @return an RDD containing one formatted line per input record
 * @throws UnsupportedOperationException if the configured file type is not supported
 */
public JavaRDD<String> convertAll(@NonNull final JavaRDD<AvroPayload> data)
        throws UnsupportedOperationException {
    return data.map(row -> {
        // First (and only) conversion result for this record, unwrapped to its string form.
        final String converted = this.convert(row).get(0).getSuccessData().get().getData();
        log.debug("Line: {}", converted);
        return converted;
    });
}
/**
 * Verifies that converting with the unsupported "json" file type fails.
 * The failure surfaces as a {@code SparkException} only when the lazy RDD is
 * materialized, so {@code collect()} is invoked explicitly.
 */
@Test(expected = SparkException.class)
public void testConvertAllWithJsonNotSupported() {
    log.info("Starts Test convert all with json");
    final String separator = ",";
    final JavaRDD<AvroPayload> payloadData =
            AvroPayloadUtil.generateTestData(this.jsc.get(), 10, StringTypes.EMPTY);
    final Configuration conf = initConf(separator, "json");
    log.info("Starting to convert data.");
    final FileSinkDataConverter converter = new FileSinkDataConverter(conf, new ErrorExtractor());
    final JavaRDD<String> dataConverted = converter.convertAll(payloadData);
    // collect() forces evaluation and throws. The per-line assertions previously copied
    // from the csv test were dead code: the exception aborts before any line is checked.
    dataConverted.collect();
}
/**
 * Shared helper: writes the given test data through an HdfsFileSink and asserts that
 * the expected number of part files were produced on the test file system.
 *
 * @param pathPrefix    file system prefix for the sink output
 * @param path          destination path under the prefix
 * @param separator     csv column separator
 * @param testData      payload records to disperse
 * @param partitionNum  expected number of output files
 * @param timeStamp     dispersal timestamp used in the output path
 * @param sourceSubPath source sub path used in the output path
 * @param dispersalType dispersal type (e.g. version vs. overwrite)
 * @throws Exception if the sink write or the file system inspection fails
 */
private void testWriteToCsvCommon(@NonNull final String pathPrefix, @NonNull final String path,
        @NonNull final String separator, @NonNull final JavaRDD<AvroPayload> testData,
        final int partitionNum, @NonNull final String timeStamp,
        @NonNull final String sourceSubPath, @NonNull final String dispersalType) throws Exception {
    // Note: @NonNull removed from the primitive partitionNum — it is meaningless on an
    // int (primitives can never be null) and Lombok warns about it.
    final Configuration conf =
            initConfig(pathPrefix, path, separator, timeStamp, sourceSubPath, dispersalType);
    final FileSinkDataConverter converter = new FileSinkDataConverter(conf, new ErrorExtractor());
    final FileSinkConfiguration fileConf = new FileSinkConfiguration(conf);
    final HdfsFileSink hdfsSink = spy(new HdfsFileSink(fileConf, converter));
    hdfsSink.write(testData);
    verify(hdfsSink, times(1)).write(Matchers.any(JavaRDD.class));
    verify(hdfsSink, times(1)).getRepartitionNum(Matchers.any(JavaRDD.class));
    verify(hdfsSink, times(1)).getRddSizeInMegaByte(Matchers.any(JavaRDD.class));
    final FileStatus[] status = this.fileSystem.get().listStatus(new Path(fileConf.getPathHdfs()));
    int fileNum = 0;
    for (final FileStatus fileStatus : status) {
        if (fileStatus.isFile()) {
            fileNum++;
        }
    }
    assertEquals(partitionNum, fileNum);
}
}
/** Verifies csv conversion emits one separator-joined line per generated record. */
@Test
public void testConvertAllWithCsv() {
    log.info("Starts Test convert all with csv");
    final String separator = " ";
    final JavaRDD<AvroPayload> payloadData =
            AvroPayloadUtil.generateTestData(this.jsc.get(), 10, StringTypes.EMPTY);
    final Configuration conf = initConf(separator, "csv");
    log.info("Starting to convert data.");
    final FileSinkDataConverter converter = new FileSinkDataConverter(conf, new ErrorExtractor());
    final JavaRDD<String> dataConverted = converter.convertAll(payloadData);
    int recordId = 0;
    for (final String line : dataConverted.collect()) {
        recordId++;
        // Generated records carry their 1-based index in both the int and string fields.
        Assert.assertEquals(recordId + separator + recordId + separator + "true", line);
    }
}
/** Shared helper: writes the given test data through an AWS file sink built from {@code conf}. */
private void testWriteGeneral(@NonNull final JavaRDD<AvroPayload> testData,
        @NonNull final Configuration conf) throws IOException {
    final FileSinkConfiguration sinkConf = new FileSinkConfiguration(conf);
    final FileSinkDataConverter dataConverter =
            new FileSinkDataConverter(conf, new ErrorExtractor());
    final FileSink sink = new AwsFileSink(sinkConf, dataConverter);
    sink.write(testData);
}
/** Verifies csv conversion escapes quotes and backslashes embedded in string fields. */
@Test
public void testConvertAllWithCsvSpecialChar() {
    log.info("Starts Test convert all with csv");
    final String separator = ",";
    final JavaRDD<AvroPayload> payloadData =
            AvroPayloadUtil.generateTestDataNew(this.jsc.get(), 10, StringTypes.EMPTY);
    final Configuration conf = initConf(separator, "csv");
    log.info("Starting to convert data.");
    final FileSinkDataConverter converter = new FileSinkDataConverter(conf, new ErrorExtractor());
    final JavaRDD<String> dataConverted = converter.convertAll(payloadData);
    int recordId = 0;
    for (final String line : dataConverted.collect()) {
        recordId++;
        // String field contains an escaped quote and backslash, so the whole field is quoted.
        final String expected =
                recordId + separator + "\"" + recordId + "\\\",try\\\\\"" + separator + "true";
        Assert.assertEquals(expected, line);
    }
}
/**
 * Verifies that when the csv column-header option is enabled, every written part
 * file begins with the header line.
 *
 * @throws IOException if the test file system cannot be read
 */
@Test
public void testWriteToCsvWithHeader() throws IOException {
    final JavaRDD<AvroPayload> testData =
            AvroPayloadUtil.generateTestDataNew(this.jsc.get(), NUM_RECORD2, StringTypes.EMPTY);
    final Configuration conf =
            initConfig(pathPrefix, PATH4, COMMA_SEPARATOR, TIMESTAMP1, SOURCE_SUB_PATH1, VERSION);
    conf.setProperty(FileSinkConfiguration.CSV_COLUMN_HEADER, "true");
    final FileSinkDataConverter converter = new FileSinkDataConverter(conf, new ErrorExtractor());
    final FileSinkConfiguration fileConf = new FileSinkConfiguration(conf);
    final HdfsFileSink hdfsSink = spy(new HdfsFileSink(fileConf, converter));
    hdfsSink.write(testData);
    verify(hdfsSink, times(1)).write(Matchers.any(JavaRDD.class));
    verify(hdfsSink, times(1)).addColumnHeader(Matchers.anyString(), Matchers.any(JavaRDD.class));
    final FileStatus[] status = this.fileSystem.get().listStatus(new Path(fileConf.getPathHdfs()));
    for (final FileStatus fileStatus : status) {
        if (fileStatus.isFile()) {
            final Path path = fileStatus.getPath();
            // try-with-resources: the original leaked both streams when an assertion threw.
            try (FSDataInputStream in = this.fileSystem.get().open(path);
                    BufferedReader reader = new BufferedReader(new InputStreamReader(in))) {
                final String header = reader.readLine();
                Assert.assertEquals("int_field,string_field,boolean_field", header);
            }
        }
    }
}
/** Builds the shared fixtures (config, converter, sink, pre-converted test data) per test. */
@Before
public void setupTest() {
    super.setupTest();
    this.conf = initConfig(pathPrefix, PATH1, COMMA_SEPARATOR, TIMESTAMP1, SOURCE_SUB_PATH1, VERSION);
    this.converter = new FileSinkDataConverter(this.conf, new ErrorExtractor());
    this.fileSink = spy(new HdfsFileSink(new FileSinkConfiguration(this.conf), this.converter));
    this.testData1 = AvroPayloadUtil.generateTestData(this.jsc.get(), NUM_RECORD1, StringTypes.EMPTY);
    this.testData2 =
            AvroPayloadUtil.generateTestDataNew(this.jsc.get(), NUM_RECORD2, StringTypes.EMPTY);
    this.convertedData1 = this.converter.convertAll(this.testData1);
    this.convertedData2 = this.converter.convertAll(this.testData2);
}
/**
 * Shared helper: writes generated test data to a mocked S3 sink and verifies that the
 * expected bucket and one object per partition were created.
 *
 * @param conf sink configuration to write with
 * @return the S3 file prefix the objects were written under
 * @throws IOException if the sink write fails
 */
private String testWriteToMockS3General(@NonNull final Configuration conf) throws IOException {
    final JavaRDD<AvroPayload> testData =
            AvroPayloadUtil.generateTestDataNew(this.jsc.get(), NUM_RECORD, StringTypes.EMPTY);
    final FileSinkDataConverter converter = new FileSinkDataConverter(conf, new ErrorExtractor());
    final FileSinkConfiguration fileConf = new FileSinkConfiguration(conf);
    final AwsConfiguration awsConf = new AwsConfiguration(fileConf);
    final MockAwsFileSink awsMockSink = spy(new MockAwsFileSink(fileConf, converter));
    awsMockSink.write(testData);
    // Renamed from "MockClient": local variables use lowerCamelCase.
    final AmazonS3 mockClient = awsMockSink.getS3Client();
    verify(awsMockSink, times(EXPECTED_INVOCATIONS)).write(Matchers.any(JavaRDD.class));
    verify(mockClient, times(EXPECTED_PARTITION_NUM)).putObject(Matchers.any(PutObjectRequest.class));
    assertTrue(mockClient.doesBucketExistV2(fileConf.getBucketName().get()));
    // NOTE(review): the "_0000" + i object-name suffix assumes fewer than 10 partitions — confirm.
    for (int i = 0; i < EXPECTED_PARTITION_NUM; i++) {
        // Primitive boolean instead of boxed Boolean: no nullability or boxing needed here.
        final boolean objectExists = mockClient.doesObjectExist(
                fileConf.getBucketName().get(), awsConf.getS3FilePrefix() + "_0000" + i);
        assertTrue(objectExists);
    }
    return awsConf.getS3FilePrefix();
}