@Override public JavaRDD<AvroPayload> getData(@NonNull final FileWorkUnitCalculator.FileWorkUnitCalculatorResult result) { Preconditions.checkState(result.hasWorkUnits(), "no work to do: " + this.conf.getDirectory()); // todo: support more types Preconditions.checkState(this.conf.getType().equals("json"), "only json files supported"); try { final FileSystem fs = this.conf.getFileSystem(); final String filesToRead = result.getWorkUnits().stream() .map(LocatedFileStatus::getPath) .map(Path::toString) .collect(Collectors.joining(",")); final RDD<String> fileRows = this.jsc.sc().textFile(filesToRead, 1); return this.converter.map(fileRows.toJavaRDD()).getData(); } catch (IOException e) { throw new JobRuntimeException("Error getting files", e); } } }
/**
 * Builds a file-source configuration, failing fast if any mandatory
 * property is absent from the supplied {@link Configuration}.
 *
 * @param configuration the raw job configuration to validate and wrap
 */
public FileSourceConfiguration(@NonNull final Configuration configuration) {
    // Validate before storing so a half-initialized instance can never escape.
    ConfigUtil.checkMandatoryProperties(configuration, getMandatoryProperties());
    this.conf = configuration;
}
@Test public void computeWorkUnitsNoJson() throws Exception{ final Configuration conf = new Configuration(); conf.setProperty(FileSourceConfiguration.TYPE, "json"); conf.setProperty(FileSourceConfiguration.SCHEMA, "{}"); final Path testDir = Files.createTempDirectory(null); try { conf.setProperty(FileSourceConfiguration.DIRECTORY, testDir.toString()); final FileWorkUnitCalculator workUnitCalculator = new FileWorkUnitCalculator(new FileSourceConfiguration(conf)); final FileWorkUnitCalculator.FileWorkUnitCalculatorResult result = workUnitCalculator.computeWorkUnits(); // there are no *.json files in this directory Assert.assertFalse(result.hasWorkUnits()); } finally { FileUtils.deleteDirectory(testDir.toFile()); } }
/**
 * Verifies that pointing the source at a non-existent directory fails with
 * a {@link JobRuntimeException} when work units are computed.
 */
@Test(expected = JobRuntimeException.class)
public void computeWorkUnitsNoSuchDirectory() {
    final Configuration conf = new Configuration();
    conf.setProperty(FileSourceConfiguration.TYPE, "json");
    conf.setProperty(FileSourceConfiguration.SCHEMA, "{}");
    conf.setProperty(FileSourceConfiguration.DIRECTORY, "path/not/exist");
    final FileWorkUnitCalculator workUnitCalculator =
        new FileWorkUnitCalculator(new FileSourceConfiguration(conf));
    // The call is expected to throw; its result was previously assigned to an
    // unused local, which is dead code — invoke it for the side effect only.
    workUnitCalculator.computeWorkUnits();
}
/**
 * Verifies that only *.json files in the configured directory become work
 * units, while files of other types (here a .csv) are ignored.
 */
@Test
public void computeWorkUnitsSuccess() throws Exception {
    final Path tempDir = Files.createTempDirectory(null);
    final String jsonFile1 = "file1.json";
    final String jsonFile2 = "file2.json";
    final String csvFile = "file3.csv";
    try {
        // Populate the directory with two matching files and one non-matching one.
        for (final String fileName : new String[] {jsonFile1, jsonFile2, csvFile}) {
            createFile(tempDir, fileName);
        }
        final Configuration conf = new Configuration();
        conf.setProperty(FileSourceConfiguration.TYPE, "json");
        conf.setProperty(FileSourceConfiguration.SCHEMA, "{}");
        conf.setProperty(FileSourceConfiguration.DIRECTORY, tempDir.toString());
        final FileWorkUnitCalculator calculator =
            new FileWorkUnitCalculator(new FileSourceConfiguration(conf));
        final FileWorkUnitCalculator.FileWorkUnitCalculatorResult result =
            calculator.computeWorkUnits();
        // Only the two .json files should have been picked up.
        Assert.assertEquals(2, result.getWorkUnits().size());
        final Set<String> expectedNames = new HashSet<>(Arrays.asList(jsonFile1, jsonFile2));
        final Set<String> actualNames = result.getWorkUnits().stream()
            .map(FileStatus::getPath)
            .map(org.apache.hadoop.fs.Path::getName)
            .collect(Collectors.toSet());
        Assert.assertEquals(expectedNames, actualNames);
    } finally {
        // Always clean up the temp directory, even if an assertion fails.
        FileUtils.deleteDirectory(tempDir.toFile());
    }
}
final FileSource fileSource = new FileSource(new FileSourceConfiguration(conf), jsc, sourceDataConverter); final HoodieSinkDataConverter sinkDataConverter = new TSBasedHoodieSinkDataConverter(conf, "firstName", "timestamp", TimeUnit.SECONDS); HoodieSink.HoodieSinkOp.INSERT, metadataMgr); final FileWorkUnitCalculator workUnitCalculator = new FileWorkUnitCalculator(new FileSourceConfiguration(conf));