// Default builder: seeds the parent builder with a fresh SampleDataForSplitPoints operation.
public Builder() { super(new SampleDataForSplitPoints()); }
public SampleDataAndCreateSplitsFileTool(final SampleDataForSplitPointsJobFactory jobFactory, final SampleDataForSplitPoints operation, final Store store) { this.operation = operation; this.store = store; this.jobFactory = jobFactory; if (null == operation.getNumSplits() || operation.getNumSplits() < 1) { expectedNumberOfSplits = jobFactory.getExpectedNumberOfSplits(store); } else { expectedNumberOfSplits = operation.getNumSplits(); } }
/**
 * Configures the job's output: a SequenceFile written to the operation's
 * output path, optionally block-compressed with the operation's codec.
 *
 * @param job       the MapReduce job being configured
 * @param operation supplies the output path and (optional) compression codec
 * @param store     the store (unused here; kept for the common setup signature)
 * @throws IOException declared for consistency with the other setup hooks
 */
protected void setupOutput(final Job job, final SampleDataForSplitPoints operation, final Store store) throws IOException {
    job.setOutputFormatClass(SequenceFileOutputFormat.class);
    SequenceFileOutputFormat.setOutputPath(job, new Path(operation.getOutputPath()));
    if (null != operation.getCompressionCodec()) {
        // GzipCodec only works with SequenceFiles when the native hadoop/zlib
        // libraries are loaded — warn and skip compression rather than fail later.
        if (GzipCodec.class.isAssignableFrom(operation.getCompressionCodec())
                && !NativeCodeLoader.isNativeCodeLoaded()
                && !ZlibFactory.isNativeZlibLoaded(job.getConfiguration())) {
            LOGGER.warn("SequenceFile doesn't work with GzipCodec without native-hadoop code!");
        } else {
            // BLOCK compression compresses runs of records together — the usual
            // choice for SequenceFile output.
            SequenceFileOutputFormat.setCompressOutput(job, true);
            SequenceFileOutputFormat.setOutputCompressorClass(job, operation.getCompressionCodec());
            SequenceFileOutputFormat.setOutputCompressionType(job, SequenceFile.CompressionType.BLOCK);
        }
    }
} }
/**
 * Builds the JobConf for the sampling job: serialises the store schema into
 * the configuration, records the mapper generator, validation flag and
 * sampling proportion, and pins the job to a single reducer so that exactly
 * one sorted results file is produced.
 *
 * @param operation                supplies validation, sampling and task-count settings
 * @param mapperGeneratorClassName fully-qualified mapper generator class name
 * @param store                    the store whose schema and key converter are propagated
 * @return the populated JobConf
 * @throws IOException if the schema cannot be serialised
 */
@Override
public JobConf createJobConf(final SampleDataForSplitPoints operation, final String mapperGeneratorClassName, final Store store) throws IOException {
    final JobConf jobConf = new JobConf(new Configuration());
    LOGGER.info("Setting up job conf");

    // Serialise the schema once and reuse it — the original serialised it
    // twice (once for the conf, once for the log message).
    final String schemaJson = new String(store.getSchema().toCompactJson(), CommonConstants.UTF_8);
    jobConf.set(SCHEMA, schemaJson);
    LOGGER.info("Added {} {} to job conf", SCHEMA, schemaJson);

    jobConf.set(MAPPER_GENERATOR, mapperGeneratorClassName);
    LOGGER.info("Added {} of {} to job conf", MAPPER_GENERATOR, mapperGeneratorClassName);

    jobConf.set(VALIDATE, String.valueOf(operation.isValidate()));
    LOGGER.info("Added {} option of {} to job conf", VALIDATE, operation.isValidate());

    jobConf.set(PROPORTION_TO_SAMPLE, String.valueOf(operation.getProportionToSample()));
    LOGGER.info("Added {} option of {} to job conf", PROPORTION_TO_SAMPLE, String.valueOf(operation.getProportionToSample()));

    // Map-task count is only a hint; set it only when explicitly requested.
    final Integer numTasks = operation.getNumMapTasks();
    if (null != numTasks) {
        jobConf.setNumMapTasks(numTasks);
        LOGGER.info("Set number of map tasks to {} on job conf", numTasks);
    }

    // A single reducer guarantees one globally-sorted output file
    // ("part-r-00000") from which the split points are read.
    jobConf.setNumReduceTasks(1);
    LOGGER.info("Set number of reduce tasks to 1 on job conf");

    jobConf.set(AccumuloStoreConstants.ACCUMULO_ELEMENT_CONVERTER_CLASS,
            ((AccumuloStore) store).getKeyPackage().getKeyConverter().getClass().getName());
    return jobConf;
}
/**
 * Performs the common job setup: jar, job name, mapper, reducer and output.
 *
 * @param job                      the job to configure
 * @param operation                supplies the output path used in the job name
 * @param mapperGeneratorClassName mapper generator class name, embedded in the job name
 * @param store                    the store, forwarded to the output setup
 * @throws IOException if output configuration fails
 */
@Override
public void setupJob(final Job job, final SampleDataForSplitPoints operation, final String mapperGeneratorClassName, final Store store) throws IOException {
    final Path outputPath = new Path(operation.getOutputPath());
    job.setJarByClass(getClass());
    job.setJobName(getJobName(mapperGeneratorClassName, outputPath));
    setupMapper(job);
    setupReducer(job);
    setupOutput(job, operation, store);
}
final List<Job> jobs = new ArrayList<>(); Map<String, List<String>> mapperGeneratorsToInputPathsList = new HashMap<>(); for (final Map.Entry<String, String> entry : operation.getInputMapperPairs().entrySet()) { if (mapperGeneratorsToInputPathsList.containsKey(entry.getValue())) { mapperGeneratorsToInputPathsList.get(entry.getValue()).add(entry.getKey()); setupJob(job, operation, mapperGeneratorClassName, store); if (null != operation.getJobInitialiser()) { operation.getJobInitialiser().initialiseJob(job, operation, store);
private void writeSplits(final FileSystem fs, final Path resultsFile, final long outputEveryNthRecord, final int numberSplitsExpected) throws OperationException { LOGGER.info("Writing splits to {}", operation.getSplitsFilePath()); final Writable key = jobFactory.createKey(); final Writable value = jobFactory.createValue(); try (final SequenceFile.Reader reader = new SequenceFile.Reader(fs.getConf(), Reader.file(resultsFile)); final PrintStream splitsWriter = new PrintStream( new BufferedOutputStream(fs.create(new Path(operation.getSplitsFilePath()), true)), false, CommonConstants.UTF_8) ) {
final Path resultsFile = new Path(operation.getOutputPath(), "part-r-00000"); LOGGER.info("Will output every {}-th record from {}", outputEveryNthRecord, resultsFile);