  @Override
  public RecordReader<AvroKey<GenericRecord>, NullWritable> createRecordReader(InputSplit split,
      TaskAttemptContext cx) throws IOException {
    return new CombineFileRecordReader<>((CombineFileSplit) split, cx, AvroKeyCombineFileRecordReader.class);
  }
}
private void setupMapper(Path input) throws IOException {
  FileInputFormat.setInputPaths(job, input);
  job.setMapperClass(CalculateStatsFromBaseCuboidMapper.class);
  job.setInputFormatClass(SequenceFileInputFormat.class);
  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(Text.class);
}
  @Override
  public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
    FileSplit fSplit = (FileSplit) split;
    Path path = fSplit.getPath();
    Configuration conf = context.getConfiguration();
    this.in = new RCFile.Reader(path.getFileSystem(conf), path, conf);
    this.end = fSplit.getStart() + fSplit.getLength();
    if (fSplit.getStart() > in.getPosition()) {
      in.sync(fSplit.getStart());
    }
    this.start = in.getPosition();
    more = start < end;
    key = new LongWritable();
    value = new BytesRefArrayWritable();
  }
}
@Override
public CubeSegment findSourceSegment(FileSplit fileSplit, CubeInstance cube) {
  String filePath = fileSplit.getPath().toString();
  String jobID = JobBuilderSupport.extractJobIDFromPath(filePath);
  return CubeInstance.findSegmentWithJobId(jobID, cube);
}
/**
 * Creates a Flink {@link InputFormat} that wraps the given Hadoop
 * {@link org.apache.hadoop.mapreduce.lib.input.FileInputFormat}.
 *
 * @return A Flink InputFormat that wraps the Hadoop FileInputFormat.
 */
public static <K, V> org.apache.flink.api.java.hadoop.mapreduce.HadoopInputFormat<K, V> readHadoopFile(
    org.apache.hadoop.mapreduce.lib.input.FileInputFormat<K, V> mapreduceInputFormat, Class<K> key, Class<V> value,
    String inputPath, Job job) throws IOException {
  // set input path in Job
  org.apache.hadoop.mapreduce.lib.input.FileInputFormat.addInputPath(job, new org.apache.hadoop.fs.Path(inputPath));
  // return wrapping InputFormat
  return createHadoopInput(mapreduceInputFormat, key, value, job);
}
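// A minimal usage sketch, not taken from the snippet above: it assumes the standard Flink DataSet API
// (org.apache.flink.api.java.ExecutionEnvironment, DataSet, Tuple2) and Hadoop's
// org.apache.hadoop.mapreduce.lib.input.TextInputFormat; the input path is a hypothetical placeholder.
ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
Job job = Job.getInstance();
// Wrap the Hadoop input format and register it as a Flink data source.
HadoopInputFormat<LongWritable, Text> source =
    readHadoopFile(new TextInputFormat(), LongWritable.class, Text.class, "hdfs:///tmp/input", job);
DataSet<Tuple2<LongWritable, Text>> lines = env.createInput(source);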
@Override
public List<InputSplit> getSplits(JobContext ctx) throws IOException, InterruptedException {
  List<InputSplit> res = new ArrayList<>(BLOCK_CNT);
  for (int i = 0; i < BLOCK_CNT; i++) {
    try {
      res.add(new FileSplit(new Path(new URI("someFile")), i, i + 1, new String[] {"localhost"}));
    }
    catch (URISyntaxException e) {
      throw new IOException(e);
    }
  }
  return res;
}
private static Schema getSchema(CombineFileSplit split, TaskAttemptContext cx, Integer idx) throws IOException {
  Schema schema = AvroJob.getInputKeySchema(cx.getConfiguration());
  if (schema != null) {
    return schema;
  }
  Path path = split.getPath(idx);
  FileSystem fs = path.getFileSystem(cx.getConfiguration());
  return AvroUtils.getSchemaFromDataFile(path, fs);
}
@Override
public List<InputSplit> getSplits(JobContext cx) throws IOException {
  Job modifiedJob = Job.getInstance(cx.getConfiguration());
  setSplitSize(modifiedJob);
  FileInputFormat.setInputDirRecursive(modifiedJob, true);
  return cleanSplits(super.getSplits(modifiedJob));
}
  @Override
  public List<InputSplit> getSplits(JobContext job) throws IOException {
    HiveConf.setLongVar(job.getConfiguration(), HiveConf.ConfVars.MAPREDMINSPLITSIZE, SequenceFile.SYNC_INTERVAL);
    return super.getSplits(job);
  }
}
private static void addInputPath(Job job, Iterable<String> pathStrings, Class<? extends InputFormat> inputFormatClass) {
  Configuration conf = job.getConfiguration();
  StringBuilder inputFormats = new StringBuilder(
      StringUtils.nullToEmptyNonDruidDataString(conf.get(MultipleInputs.DIR_FORMATS))
  );
  String[] paths = Iterables.toArray(pathStrings, String.class);
  for (int i = 0; i < paths.length - 1; i++) {
    if (inputFormats.length() > 0) {
      inputFormats.append(',');
    }
    inputFormats.append(paths[i]).append(';').append(inputFormatClass.getName());
  }
  if (inputFormats.length() > 0) {
    conf.set(MultipleInputs.DIR_FORMATS, inputFormats.toString());
  }
  // add last one separately for possible initialization in MultipleInputs
  MultipleInputs.addInputPath(job, new Path(paths[paths.length - 1]), inputFormatClass);
}
  @Override
  public List<InputSplit> getSplits(JobContext ctx) throws IOException {
    List<InputSplit> res = super.getSplits(ctx);
    splitsCount.set(res.size());
    X.println("___ split of input: " + splitsCount.get());
    return res;
  }
}
private void setSplitSize(JobContext cx) {
  super.setMaxSplitSize(cx.getConfiguration().getLong(COMPACTION_JOB_MAPRED_MAX_SPLIT_SIZE,
      DEFAULT_COMPACTION_JOB_MAPRED_MAX_SPLIT_SIZE));
  super.setMinSplitSizeNode(cx.getConfiguration().getLong(COMPACTION_JOB_MAPRED_MIN_SPLIT_SIZE,
      DEFAULT_COMPACTION_JOB_MAPRED_MIN_SPLIT_SIZE));
}
@Override
public RecordReader<NullWritable, OrcStruct> createRecordReader(InputSplit inputSplit, TaskAttemptContext context)
    throws IOException, InterruptedException {
  FileSplit fileSplit = (FileSplit) inputSplit;
  Path path = fileSplit.getPath();
  Configuration conf = ShimLoader.getHadoopShims().getConfiguration(context);
  return new OrcRecordReader(OrcFile.createReader(path, OrcFile.readerOptions(conf)),
      ShimLoader.getHadoopShims().getConfiguration(context), fileSplit.getStart(), fileSplit.getLength());
}
/**
 * Job configuration.
 */
public static Job configureJob(Configuration conf, String[] args) throws IOException {
  Path inputPath = new Path(args[0]);
  String tableName = args[1];
  Job job = new Job(conf, NAME + "_" + tableName);
  job.setJarByClass(Uploader.class);
  FileInputFormat.setInputPaths(job, inputPath);
  job.setInputFormatClass(SequenceFileInputFormat.class);
  job.setMapperClass(Uploader.class);
  // No reducers. Just write straight to table. Call initTableReducerJob
  // because it sets up the TableOutputFormat.
  TableMapReduceUtil.initTableReducerJob(tableName, null, job);
  job.setNumReduceTasks(0);
  return job;
}
@Override
public void initialize(InputSplit split, TaskAttemptContext context) throws IOException, InterruptedException {
  FileSplit fileSplit = (FileSplit) split;
  conf = context.getConfiguration();
  Path path = fileSplit.getPath();
  FileSystem fs = path.getFileSystem(conf);
  LOG.info("Initialize HFileRecordReader for {}", path);
  this.in = HFile.createReader(fs, path, conf);
  // The file info must be loaded before the scanner can be used.
  // This seems like a bug in HBase, but it's easily worked around.
  this.in.loadFileInfo();
  this.scanner = in.getScanner(false, false);
}
private void configureInputAndOutputPaths(Job job) throws IOException {
  for (Path inputPath : getInputPaths()) {
    FileInputFormat.addInputPath(job, inputPath);
  }
  // The MR output path must not exist when the job starts, so delete it if it exists.
  this.tmpFs.delete(this.dataset.outputTmpPath(), true);
  FileOutputFormat.setOutputPath(job, this.dataset.outputTmpPath());
}
/**
 * Gets a fully configured Job instance.
 *
 * @param input Input file name.
 * @param output Output directory name.
 * @return Job instance.
 * @throws IOException If an I/O error occurs.
 */
public static Job getJob(String input, String output) throws IOException {
  Job job = Job.getInstance();
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(IntWritable.class);
  setTasksClasses(job, true, true, true, false);
  FileInputFormat.setInputPaths(job, new Path(input));
  FileOutputFormat.setOutputPath(job, new Path(output));
  job.setJarByClass(HadoopWordCount2.class);
  return job;
}
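// A brief usage sketch, not part of the snippet above: the HDFS paths are hypothetical placeholders;
// waitForCompletion(true) is the standard Hadoop call that submits the job and blocks until it finishes.
Job job = HadoopWordCount2.getJob("hdfs:///tmp/words-in", "hdfs:///tmp/words-out");
boolean succeeded = job.waitForCompletion(true);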
  public void checkInputFormat() throws Exception {
    Job job = new Job();
    WordCountUtil wordCountUtil = new WordCountUtil("trevniMapReduceKeyValueTest");
    job.setMapperClass(Counter.class);
    FileInputFormat.setInputPaths(job, new Path(wordCountUtil.getDir().toString() + "/out/*"));
    job.setInputFormatClass(AvroTrevniKeyValueInputFormat.class);
    job.setNumReduceTasks(0);
    job.setOutputFormatClass(NullOutputFormat.class);
    total = 0;
    job.waitForCompletion(true);
    assertEquals(WordCountUtil.TOTAL, total);
  }
}