public Class<? extends InputFormat<?, ?>> getInputFormatClass() {
  return bundle.getFormatClass();
}
@Override
public String toString() {
  return new StringBuilder()
      .append(formatBundle.getFormatClass().getSimpleName())
      .append("(")
      .append(path)
      .append(")")
      .toString();
}
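These accessors only read the format class back off a bundle. For context, a minimal sketch of the other direction, building a FormatBundle for an input format and applying it to a job; TextInputFormat and the extra conf key are illustrative choices, not taken from the snippets here:

FormatBundle<TextInputFormat> bundle = FormatBundle.forInput(TextInputFormat.class)
    .set("mapreduce.input.fileinputformat.split.maxsize", "134217728"); // illustrative extra setting
Job job = Job.getInstance(new Configuration());
job.setInputFormatClass(bundle.getFormatClass()); // unwrap the bundled format class
bundle.configure(job.getConfiguration());         // copy the bundle's key/value pairs onto the job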
private static void configureJob(String namedOutput, Job job, OutputConfig outConfig)
    throws IOException {
  job.getConfiguration().set(BASE_OUTPUT_NAME, namedOutput);
  job.setOutputFormatClass(outConfig.bundle.getFormatClass());
  job.setOutputKeyClass(outConfig.keyClass);
  job.setOutputValueClass(outConfig.valueClass);
  outConfig.bundle.configure(job.getConfiguration());
}
@Override
@SuppressWarnings("unchecked")
public void configureSource(Job job, int inputId) throws IOException {
  Configuration conf = job.getConfiguration();
  // An id of -1 indicates that this is the only input, so use it directly.
  if (inputId == -1) {
    job.setMapperClass(CrunchMapper.class);
    job.setInputFormatClass(inputBundle.getFormatClass());
    inputBundle.configure(conf);
  } else {
    // There are multiple inputs for this mapper, so register it via CrunchInputs;
    // a fake path is needed to make it play well with other file-based inputs.
    Path dummy = new Path("/kafka/" + inputId);
    CrunchInputs.addInputPath(job, dummy, inputBundle, inputId);
  }
}
@Override
public void configureSource(Job job, int inputId) throws IOException {
  TableMapReduceUtil.addDependencyJars(job);
  Configuration conf = job.getConfiguration();
  conf.setStrings("io.serializations", conf.get("io.serializations"),
      ResultSerialization.class.getName());
  if (inputId == -1) {
    job.setMapperClass(CrunchMapper.class);
    job.setInputFormatClass(inputBundle.getFormatClass());
    inputBundle.configure(conf);
  } else {
    Path dummy = new Path("/hbase/" + table);
    CrunchInputs.addInputPath(job, dummy, inputBundle, inputId);
  }
}
protected void configureForMapReduce(Job job, Class keyClass, Class valueClass,
    FormatBundle formatBundle, Path outputPath, String name) {
  try {
    FileOutputFormat.setOutputPath(job, outputPath);
  } catch (Exception e) {
    throw new RuntimeException(e);
  }
  if (name == null) {
    job.setOutputFormatClass(formatBundle.getFormatClass());
    formatBundle.configure(job.getConfiguration());
    job.setOutputKeyClass(keyClass);
    job.setOutputValueClass(valueClass);
  } else {
    CrunchOutputs.addNamedOutput(job, name, formatBundle, keyClass, valueClass);
  }
}
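The method above picks between a single anonymous output and a named multi-output. A sketch of the named-output branch from the caller's side; the output format, output name, and key/value classes are illustrative only:

FormatBundle<SequenceFileOutputFormat> outBundle =
    FormatBundle.forOutput(SequenceFileOutputFormat.class);
// Register the bundle under a name so several outputs can share one job.
CrunchOutputs.addNamedOutput(job, "out0", outBundle, Text.class, LongWritable.class);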
private TaskAttemptContext getContext(String nameOutput) throws IOException {
  TaskAttemptContext taskContext = taskContextCache.get(nameOutput);
  if (taskContext != null) {
    return taskContext;
  }
  // The following trick leverages the instantiation of a record writer via
  // the job, thus supporting arbitrary output formats.
  OutputConfig outConfig = namedOutputs.get(nameOutput);
  Configuration conf = new Configuration(baseContext.getConfiguration());
  Job job = new Job(conf);
  job.getConfiguration().set("crunch.namedoutput", nameOutput);
  job.setOutputFormatClass(outConfig.bundle.getFormatClass());
  job.setOutputKeyClass(outConfig.keyClass);
  job.setOutputValueClass(outConfig.valueClass);
  outConfig.bundle.configure(job.getConfiguration());
  taskContext = TaskAttemptContextFactory.create(
      job.getConfiguration(), baseContext.getTaskAttemptID());
  taskContextCache.put(nameOutput, taskContext);
  return taskContext;
}
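A context cached this way is typically consumed by instantiating the output format reflectively and asking it for a record writer. A hedged sketch of that consuming step, reusing the outConfig and taskContext names from above (exception handling elided):

// Instantiate the bundled output format against the per-output configuration,
// then obtain a writer bound to the cached task attempt context.
OutputFormat format = ReflectionUtils.newInstance(
    outConfig.bundle.getFormatClass(), taskContext.getConfiguration());
RecordWriter writer = format.getRecordWriter(taskContext);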
@Override
public void configureSource(Job job, int inputId) throws IOException {
  if (inputId == -1) {
    FileInputFormat.addInputPath(job, path);
    job.setInputFormatClass(inputBundle.getFormatClass());
    inputBundle.configure(job.getConfiguration());
  } else {
    CrunchInputs.addInputPath(job, path, inputBundle, inputId);
  }
}
@Override
public void configureSource(Job job, int inputId) throws IOException {
  Configuration jobConf = job.getConfiguration();
  if (hcatConf == null) {
    hcatConf = configureHCatFormat(jobConf, bundle, database, table, filter);
  }
  if (inputId == -1) {
    job.setMapperClass(CrunchMapper.class);
    job.setInputFormatClass(bundle.getFormatClass());
    bundle.configure(jobConf);
  } else {
    Path dummy = new Path("/hcat/" + database + "/" + table);
    CrunchInputs.addInputPath(job, dummy, bundle, inputId);
  }
}
@Override
@SuppressWarnings("unchecked")
public void configureSource(Job job, int inputId) throws IOException {
  Configuration conf = job.getConfiguration();
  if (inputId == -1) {
    job.setMapperClass(CrunchMapper.class);
    job.setInputFormatClass(formatBundle.getFormatClass());
    formatBundle.configure(conf);
  } else {
    Path dummy = new Path("/view/" + view.getDataset().getName());
    CrunchInputs.addInputPath(job, dummy, formatBundle, inputId);
  }
}
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException, InterruptedException {
  List<InputSplit> splits = Lists.newArrayList();
  Configuration base = job.getConfiguration();
  Map<FormatBundle, Map<Integer, List<Path>>> formatNodeMap = CrunchInputs.getFormatNodeMap(job);
  // For each registered InputFormat, configure a copy of the job and collect its splits.
  for (Map.Entry<FormatBundle, Map<Integer, List<Path>>> entry : formatNodeMap.entrySet()) {
    FormatBundle inputBundle = entry.getKey();
    Configuration conf = new Configuration(base);
    inputBundle.configure(conf);
    Job jobCopy = new Job(conf);
    InputFormat<?, ?> format = (InputFormat<?, ?>) ReflectionUtils.newInstance(
        inputBundle.getFormatClass(), jobCopy.getConfiguration());
    for (Map.Entry<Integer, List<Path>> nodeEntry : entry.getValue().entrySet()) {
      Integer nodeIndex = nodeEntry.getKey();
      List<Path> paths = nodeEntry.getValue();
      FileInputFormat.setInputPaths(jobCopy, paths.toArray(new Path[paths.size()]));
      // Get splits for each input path and tag with InputFormat
      // and Mapper types by wrapping in a TaggedInputSplit.
      List<InputSplit> pathSplits = format.getSplits(jobCopy);
      for (InputSplit pathSplit : pathSplits) {
        splits.add(new CrunchInputSplit(pathSplit, inputBundle.getFormatClass(),
            nodeIndex, jobCopy.getConfiguration()));
      }
    }
  }
  return splits;
}
@Override
public List<InputSplit> getSplits(JobContext job) throws IOException, InterruptedException {
  List<InputSplit> splits = Lists.newArrayList();
  Configuration base = job.getConfiguration();
  Map<FormatBundle, Map<Integer, List<Path>>> formatNodeMap = CrunchInputs.getFormatNodeMap(job);
  // For each registered InputFormat, configure a copy of the job and collect its splits.
  for (Map.Entry<FormatBundle, Map<Integer, List<Path>>> entry : formatNodeMap.entrySet()) {
    FormatBundle inputBundle = entry.getKey();
    Configuration conf = new Configuration(base);
    inputBundle.configure(conf);
    Job jobCopy = new Job(conf);
    InputFormat<?, ?> format = (InputFormat<?, ?>) ReflectionUtils.newInstance(
        inputBundle.getFormatClass(), jobCopy.getConfiguration());
    if (format instanceof FileInputFormat
        && !conf.getBoolean(RuntimeParameters.DISABLE_COMBINE_FILE, true)) {
      format = new CrunchCombineFileInputFormat<Object, Object>(jobCopy);
    }
    for (Map.Entry<Integer, List<Path>> nodeEntry : entry.getValue().entrySet()) {
      Integer nodeIndex = nodeEntry.getKey();
      List<Path> paths = nodeEntry.getValue();
      FileInputFormat.setInputPaths(jobCopy, paths.toArray(new Path[paths.size()]));
      // Get splits for each input path and tag with InputFormat
      // and Mapper types by wrapping in a TaggedInputSplit.
      List<InputSplit> pathSplits = format.getSplits(jobCopy);
      for (InputSplit pathSplit : pathSplits) {
        splits.add(new CrunchInputSplit(pathSplit, inputBundle, nodeIndex,
            jobCopy.getConfiguration()));
      }
    }
  }
  return splits;
}
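Note that split combining in this variant is opt-in: the getBoolean lookup defaults to true, so CrunchCombineFileInputFormat is only substituted when the flag is explicitly cleared. A minimal sketch of enabling it on a job configuration:

Configuration conf = new Configuration();
// The lookup above falls back to true (combining disabled) when the key is
// unset, so combining must be switched on explicitly.
conf.setBoolean(RuntimeParameters.DISABLE_COMBINE_FILE, false);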
@Override
@SuppressWarnings("unchecked")
public void configureForMapReduce(Job job, PType<?> ptype, Path outputPath, String name) {
  Preconditions.checkNotNull(name, "Output name should not be null"); // see CRUNCH-82
  Converter converter = getConverter(ptype);
  Class<?> keyClass = converter.getKeyClass();
  Class<?> valueClass = Void.class;
  CrunchOutputs.addNamedOutput(job, name, formatBundle, keyClass, valueClass);
  job.setOutputFormatClass(formatBundle.getFormatClass());
  formatBundle.configure(job.getConfiguration());
}
@Override
public Iterator<T> read(FileSystem fs, Path path) {
  final Configuration conf = new Configuration(fs.getConf());
  bundle.configure(conf);
  ptype.initialize(conf);
  final InputFormat fmt = ReflectionUtils.newInstance(bundle.getFormatClass(), conf);
  final TaskAttemptContext ctxt = new TaskAttemptContextImpl(conf, new TaskAttemptID());
  try {
    Job job = new Job(conf);
    FileInputFormat.addInputPath(job, path);
    return Iterators.concat(Lists.transform(fmt.getSplits(job),
        new Function<InputSplit, Iterator<T>>() {
          @Override
          public Iterator<T> apply(InputSplit split) {
            try {
              RecordReader reader = fmt.createRecordReader(split, ctxt);
              reader.initialize(split, ctxt);
              return new RecordReaderIterator<T>(reader, ptype);
            } catch (Exception e) {
              LOG.error("Error reading split: " + split, e);
              throw new CrunchRuntimeException(e);
            }
          }
        }).iterator());
  } catch (Exception e) {
    LOG.error("Error reading path: " + path, e);
    throw new CrunchRuntimeException(e);
  }
}
@Override
public Iterator<HCatRecord> iterator() {
  try {
    Job job = Job.getInstance(bundle.configure(conf));
    final InputFormat fmt = ReflectionUtils.newInstance(bundle.getFormatClass(), conf);
    final TaskAttemptContext ctxt = new TaskAttemptContextImpl(conf, new TaskAttemptID());
    return Iterators.concat(Lists.transform(fmt.getSplits(job),
        new Function<InputSplit, Iterator<HCatRecord>>() {
          @Override
          public Iterator<HCatRecord> apply(InputSplit split) {
            RecordReader reader = null;
            try {
              reader = fmt.createRecordReader(split, ctxt);
              reader.initialize(split, ctxt);
            } catch (IOException | InterruptedException e) {
              throw new CrunchRuntimeException(e);
            }
            return new HCatRecordReaderIterator(reader);
          }
        }).iterator());
  } catch (Exception e) {
    throw new CrunchRuntimeException(e);
  }
}
CrunchOutputs.OutputConfig outConfig =
    CrunchOutputs.getNamedOutputs(job.getConfiguration()).get("out0");
job.setOutputFormatClass(outConfig.bundle.getFormatClass());
job.setOutputKeyClass(outConfig.keyClass);
job.setOutputValueClass(outConfig.valueClass);