/**
 * Registers a named output on the job using a key schema only.
 * <p/>
 * Convenience overload: forwards to the five-argument variant with a
 * {@code null} value schema.
 *
 * @param job               job to add the named output
 * @param namedOutput       named output name, it has to be a word, letters
 *                          and numbers only, cannot be the word 'part' as
 *                          that is reserved for the default output.
 * @param outputFormatClass OutputFormat class.
 * @param keySchema         Schema for the Key
 */
@SuppressWarnings("unchecked")
public static void addNamedOutput(
    Job job,
    String namedOutput,
    Class<? extends OutputFormat> outputFormatClass,
    Schema keySchema) {
  // No value schema is supplied by this overload.
  final Schema noValueSchema = null;
  addNamedOutput(job, namedOutput, outputFormatClass, keySchema, noValueSchema);
}
/**
 * Writes a key/value pair to the given named output.
 *
 * The named output's own name doubles as the base output path, so the
 * output path is a unique file generated for the namedOutput.
 * For example, {namedOutput}-(m|r)-{part-number}
 *
 * @param namedOutput the named output name
 * @param key         the key
 * @param value       the value
 */
@SuppressWarnings("unchecked")
public void write(String namedOutput, Object key, Object value)
    throws IOException, InterruptedException {
  // The base output path defaults to the named output's name.
  final String baseOutputPath = namedOutput;
  write(namedOutput, key, value, baseOutputPath);
}
/**
 * Initialises the per-task fields: an empty {@code AvroKey} holder and the
 * {@code AvroMultipleOutputs} helper bound to this task's context.
 *
 * @param context the task context handed to the multiple-outputs helper
 */
@Override
protected void setup(Context context) {
  // Holder starts with no datum; it is filled in before each write.
  mStats = new AvroKey<>(null);

  amos = new AvroMultipleOutputs(context);
}
/**
 * Checks if a named output name is valid and consistent with the named
 * outputs already registered in the job's configuration.
 *
 * @param job            job context whose list of named outputs is consulted
 * @param namedOutput    named output Name
 * @param alreadyDefined {@code true} when registering a new name (the name
 *                       must NOT be defined yet); {@code false} when using a
 *                       name (the name MUST be defined)
 * @throws IllegalArgumentException if the name is already defined while
 *                       {@code alreadyDefined} is {@code true}, or is not
 *                       defined while {@code alreadyDefined} is {@code false};
 *                       the token-name and base-output-path checks are
 *                       presumably also a source of this exception for
 *                       malformed names
 */
private static void checkNamedOutputName(JobContext job, String namedOutput, boolean alreadyDefined) {
  checkTokenName(namedOutput);
  checkBaseOutputPath(namedOutput);

  // Look the name up once; both branches below depend on the same answer.
  final boolean defined = getNamedOutputsList(job).contains(namedOutput);

  if (alreadyDefined && defined) {
    throw new IllegalArgumentException("Named output '" + namedOutput + "' already defined");
  }
  if (!alreadyDefined && !defined) {
    throw new IllegalArgumentException("Named output '" + namedOutput + "' not defined");
  }
}
/**
 * Writes a key/value pair for the given named output under an explicit
 * base output path.
 *
 * @param namedOutput    the named output name
 * @param key            the key
 * @param value          the value
 * @param baseOutputPath base-output path to write the record to.
 *                       Note: Framework will generate unique filename for the baseOutputPath
 * @throws IllegalArgumentException if the named output is not among the
 *                       named outputs known to this instance
 */
@SuppressWarnings("unchecked")
public void write(String namedOutput, Object key, Object value, String baseOutputPath)
    throws IOException, InterruptedException {
  // Validate both the output name and the target path before touching any writer.
  checkNamedOutputName(context, namedOutput, false);
  checkBaseOutputPath(baseOutputPath);

  final boolean known = namedOutputs.contains(namedOutput);
  if (!known) {
    throw new IllegalArgumentException("Undefined named output '" + namedOutput + "'");
  }

  final TaskAttemptContext namedContext = getContext(namedOutput);
  getRecordWriter(namedContext, baseOutputPath).write(key, value);
}
/**
 * Write key value to an output file name.
 *
 * Gets the record writer from job's output format. Job's output format should
 * be a FileOutputFormat.
 *
 * A fresh {@code Job} is derived from the task's configuration so the
 * supplied schemas are applied to that job's configuration rather than to
 * the task context's own configuration object.
 *
 * @param key            the key
 * @param value          the value
 * @param keySchema      keySchema to use (handling of {@code null} is left to
 *                       {@code setSchema})
 * @param valSchema      ValueSchema to use (handling of {@code null} is left to
 *                       {@code setSchema})
 * @param baseOutputPath base-output path to write the record to. Note: Framework will
 *                       generate unique filename for the baseOutputPath
 */
@SuppressWarnings("unchecked")
public void write(Object key, Object value, Schema keySchema, Schema valSchema, String baseOutputPath)
    throws IOException, InterruptedException {
  checkBaseOutputPath(baseOutputPath);

  // Job.getInstance replaces the deprecated Job(Configuration) constructor.
  Job job = Job.getInstance(context.getConfiguration());
  setSchema(job, keySchema, valSchema);

  TaskAttemptContext taskContext =
      createTaskAttemptContext(job.getConfiguration(), context.getTaskAttemptID());
  getRecordWriter(taskContext, baseOutputPath).write(key, value);
}
/**
 * Writes every record of the group to an output path derived from the key.
 *
 * The base output path is the string form of the URI returned by
 * {@code BaseReducer.getReducerKey}. A new {@code AvroMultipleOutputs} is
 * opened for each call and is always closed, even when a write fails.
 *
 * @param key     the group key, used to derive the output URI
 * @param values  the Avro records of this group
 * @param context the reducer context
 * @throws IOException          if writing or closing the outputs fails
 * @throws InterruptedException if the task is interrupted while writing
 */
@Override
protected void reduce(final Text key, final Iterable<AvroValue<GenericRecord>> values, final Context context)
    throws IOException, InterruptedException {
  final BaseReducer baseReducer = new BaseReducer();
  final URI uri = baseReducer.getReducerKey(key, context);
  final AvroMultipleOutputs multipleOutputs = new AvroMultipleOutputs(context);
  try {
    // Iterate with the declared element type; no raw type or cast is needed.
    for (AvroValue<GenericRecord> value : values) {
      GenericRecord rec = value.datum();
      multipleOutputs.write(new AvroKey<GenericRecord>(rec), NullWritable.get(), uri.toString());
    }
  } finally {
    multipleOutputs.close();
  }
}
}
/**
 * Handles an {@code AvroNamedOutput} annotation by registering each of its
 * (expression-evaluated) names as an Avro named output on the job.
 *
 * Names already present in {@code configured} are skipped. Avro
 * serialization is added to the job configuration at the end in every case.
 *
 * @param annotation the annotation, cast to {@code AvroNamedOutput}
 * @param job        the job being configured
 * @param target     not used by this method
 * @throws ToolException declared by the overridden method
 */
@Override
public void process(Annotation annotation, Job job, Object target) throws ToolException {
  final AvroNamedOutput avroOut = (AvroNamedOutput) annotation;
  final Schema schema = getSchema(avroOut.record());

  for (String rawName : getNames(avroOut)) {
    final String resolved = (String) evaluateExpression(rawName);
    if (configured.contains(resolved)) {
      continue;
    }
    AvroMultipleOutputs.addNamedOutput(job, resolved, avroOut.format(), schema);
    AvroMultipleOutputs.setCountersEnabled(job, avroOut.countersEnabled());
    // Recorded only after registration has succeeded.
    configured.add(resolved);
  }

  AvroSerialization.addToConfiguration(job.getConfiguration());
}
/**
 * Sums the counts for one line into a {@code TextStats} record and emits it
 * both to the job's regular output and to the "myavro3" named output.
 *
 * @param line    the text key; becomes the record's name
 * @param counts  the partial counts to add up
 * @param context the reducer context used for the regular output
 */
@Override
protected void reduce(Text line, Iterable<IntWritable> counts, Context context)
    throws IOException, InterruptedException {
  final TextStats stats = new TextStats();
  stats.count = 0;
  for (IntWritable partial : counts) {
    stats.count += partial.get();
  }
  stats.name = line.toString();

  mStats.datum(stats);
  context.write(mStats, NullWritable.get());

  // sync is invoked on the named output before the record is written to it.
  amos.sync("myavro3", "myavro3");
  amos.write("myavro3", mStats, NullWritable.get());
}
@Override
/**
 * Closes the {@code amos} multiple-outputs helper when the task finishes.
 *
 * @param context the task context (not used here)
 */
@Override
protected void cleanup(Context context)
    throws IOException, InterruptedException {
  amos.close();
}
}
/**
 * Adds a named output for the job.
 * <p/>
 * The name is appended to the job's list of named outputs, and the output
 * format class and schema(s) are stored under configuration keys derived
 * from the name. All arguments are validated before the configuration is
 * modified, so a rejected call leaves the job's configuration untouched.
 *
 * @param job               job to add the named output
 * @param namedOutput       named output name, it has to be a word, letters
 *                          and numbers only, cannot be the word 'part' as
 *                          that is reserved for the default output.
 * @param outputFormatClass OutputFormat class.
 * @param keySchema         Schema for the Key; must not be {@code null}
 * @param valueSchema       Schema for the Value (used in case of AvroKeyValueOutputFormat or null)
 * @throws IllegalArgumentException if the name fails the named-output check
 *                          (for example because it is already defined)
 * @throws NullPointerException if {@code keySchema} is {@code null}
 */
@SuppressWarnings("unchecked")
public static void addNamedOutput(Job job, String namedOutput,
    Class<? extends OutputFormat> outputFormatClass,
    Schema keySchema, Schema valueSchema) {
  checkNamedOutputName(job, namedOutput, true);

  // Fail before mutating the configuration: otherwise the name would be left
  // registered without a key schema and a retry would report it as defined.
  if (keySchema == null) {
    throw new NullPointerException(
        "keySchema must not be null for named output '" + namedOutput + "'");
  }
  final String keySchemaText = keySchema.toString();

  Configuration conf = job.getConfiguration();
  conf.set(MULTIPLE_OUTPUTS, conf.get(MULTIPLE_OUTPUTS, "") + " " + namedOutput);
  conf.setClass(MO_PREFIX + namedOutput + FORMAT, outputFormatClass, OutputFormat.class);
  conf.set(MO_PREFIX + namedOutput + ".keyschema", keySchemaText);
  if (valueSchema != null) {
    conf.set(MO_PREFIX + namedOutput + ".valueschema", valueSchema.toString());
  }
}
/**
 * Forces the end of the current block on the writer of a named output,
 * emitting a synchronization marker.
 *
 * Gets the record writer from job's output format. Job's output format should
 * be a FileOutputFormat.
 *
 * @param namedOutput    the namedOutput
 * @param baseOutputPath base-output path to write the record to. Note: Framework will
 *                       generate unique filename for the baseOutputPath
 * @return if the record writer implements Syncable, the current position as
 *         a value that may be passed to DataFileReader.seek(long);
 *         otherwise -1
 * @throws IllegalArgumentException if the named output is not among the
 *                       named outputs known to this instance
 */
@SuppressWarnings("unchecked")
public long sync(String namedOutput, String baseOutputPath)
    throws IOException, InterruptedException {
  checkNamedOutputName(context, namedOutput, false);
  checkBaseOutputPath(baseOutputPath);
  final boolean known = namedOutputs.contains(namedOutput);
  if (!known) {
    throw new IllegalArgumentException("Undefined named output '" + namedOutput + "'");
  }

  final TaskAttemptContext namedContext = getContext(namedOutput);
  final RecordWriter writer = getRecordWriter(namedContext, baseOutputPath);

  // Writers that cannot sync report -1.
  if (!(writer instanceof Syncable)) {
    return -1;
  }
  return ((Syncable) writer).sync();
}
// by being synchronized MultipleOutputTask can be use with a
/**
 * Sums the counts for one line and emits the result through the regular
 * output, two named outputs ("myavro", "myavro1") and three explicit
 * base output paths, using two different record schemas.
 *
 * @param line    the text key; stored in the records' name fields
 * @param counts  the partial counts to add up
 * @param context the reducer context used for the regular output
 */
@Override
protected void reduce(Text line, Iterable<IntWritable> counts, Context context)
    throws IOException, InterruptedException {
  final GenericData.Record primary = new GenericData.Record(STATS_SCHEMA);
  final GenericData.Record secondary = new GenericData.Record(STATS_SCHEMA_2);

  int total = 0;
  for (IntWritable partial : counts) {
    total += partial.get();
  }

  // First record: regular output plus the "myavro" named output.
  primary.put("name", new Utf8(line.toString()));
  primary.put("count", total);
  mStats.datum(primary);
  context.write(mStats, NullWritable.get());
  amos.sync("myavro", "myavro");
  amos.write("myavro", mStats, NullWritable.get());

  // Second record: from here on the shared key holder carries this datum.
  secondary.put("name1", new Utf8(line.toString()));
  secondary.put("count1", total);
  mStats.datum(secondary);
  amos.write(mStats, NullWritable.get(), STATS_SCHEMA_2, null, "testnewwrite2");
  amos.sync("myavro1", "myavro1");
  amos.write("myavro1", mStats);
  amos.write(mStats, NullWritable.get(), STATS_SCHEMA, null, "testnewwrite");
  amos.write(mStats, NullWritable.get(), "testwritenonschema");
}
/**
 * Closes the {@code amos} multiple-outputs helper when the task finishes.
 *
 * @param context the task context (not used here)
 */
@Override
protected void cleanup(Context context)
    throws IOException, InterruptedException {
  amos.close();
}
}
/**
 * Write key value to an output file name.
 *
 * Gets the record writer from job's output format. Job's output format should
 * be a FileOutputFormat.
 *
 * A fresh {@code Job} is derived from the task's configuration so the
 * supplied schemas are applied to that job's configuration rather than to
 * the task context's own configuration object.
 *
 * @param key            the key
 * @param value          the value
 * @param keySchema      keySchema to use (handling of {@code null} is left to
 *                       {@code setSchema})
 * @param valSchema      ValueSchema to use (handling of {@code null} is left to
 *                       {@code setSchema})
 * @param baseOutputPath base-output path to write the record to. Note: Framework will
 *                       generate unique filename for the baseOutputPath
 */
@SuppressWarnings("unchecked")
public void write(Object key, Object value, Schema keySchema, Schema valSchema, String baseOutputPath)
    throws IOException, InterruptedException {
  checkBaseOutputPath(baseOutputPath);

  // Job.getInstance replaces the deprecated Job(Configuration) constructor.
  Job job = Job.getInstance(context.getConfiguration());
  setSchema(job, keySchema, valSchema);

  TaskAttemptContext taskContext =
      createTaskAttemptContext(job.getConfiguration(), context.getTaskAttemptID());
  getRecordWriter(taskContext, baseOutputPath).write(key, value);
}
/**
 * Checks if a named output name is valid and consistent with the named
 * outputs already registered in the job's configuration.
 *
 * @param job            job context whose list of named outputs is consulted
 * @param namedOutput    named output Name
 * @param alreadyDefined {@code true} when registering a new name (the name
 *                       must NOT be defined yet); {@code false} when using a
 *                       name (the name MUST be defined)
 * @throws IllegalArgumentException if the name is already defined while
 *                       {@code alreadyDefined} is {@code true}, or is not
 *                       defined while {@code alreadyDefined} is {@code false};
 *                       the token-name and base-output-path checks are
 *                       presumably also a source of this exception for
 *                       malformed names
 */
private static void checkNamedOutputName(JobContext job, String namedOutput, boolean alreadyDefined) {
  checkTokenName(namedOutput);
  checkBaseOutputPath(namedOutput);

  // Look the name up once; both branches below depend on the same answer.
  final boolean defined = getNamedOutputsList(job).contains(namedOutput);

  if (alreadyDefined && defined) {
    throw new IllegalArgumentException("Named output '" + namedOutput + "' already defined");
  }
  if (!alreadyDefined && !defined) {
    throw new IllegalArgumentException("Named output '" + namedOutput + "' not defined");
  }
}
/**
 * Adds a named output for the job.
 * <p/>
 * The name is appended to the job's list of named outputs, and the output
 * format class and schema(s) are stored under configuration keys derived
 * from the name. All arguments are validated before the configuration is
 * modified, so a rejected call leaves the job's configuration untouched.
 *
 * @param job               job to add the named output
 * @param namedOutput       named output name, it has to be a word, letters
 *                          and numbers only, cannot be the word 'part' as
 *                          that is reserved for the default output.
 * @param outputFormatClass OutputFormat class.
 * @param keySchema         Schema for the Key; must not be {@code null}
 * @param valueSchema       Schema for the Value (used in case of AvroKeyValueOutputFormat or null)
 * @throws IllegalArgumentException if the name fails the named-output check
 *                          (for example because it is already defined)
 * @throws NullPointerException if {@code keySchema} is {@code null}
 */
@SuppressWarnings("unchecked")
public static void addNamedOutput(Job job, String namedOutput,
    Class<? extends OutputFormat> outputFormatClass,
    Schema keySchema, Schema valueSchema) {
  checkNamedOutputName(job, namedOutput, true);

  // Fail before mutating the configuration: otherwise the name would be left
  // registered without a key schema and a retry would report it as defined.
  if (keySchema == null) {
    throw new NullPointerException(
        "keySchema must not be null for named output '" + namedOutput + "'");
  }
  final String keySchemaText = keySchema.toString();

  Configuration conf = job.getConfiguration();
  conf.set(MULTIPLE_OUTPUTS, conf.get(MULTIPLE_OUTPUTS, "") + " " + namedOutput);
  conf.setClass(MO_PREFIX + namedOutput + FORMAT, outputFormatClass, OutputFormat.class);
  conf.set(MO_PREFIX + namedOutput + ".keyschema", keySchemaText);
  if (valueSchema != null) {
    conf.set(MO_PREFIX + namedOutput + ".valueschema", valueSchema.toString());
  }
}
// Declare the map output types: Text keys and IntWritable values.
job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(IntWritable.class);
// Register the "myavro3" named output with the TextStats key schema and no value schema.
AvroMultipleOutputs.addNamedOutput(job,"myavro3",AvroKeyOutputFormat.class,TextStats.SCHEMA$,null);
/**
 * Writes a key/value pair to an output file name without supplying schemas.
 *
 * Gets the record writer from job's output format.
 * Job's output format should be a FileOutputFormat.
 * Forwards to the schema-aware overload with {@code null} for both the key
 * and the value schema.
 *
 * @param key            the key
 * @param value          the value
 * @param baseOutputPath base-output path to write the record to.
 *                       Note: Framework will generate unique filename for the baseOutputPath
 */
public void write(Object key, Object value, String baseOutputPath)
    throws IOException, InterruptedException {
  final Schema noKeySchema = null;
  final Schema noValueSchema = null;
  write(key, value, noKeySchema, noValueSchema, baseOutputPath);
}
/**
 * Writes a key/value pair for the given named output under an explicit
 * base output path.
 *
 * @param namedOutput    the named output name
 * @param key            the key
 * @param value          the value
 * @param baseOutputPath base-output path to write the record to.
 *                       Note: Framework will generate unique filename for the baseOutputPath
 * @throws IllegalArgumentException if the named output is not among the
 *                       named outputs known to this instance
 */
@SuppressWarnings("unchecked")
public void write(String namedOutput, Object key, Object value, String baseOutputPath)
    throws IOException, InterruptedException {
  // Validate both the output name and the target path before touching any writer.
  checkNamedOutputName(context, namedOutput, false);
  checkBaseOutputPath(baseOutputPath);

  final boolean known = namedOutputs.contains(namedOutput);
  if (!known) {
    throw new IllegalArgumentException("Undefined named output '" + namedOutput + "'");
  }

  final TaskAttemptContext namedContext = getContext(namedOutput);
  getRecordWriter(namedContext, baseOutputPath).write(key, value);
}