org.apache.avro.mapred.AvroMultipleOutputs java code examples

/**
 * Registers a single (non-multi) named output on the job.
 *
 * <p>Delegates to the five-argument overload with the {@code multi} flag
 * set to {@code false}.
 *
 * @param conf              job conf that receives the named output
 * @param namedOutput       named output name; it has to be a word made of
 *                          letters and numbers only, and cannot be the word
 *                          'part' because that is reserved for the default
 *                          output
 * @param outputFormatClass OutputFormat class for this named output
 * @param schema            Schema to use for this named output
 */
public static void addNamedOutput(
    JobConf conf,
    String namedOutput,
    Class<? extends OutputFormat> outputFormatClass,
    Schema schema) {
  final boolean multi = false;
  addNamedOutput(conf, namedOutput, multi, outputFormatClass, schema);
}

/**
 * Obtains the collector for a named output using a caller-supplied base
 * file name and schema.
 *
 * <p>Forwards to the five-argument {@code getCollector}, passing
 * {@code null} as its second argument.
 *
 * @param namedOutput  the named output to collect into
 * @param schema       schema forwarded to the five-argument overload
 * @param reporter     reporter forwarded to the five-argument overload
 * @param baseFileName base file name forwarded to the five-argument overload
 * @return the collector returned by the five-argument overload
 * @throws IOException propagated from the five-argument overload
 */
@SuppressWarnings("rawtypes")
private AvroCollector getCollector(
    String namedOutput, Schema schema, Reporter reporter, String baseFileName)
    throws IOException {
  return getCollector(namedOutput, null, reporter, baseFileName, schema);
}

/**
 * Creates the {@code AvroMultipleOutputs} helper for this instance from the
 * supplied job configuration and stores it in {@code amos}.
 *
 * @param Job the job configuration passed to the AvroMultipleOutputs constructor
 */
public void configure(JobConf Job) {
  amos = new AvroMultipleOutputs(Job);
}

/**
 * Creates and initializes multiple named outputs support, it should be
 * instantiated in the Mapper/Reducer configure method.
 *
 * @param job the job configuration object
 */
public AvroMultipleOutputs(JobConf job) {
 this.conf = job;
 outputFormat = new InternalFileOutputFormat();
 namedOutputs = Collections.unmodifiableSet(
  new HashSet<>(AvroMultipleOutputs.getNamedOutputsList(job)));
 recordWriters = new HashMap<>();
 countersEnabled = getCountersEnabled(job);
}

throws IOException {
checkNamedOutputName(namedOutput);
if (!namedOutputs.contains(namedOutput)) {
 throw new IllegalArgumentException("Undefined named output '" +
  namedOutput + "'");
boolean multi = isMultiNamedOutput(conf, namedOutput);
 checkTokenName(multiName);
 getRecordWriter(namedOutput, baseFileName, reporter,schema);

/**
 * Adds a named output for the job.
 *
 * <p>Validates the name, then records the output in the job configuration:
 * its schema (only when one is supplied), its name in the space-separated
 * {@code NAMED_OUTPUTS} list, its OutputFormat class and its multi flag.
 *
 * @param conf              job conf to add the named output
 * @param namedOutput       named output name, it has to be a word, letters
 *                          and numbers only, cannot be the word 'part' as
 *                          that is reserved for the default output.
 * @param multi             indicates if the named output is multi
 * @param outputFormatClass OutputFormat class.
 * @param schema            Schema to use for this namedOutput; when
 *                          {@code null} no schema entry is written for it
 */
private static void addNamedOutput(JobConf conf, String namedOutput,
               boolean multi,
               Class<? extends OutputFormat> outputFormatClass,
               Schema schema) {
  checkNamedOutputName(namedOutput);
  // NOTE(review): the 'true' flag presumably makes this reject a name that
  // is already defined -- confirm against checkNamedOutput.
  checkNamedOutput(conf, namedOutput, true);
  if (schema != null) {
    conf.set(MO_PREFIX + namedOutput + ".schema", schema.toString());
  }
  // Append the new name to the existing space-separated list of outputs.
  conf.set(NAMED_OUTPUTS, conf.get(NAMED_OUTPUTS, "") + " " + namedOutput);
  conf.setClass(MO_PREFIX + namedOutput + FORMAT, outputFormatClass,
      OutputFormat.class);
  conf.setBoolean(MO_PREFIX + namedOutput + MULTI, multi);
}

/**
 * Sums the counts for a word and emits the (word, total) pair to several
 * outputs: the "myavro" named output via its collector, "myavro1" as a
 * string, "myavro" again with explicit schemas and custom file names, and
 * finally the job's regular collector.
 *
 * @param word      the key being reduced
 * @param counts    the counts to add up for this key
 * @param collector the job's default output collector
 * @param reporter  reporter passed through to the named outputs
 * @throws IOException if any of the collect calls fails
 */
@Override
public void reduce(Utf8 word, Iterable<Long> counts,
    AvroCollector<Pair<Utf8, Long>> collector,
    Reporter reporter) throws IOException {
  long total = 0;
  for (long occurrences : counts) {
    total += occurrences;
  }
  Pair<Utf8, Long> wordTotal = new Pair<>(word, total);

  amos.getCollector("myavro", reporter).collect(wordTotal);
  amos.collect("myavro1", reporter, wordTotal.toString());

  // Same pair written with an explicit Pair schema into a custom file.
  Schema pairSchema = new Pair<Utf8, Long>(new Utf8(""), 0L).getSchema();
  amos.collect("myavro", reporter, pairSchema, wordTotal, "testavrofile");

  // String rendering written with a STRING schema into another custom file.
  Schema stringSchema = Schema.create(Schema.Type.STRING);
  amos.collect("myavro", reporter, stringSchema, wordTotal.toString(), "testavrofile1");

  collector.collect(new Pair<>(word, total));
}

/**
 * Looks up the OutputFormat class configured for a named output.
 *
 * @param conf        job conf
 * @param namedOutput named output
 * @return the OutputFormat class stored for the named output, or
 *         {@code null} (the default passed to the lookup) if none is set
 */
public static Class<? extends OutputFormat> getNamedOutputFormatClass(
    JobConf conf, String namedOutput) {
  checkNamedOutput(conf, namedOutput, false);
  final String formatKey = MO_PREFIX + namedOutput + FORMAT;
  return conf.getClass(formatKey, null, OutputFormat.class);
}

 /**
  * Closes the {@code AvroMultipleOutputs} helper held in {@code amos}.
  *
  * @throws IOException if closing the helper fails
  */
 public void close() throws IOException {
   this.amos.close();
 }
}

/**
 * Validates a named output name: it must pass the token check and must not
 * be the reserved name 'part'.
 *
 * @param namedOutput named output name
 * @throws IllegalArgumentException if the name is 'part' (the name used for
 *         the default output)
 */
private static void checkNamedOutputName(String namedOutput) {
  checkTokenName(namedOutput);
  // 'part' is the name used for the default output, so it is off limits.
  final boolean clashesWithDefault = namedOutput.equals("part");
  if (clashesWithDefault) {
    throw new IllegalArgumentException("Named output name cannot be 'part'");
  }
}

 /**
  * Creates the record writer for the named output stored in the job under
  * {@code CONFIG_NAMED_OUTPUT}.
  *
  * <p>Works on a copy of the job configuration: the copy gets the named
  * output's OutputFormat and, if a schema was stored for the named output,
  * that schema as the map output schema (job with zero reduce tasks) or the
  * output schema (otherwise). The writer is then obtained from that
  * OutputFormat.
  *
  * @param fs           file system handed to the underlying OutputFormat
  * @param job          job configuration to read the named output settings from
  * @param baseFileName base name from which the unique output file name is derived
  * @param arg3         progress callback handed to the underlying OutputFormat
  * @return the record writer produced by the named output's OutputFormat
  * @throws IOException if the underlying OutputFormat fails to create the writer
  */
 @SuppressWarnings({"unchecked", "deprecation"})
 public RecordWriter<Object, Object> getRecordWriter(FileSystem fs,JobConf job, String baseFileName, Progressable arg3) throws IOException {
   String nameOutput = job.get(CONFIG_NAMED_OUTPUT, null);
   String fileName = getUniqueName(job, baseFileName);
   Schema schema = null;
   String schemastr = job.get(MO_PREFIX + nameOutput + ".schema", null);
   if (schemastr != null) {
     // Schema.parse(String) is deprecated; Schema.Parser is its replacement.
     schema = new Schema.Parser().parse(schemastr);
   }
   // Configure a private copy so the caller's job conf is left untouched.
   JobConf outputConf = new JobConf(job);
   outputConf.setOutputFormat(getNamedOutputFormatClass(job, nameOutput));
   boolean isMapOnly = job.getNumReduceTasks() == 0;
   if (schema != null) {
     if (isMapOnly) {
       AvroJob.setMapOutputSchema(outputConf, schema);
     } else {
       AvroJob.setOutputSchema(outputConf, schema);
     }
   }
   OutputFormat outputFormat = outputConf.getOutputFormat();
   return outputFormat.getRecordWriter(fs, outputConf, fileName, arg3);
 }
}

throws IOException {
checkNamedOutputName(namedOutput);
if (!namedOutputs.contains(namedOutput)) {
 throw new IllegalArgumentException("Undefined named output '" +
  namedOutput + "'");
boolean multi = isMultiNamedOutput(conf, namedOutput);
 checkTokenName(multiName);
 getRecordWriter(namedOutput, baseFileName, reporter,schema);

/**
 * Creates and initializes multiple named outputs support, it should be
 * instantiated in the Mapper/Reducer configure method.
 *
 * @param job the job configuration object
 */
public AvroMultipleOutputs(JobConf job) {
 this.conf = job;
 outputFormat = new InternalFileOutputFormat();
 namedOutputs = Collections.unmodifiableSet(
  new HashSet<String>(AvroMultipleOutputs.getNamedOutputsList(job)));
 recordWriters = new HashMap<String, RecordWriter>();
 countersEnabled = getCountersEnabled(job);
}

/**
 * Adds a named output for the job.
 *
 * <p>Validates the name, then records the output in the job configuration:
 * its schema (only when one is supplied), its name in the space-separated
 * {@code NAMED_OUTPUTS} list, its OutputFormat class and its multi flag.
 *
 * @param conf              job conf to add the named output
 * @param namedOutput       named output name, it has to be a word, letters
 *                          and numbers only, cannot be the word 'part' as
 *                          that is reserved for the default output.
 * @param multi             indicates if the named output is multi
 * @param outputFormatClass OutputFormat class.
 * @param schema            Schema to use for this namedOutput; when
 *                          {@code null} no schema entry is written for it
 */
private static void addNamedOutput(JobConf conf, String namedOutput,
               boolean multi,
               Class<? extends OutputFormat> outputFormatClass,
               Schema schema) {
  checkNamedOutputName(namedOutput);
  // NOTE(review): the 'true' flag presumably makes this reject a name that
  // is already defined -- confirm against checkNamedOutput.
  checkNamedOutput(conf, namedOutput, true);
  if (schema != null) {
    conf.set(MO_PREFIX + namedOutput + ".schema", schema.toString());
  }
  // Append the new name to the existing space-separated list of outputs.
  conf.set(NAMED_OUTPUTS, conf.get(NAMED_OUTPUTS, "") + " " + namedOutput);
  conf.setClass(MO_PREFIX + namedOutput + FORMAT, outputFormatClass,
      OutputFormat.class);
  conf.setBoolean(MO_PREFIX + namedOutput + MULTI, multi);
}

/**
 * Tells whether a named output was registered as a multi named output.
 *
 * @param conf        job conf
 * @param namedOutput named output
 * @return <code>true</code> if the named output is multi, <code>false</code>
 *         if it is single; <code>false</code> is also the default used when
 *         no multi flag is stored for the name
 */
public static boolean isMultiNamedOutput(JobConf conf, String namedOutput) {
  checkNamedOutput(conf, namedOutput, false);
  final String multiKey = MO_PREFIX + namedOutput + MULTI;
  return conf.getBoolean(multiKey, false);
}

 /**
  * Closes the {@code AvroMultipleOutputs} helper held in {@code amos}.
  *
  * @throws IOException if closing the helper fails
  */
 public void close() throws IOException {
   this.amos.close();
 }
}

/**
 * Validates a named output name: it must pass the token check and must not
 * be the reserved name 'part'.
 *
 * @param namedOutput named output name
 * @throws IllegalArgumentException if the name is 'part' (the name used for
 *         the default output)
 */
private static void checkNamedOutputName(String namedOutput) {
  checkTokenName(namedOutput);
  // 'part' is the name used for the default output, so it is off limits.
  final boolean clashesWithDefault = namedOutput.equals("part");
  if (clashesWithDefault) {
    throw new IllegalArgumentException("Named output name cannot be 'part'");
  }
}

 /**
  * Creates the record writer for the named output stored in the job under
  * {@code CONFIG_NAMED_OUTPUT}.
  *
  * <p>Works on a copy of the job configuration: the copy gets the named
  * output's OutputFormat and, if a schema was stored for the named output,
  * that schema as the map output schema (job with zero reduce tasks) or the
  * output schema (otherwise). The writer is then obtained from that
  * OutputFormat.
  *
  * @param fs           file system handed to the underlying OutputFormat
  * @param job          job configuration to read the named output settings from
  * @param baseFileName base name from which the unique output file name is derived
  * @param arg3         progress callback handed to the underlying OutputFormat
  * @return the record writer produced by the named output's OutputFormat
  * @throws IOException if the underlying OutputFormat fails to create the writer
  */
 @SuppressWarnings({"unchecked", "deprecation"})
 public RecordWriter<Object, Object> getRecordWriter(FileSystem fs,JobConf job, String baseFileName, Progressable arg3) throws IOException {
   String nameOutput = job.get(CONFIG_NAMED_OUTPUT, null);
   String fileName = getUniqueName(job, baseFileName);
   Schema schema = null;
   String schemastr = job.get(MO_PREFIX + nameOutput + ".schema", null);
   if (schemastr != null) {
     // Schema.parse(String) is deprecated; Schema.Parser is its replacement.
     schema = new Schema.Parser().parse(schemastr);
   }
   // Configure a private copy so the caller's job conf is left untouched.
   JobConf outputConf = new JobConf(job);
   outputConf.setOutputFormat(getNamedOutputFormatClass(job, nameOutput));
   boolean isMapOnly = job.getNumReduceTasks() == 0;
   if (schema != null) {
     if (isMapOnly) {
       AvroJob.setMapOutputSchema(outputConf, schema);
     } else {
       AvroJob.setOutputSchema(outputConf, schema);
     }
   }
   OutputFormat outputFormat = outputConf.getOutputFormat();
   return outputFormat.getRecordWriter(fs, outputConf, fileName, arg3);
 }
}

/**
 * Obtains the collector for a named output, using the named output's own
 * name as the base file name.
 *
 * <p>Forwards to the five-argument {@code getCollector}, passing
 * {@code null} as its second argument and {@code namedOutput} as the base
 * file name.
 *
 * @param namedOutput the named output to collect into
 * @param reporter    reporter forwarded to the five-argument overload
 * @param schema      schema forwarded to the five-argument overload
 * @return the collector returned by the five-argument overload
 * @throws IOException propagated from the five-argument overload
 */
@SuppressWarnings("rawtypes")
private AvroCollector getCollector(
    String namedOutput, Reporter reporter, Schema schema)
    throws IOException {
  return getCollector(namedOutput, null, reporter, namedOutput, schema);
}

/**
 * Registers a multi named output on the job.
 *
 * <p>Delegates to the five-argument {@code addNamedOutput} with the
 * {@code multi} flag set to {@code true}.
 *
 * @param conf              job conf that receives the named output
 * @param namedOutput       named output name; it has to be a word made of
 *                          letters and numbers only, and cannot be the word
 *                          'part' because that is reserved for the default
 *                          output
 * @param outputFormatClass OutputFormat class for this named output
 * @param schema            Schema to use for this named output
 */
public static void addMultiNamedOutput(
    JobConf conf,
    String namedOutput,
    Class<? extends OutputFormat> outputFormatClass,
    Schema schema) {
  final boolean multi = true;
  addNamedOutput(conf, namedOutput, multi, outputFormatClass, schema);
}

Javadoc

The AvroMultipleOutputs class simplifies writing Avro output data to multiple outputs

Case one: writing to additional outputs other than the job default output. Each additional output, or named output, may be configured with its own Schema and OutputFormat. A named output can be a single file or a multi file. The latter is referred to as a multi named output, which is an unbounded set of files all sharing the same Schema.

Case two: writing data to different files whose names are provided by the user.

AvroMultipleOutputs supports counters; by default they are disabled. The counters group is the AvroMultipleOutputs class name. The names of the counters are the same as the output names. These count the number of records written to each output name. For multi named outputs the name of the counter is the concatenation of the named output, an underscore '_' and the multiname.

Usage pattern for job submission:

 
JobConf job = new JobConf(); 
FileInputFormat.setInputPath(job, inDir); 
FileOutputFormat.setOutputPath(job, outDir); 
job.setMapperClass(MyAvroMapper.class); 
job.setReducerClass(HadoopReducer.class); 
job.set("avro.reducer",MyAvroReducer.class); 
... 
Schema schema; 
... 
// Defines additional single output 'avro1' for the job 
AvroMultipleOutputs.addNamedOutput(job, "avro1", AvroOutputFormat.class, 
schema); 
// Defines additional output 'avro2' with different schema for the job 
AvroMultipleOutputs.addNamedOutput(job, "avro2", 
AvroOutputFormat.class, 
null); // if Schema is specified as null then the default output schema is used 
... 
job.waitForCompletion(true); 
...

Usage in Reducer:

 
public class MyAvroReducer extends 
AvroReducer<K, V, OUT> { 
private AvroMultipleOutputs amos; 
public void configure(JobConf conf) { 
... 
amos = new AvroMultipleOutputs(conf); 
} 
public void reduce(K, Iterator<V> values, 
AvroCollector<OUT>, Reporter reporter) 
throws IOException { 
... 
amos.collect("avro1", reporter,datum); 
amos.getCollector("avro2", "A", reporter).collect(datum); 
amos.collect("avro1",reporter,schema,datum,"testavrofile"); // this creates a file testavrofile and writes data with schema "schema" into it, 
// and uses other values from named output "avro1", like the output class etc. 
amos.collect("avro1",reporter,schema,datum,"testavrofile1"); 
... 
} 
public void close() throws IOException { 
amos.close(); 
... 
} 
}

Most used methods

addNamedOutput
Adds a named output for the job.
getCollector
<init>
Creates and initializes multiple named outputs support, it should be instantiated in the Mapper/Redu
checkNamedOutput
Checks if a named output is already defined or not.
checkNamedOutputName
Checks if a named output name is valid.
checkTokenName
Checks if a named output name is valid token.
close
Closes all the opened named outputs. If overridden, subclasses must invoke super.close() at the end of
collect
OutputCollector with custom schema and file name.
getCountersEnabled
Returns if the counters for the named outputs are enabled or not. By default these counters are disa
getNamedOutputFormatClass
Returns the named output OutputFormat.
getNamedOutputsList
Returns list of channel names.
getRecordWriter

Popular in Java

Updating database using SQL prepared statement
getApplicationContext (Context)
getOriginalFilename (MultipartFile)
Return the original filename in the client's filesystem. This may contain path information depending
getExternalFilesDir (Context)
MessageFormat (java.text)
Produces concatenated messages in language-neutral way. New code should probably use java.util.Forma
SimpleDateFormat (java.text)
Formats and parses dates in a locale-sensitive manner. Formatting turns a Date into a String, and pa
Dictionary (java.util)
Note: Do not use this class since it is obsolete. Please use the Map interface for new implementatio
Collectors (java.util.stream)
Graphics2D (java.awt)
This Graphics2D class extends the Graphics class to provide more sophisticated control over graphics
JTable (javax.swing)
Top Vim plugins

How to use AvroMultipleOutputs in org.apache.avro.mapred

Best Java code snippets using org.apache.avro.mapred.AvroMultipleOutputs (Showing top 20 results out of 315)

How to use
AvroMultipleOutputs
in
org.apache.avro.mapred