org.apache.hadoop.mapred.JobConf.setNumReduceTasks java code examples

Refine search

/**
 * Configures and launches the map-only "generate-data" job.
 *
 * <p>Expects exactly three arguments: the input file, the output directory and the value size.
 * If the output directory already exists it is deleted recursively before the job is submitted.
 *
 * @param args {@code input-file output-dir value-size}
 * @return 0 once {@code JobClient.runJob} has returned
 * @throws Exception if {@code value-size} is not an integer, or if setting up or running the job fails
 */
public int run(String[] args) throws Exception {
  if(args.length != 3) {
    Utils.croak("USAGE: GenerateData input-file output-dir value-size");
  }
  // Parse value-size first so that a malformed number fails before the
  // existing output directory is deleted below.
  int valueSize = Integer.parseInt(args[2]);

  JobConf conf = new JobConf(getConf(), GenerateData.class);
  conf.setJobName("generate-data");
  conf.setMapperClass(GenerateDataMapper.class);
  conf.setReducerClass(IdentityReducer.class);
  // Zero reduce tasks: this is a map-only job.
  conf.setNumReduceTasks(0);
  conf.setInputFormat(TextInputFormat.class);
  conf.setOutputFormat(SequenceFileOutputFormat.class);
  // Only one output key/value class pair is set; the earlier Text/IntWritable
  // pair was overwritten by these calls before anything could read it.
  conf.setOutputKeyClass(BytesWritable.class);
  conf.setOutputValueClass(BytesWritable.class);

  Path inputPath = new Path(args[0]);
  FileInputFormat.setInputPaths(conf, inputPath);

  Path outputPath = new Path(args[1]);
  // delete output path if it already exists
  FileSystem fs = outputPath.getFileSystem(conf);
  if(fs.exists(outputPath)) {
    fs.delete(outputPath, true);
  }
  FileOutputFormat.setOutputPath(conf, outputPath);

  conf.setInt("value.size", valueSize);
  JobClient.runJob(conf);
  return 0;
}

/**
 * Runs a map-only job that reads a non-Avro sequence file and writes Avro output,
 * then passes the resulting {@code part-00000.avro} file to {@code checkFile}.
 *
 * @throws Exception if the output directory cannot be cleared or the job fails
 */
@Test
public void testNonAvroMapOnly() throws Exception {
 JobConf job = new JobConf();
 Path outputPath = new Path(OUTPUT_DIR.getRoot().getPath());
 // Use the explicit recursive overload; the single-argument delete(Path) is deprecated.
 outputPath.getFileSystem(job).delete(outputPath, true);
 // configure input for non-Avro sequence file
 job.setInputFormat(SequenceFileInputFormat.class);
 FileInputFormat.setInputPaths(job, file().toURI().toString());
 // use a hadoop mapper that emits Avro output
 job.setMapperClass(NonAvroOnlyMapper.class);
 // configure output for avro
 job.setNumReduceTasks(0);                     // map-only
 FileOutputFormat.setOutputPath(job, outputPath);
 AvroJob.setOutputSchema(job, SCHEMA);
 JobClient.runJob(job);
 checkFile(new DataFileReader<>
      (new File(outputPath.toString() + "/part-00000.avro"),
       new SpecificDatumReader<>()));
}

/**
 * Builds a {@code JobConf} from {@code TEST_UTIL}'s configuration, initialises it as a
 * multi-table snapshot mapper job for the given scans, runs it with a single reducer and
 * asserts that it succeeded. The job name is also used as the output path.
 *
 * @param jobName name given to the job (and to its output directory)
 * @param c not read by this method
 * @param scans scans handed to {@code getSnapshotScanMapping}
 */
@Override
protected void runJob(String jobName, Configuration c, List<Scan> scans)
  throws IOException, InterruptedException, ClassNotFoundException {
 final JobConf snapshotJob = new JobConf(TEST_UTIL.getConfiguration());
 snapshotJob.setJobName(jobName);
 snapshotJob.setMapperClass(Mapper.class);
 snapshotJob.setReducerClass(Reducer.class);

 TableMapReduceUtil.initMultiTableSnapshotMapperJob(
   getSnapshotScanMapping(scans),
   Mapper.class,
   ImmutableBytesWritable.class,
   ImmutableBytesWritable.class,
   snapshotJob,
   true,
   restoreDir);
 TableMapReduceUtil.addDependencyJars(snapshotJob);

 snapshotJob.setReducerClass(Reducer.class);
 // one to get final "first" and "last" key
 snapshotJob.setNumReduceTasks(1);
 final Path outputDir = new Path(snapshotJob.getJobName());
 FileOutputFormat.setOutputPath(snapshotJob, outputDir);

 LOG.info("Started " + snapshotJob.getJobName());
 final RunningJob finished = JobClient.runJob(snapshotJob);
 finished.waitForCompletion();
 final boolean succeeded = finished.isSuccessful();
 assertTrue(succeeded);
 LOG.info("After map/reduce completion - job " + jobName);
}

/**
 * Runs a map-only job with the given input format and {@code ExampleVerifier} as mapper,
 * then checks the job's row, family and value counters against the expected counts.
 *
 * @param clazz the input format class under test
 * @throws IOException if submitting the job or reading its counters fails
 */
void testInputFormat(Class<? extends InputFormat> clazz) throws IOException {
 final JobConf job = new JobConf(UTIL.getConfiguration());
 job.setInputFormat(clazz);
 job.setOutputFormat(NullOutputFormat.class);
 job.setMapperClass(ExampleVerifier.class);
 job.setNumReduceTasks(0);

 LOG.debug("submitting job.");
 final RunningJob run = JobClient.runJob(job);
 assertTrue("job failed!", run.isSuccessful());

 // Counter group names are the test class name plus a ":row" / ":family" / ":value" suffix.
 final String rowGroup = TestTableInputFormat.class.getName() + ":row";
 final String familyGroup = TestTableInputFormat.class.getName() + ":family";
 final String valueGroup = TestTableInputFormat.class.getName() + ":value";

 assertEquals("Saw the wrong number of instances of the filtered-for row.", 2,
   run.getCounters().findCounter(rowGroup, "aaa").getCounter());
 assertEquals("Saw any instances of the filtered out row.", 0,
   run.getCounters().findCounter(rowGroup, "bbb").getCounter());
 assertEquals("Saw the wrong number of instances of columnA.", 1,
   run.getCounters().findCounter(familyGroup, "columnA").getCounter());
 assertEquals("Saw the wrong number of instances of columnB.", 1,
   run.getCounters().findCounter(familyGroup, "columnB").getCounter());
 assertEquals("Saw the wrong count of values for the filtered-for row.", 2,
   run.getCounters().findCounter(valueGroup, "value aaa").getCounter());
 assertEquals("Saw the wrong count of values for the filtered-out row.", 0,
   run.getCounters().findCounter(valueGroup, "value bbb").getCounter());
}

// NOTE(review): excerpt — the start of this method's signature and its closing braces are not shown.
boolean addDependencyJars) throws IOException {
// Route the job's output through TableOutputFormat into the table named by `table`.
job.setOutputFormat(TableOutputFormat.class);
job.setReducerClass(reducer);
job.set(TableOutputFormat.OUTPUT_TABLE, table);
job.setOutputKeyClass(ImmutableBytesWritable.class);
job.setOutputValueClass(Put.class);
// Keep whatever "io.serializations" already holds and append the Mutation and Result serializations.
job.setStrings("io.serializations", job.get("io.serializations"),
  MutationSerialization.class.getName(), ResultSerialization.class.getName());
  // NOTE(review): the result of getRegionCount is discarded here, yet `regions` is read below;
  // the assignment appears to be missing from this excerpt — confirm against the full source.
  MetaTableAccessor.getRegionCount(HBaseConfiguration.create(job), TableName.valueOf(table));
 // Lower the reducer count to `regions` when more reducers than that are configured.
 if (job.getNumReduceTasks() > regions) {
  job.setNumReduceTasks(regions);

 /**
  * Runs a map-only job that reads text from {@code DIR/in} with {@code AvroTestConverter}
  * as mapper and writes Avro pairs (long key, {@code createSchema()} value) to {@code DIR/out}.
  *
  * @throws Exception if the output directory cannot be cleared or the job fails
  */
 @Test
 public void testJob() throws Exception {
  JobConf job = new JobConf();
  Path outputPath = new Path(DIR.getRoot().getPath() + "/out");
  // Use the explicit recursive overload; the single-argument delete(Path) is deprecated.
  outputPath.getFileSystem(job).delete(outputPath, true);

  job.setInputFormat(TextInputFormat.class);
  FileInputFormat.setInputPaths(job, DIR.getRoot().getPath() + "/in");

  job.setMapperClass(AvroTestConverter.class);
  job.setNumReduceTasks(0); // map-only

  FileOutputFormat.setOutputPath(job, outputPath);
  System.out.println(createSchema());
  AvroJob.setOutputSchema(job,
    Pair.getPairSchema(Schema.create(Schema.Type.LONG), createSchema()));
  job.setOutputFormat(AvroOutputFormat.class);

  JobClient.runJob(job);
 }
}

// NOTE(review): excerpt — the enclosing method and the braces that close this `if` are not shown,
// and lines appear to have been dropped between the statements below.
// Non-Avro case: BytesWritable map outputs with the HadoopStoreBuilder partitioner and reducer.
if(!isAvro) {
  conf.setPartitionerClass(HadoopStoreBuilderPartitioner.class);
  conf.setMapperClass(mapperClass);
  conf.setMapOutputKeyClass(BytesWritable.class);
  conf.setMapOutputValueClass(BytesWritable.class);
  conf.setReducerClass(HadoopStoreBuilderReducer.class);
conf.setOutputKeyClass(BytesWritable.class);
conf.setOutputValueClass(BytesWritable.class);
conf.setJarByClass(getClass());
// Speculative execution is switched off for reduce tasks.
conf.setReduceSpeculativeExecution(false);
FileInputFormat.setInputPaths(conf, inputPath);
conf.set("final.output.dir", outputDir.toString());
conf.set(VoldemortBuildAndPushJob.CHECKSUM_TYPE, CheckSum.toString(checkSumType));
conf.set("dfs.umaskmode", "002");
// The job writes to tempDir; outputDir is only recorded under "final.output.dir" above.
FileOutputFormat.setOutputPath(conf, tempDir);
conf.setNumReduceTasks(numReducers);
  // NOTE(review): presumably the Avro branch (ByteBuffer outputs, Avro reducer) — the
  // condition guarding these lines is not visible in this excerpt; confirm.
  conf.setOutputKeyClass(ByteBuffer.class);
  conf.setOutputValueClass(ByteBuffer.class);
  conf.setReducerClass(AvroStoreBuilderReducer.class);

/**
 * Uses default mapper with no reduces for a map-only identity job.
 *
 * <p>After the job has run, the records of the job's {@code part-00000.avro} output are
 * compared, in order, with the records of the {@code weather.avro} input file.
 *
 * @throws Exception if the job fails or either Avro file cannot be read
 */
@Test
@SuppressWarnings("deprecation")
public void testMapOnly() throws Exception {
 JobConf job = new JobConf();
 String inDir = System.getProperty("share.dir","../../../share")+"/test/data";
 Path input = new Path(inDir+"/weather.avro");
 Path output = new Path("target/test/weather-ident");
 output.getFileSystem(job).delete(output);
 job.setJobName("identity map weather");
 AvroJob.setInputSchema(job, Weather.SCHEMA$);
 AvroJob.setOutputSchema(job, Weather.SCHEMA$);
 FileInputFormat.setInputPaths(job, input);
 FileOutputFormat.setOutputPath(job, output);
 FileOutputFormat.setCompressOutput(job, true);
 job.setNumReduceTasks(0);                     // map-only
 JobClient.runJob(job);
 // check output is correct
 DatumReader<Weather> reader = new SpecificDatumReader<>();
 // try-with-resources closes both readers even when an assertion fails
 // or when opening the second file throws.
 try (DataFileReader<Weather> check = new DataFileReader<>
        (new File(inDir + "/weather.avro"), reader);
      DataFileReader<Weather> sorted = new DataFileReader<>
        (new File(output.toString() + "/part-00000.avro"), reader)) {
  for (Weather w : sorted) {
   assertEquals(check.next(), w);
  }
 }
}

// Starts a JobConf from the given HiveConf for a map-only job: CompactorMap as mapper,
// CompactorInputFormat as input, NullOutputFormat as output and NullWritable output key/value.
// NOTE(review): excerpt — the remainder of the method (including its return) is not shown.
private JobConf createBaseJobConf(HiveConf conf, String jobName, Table t, StorageDescriptor sd,
                 ValidWriteIdList writeIds, CompactionInfo ci) {
 JobConf job = new JobConf(conf);
 job.setJobName(jobName);
 job.setOutputKeyClass(NullWritable.class);
 job.setOutputValueClass(NullWritable.class);
 job.setJarByClass(CompactorMR.class);
 LOG.debug("User jar set to " + job.getJar());
 job.setMapperClass(CompactorMap.class);
 // No reduce tasks: map-only job.
 job.setNumReduceTasks(0);
 job.setInputFormat(CompactorInputFormat.class);
 job.setOutputFormat(NullOutputFormat.class);

/**
 * Runs a map-only job named "AvroMultipleOutputs_noreducer" over a freshly written
 * {@code lines.avro} input, with one extra named output ("myavro2") registered.
 *
 * @throws Exception if the input cannot be written, the output cannot be cleared or the job fails
 */
@SuppressWarnings("deprecation")
public void testJobNoreducer() throws Exception {
 final JobConf noReducerJob = new JobConf();
 noReducerJob.setNumReduceTasks(0);

 // Clear any previous output, then write the input file.
 final Path outDir = new Path(OUTPUT_DIR.getRoot().getPath());
 outDir.getFileSystem(noReducerJob).delete(outDir);
 final File linesFile = new File(INPUT_DIR.getRoot(),"lines.avro");
 WordCountUtil.writeLinesFile(linesFile);

 noReducerJob.setJobName("AvroMultipleOutputs_noreducer");
 final Schema inputSchema = Schema.create(Schema.Type.STRING);
 AvroJob.setInputSchema(noReducerJob, inputSchema);
 final Schema outputSchema = new Pair<Utf8, Long>(new Utf8(""), 0L).getSchema();
 AvroJob.setOutputSchema(noReducerJob, outputSchema);
 AvroJob.setMapperClass(noReducerJob, MapImpl.class);

 final Path inDir = new Path(INPUT_DIR.getRoot().toString());
 FileInputFormat.setInputPaths(noReducerJob, inDir);
 FileOutputFormat.setOutputPath(noReducerJob, outDir);
 FileOutputFormat.setCompressOutput(noReducerJob, false);

 final Schema namedOutputSchema = Schema.create(Schema.Type.STRING);
 AvroMultipleOutputs.addNamedOutput(
   noReducerJob, "myavro2", AvroOutputFormat.class, namedOutputSchema);

 JobClient.runJob(noReducerJob);
}

// Starts a JobConf from the given HiveConf for a map-only job: CompactorMap as mapper,
// CompactorInputFormat as input, NullOutputFormat as output and NullWritable output key/value.
// NOTE(review): excerpt — the remainder of the method (including its return) is not shown.
private JobConf createBaseJobConf(HiveConf conf, String jobName, Table t, StorageDescriptor sd,
                 ValidTxnList txns, CompactionInfo ci) {
 JobConf job = new JobConf(conf);
 job.setJobName(jobName);
 job.setOutputKeyClass(NullWritable.class);
 job.setOutputValueClass(NullWritable.class);
 job.setJarByClass(CompactorMR.class);
 LOG.debug("User jar set to " + job.getJar());
 job.setMapperClass(CompactorMap.class);
 // No reduce tasks: map-only job.
 job.setNumReduceTasks(0);
 job.setInputFormat(CompactorInputFormat.class);
 job.setOutputFormat(NullOutputFormat.class);

// NOTE(review): excerpt from a larger method; `job` and `work` are defined outside this view.
// Map-only job: the mapper class comes from `work`, output goes through HiveOutputFormatImpl
// with NullWritable output key/value.
HiveFileFormatUtils.prepareJobOutput(job);
job.setOutputFormat(HiveOutputFormatImpl.class);
job.setMapperClass(work.getMapperClass());
job.setNumReduceTasks(0);
job.setOutputKeyClass(NullWritable.class);
job.setOutputValueClass(NullWritable.class);

// NOTE(review): excerpt from a larger method; `job` and `work` are defined outside this view.
// Map-only merge job: input format comes from `work`, MergeFileMapper is the mapper, and both
// the map output and the job output use NullWritable key/value.
job.setInputFormat(work.getInputformatClass());
job.setOutputFormat(HiveOutputFormatImpl.class);
job.setMapperClass(MergeFileMapper.class);
job.setMapOutputKeyClass(NullWritable.class);
job.setMapOutputValueClass(NullWritable.class);
job.setOutputKeyClass(NullWritable.class);
job.setOutputValueClass(NullWritable.class);
job.setNumReduceTasks(0);

// NOTE(review): excerpt from a larger method; `job` and `work` are defined outside this view.
// Map-only job: the mapper class comes from `work`, output goes through HiveOutputFormatImpl
// with NullWritable output key/value.
HiveFileFormatUtils.prepareJobOutput(job);
job.setOutputFormat(HiveOutputFormatImpl.class);
job.setMapperClass(work.getMapperClass());
job.setNumReduceTasks(0);
job.setOutputKeyClass(NullWritable.class);
job.setOutputValueClass(NullWritable.class);

// NOTE(review): excerpt from a larger method; lines have been dropped between the statements
// below (the `try` matching the `catch`, and the start of the call that ends on the "\n" line).
job.setMapperClass(ExecMapper.class);
// Reducer count comes from rWork when it is present; otherwise the job has no reduce tasks.
job.setNumReduceTasks(rWork != null ? rWork.getNumReduceTasks().intValue() : 0);
job.setReducerClass(ExecReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
   // Fallback when sampling data is insufficient: use a single reducer on both rWork and the job.
   console.printInfo("Not enough sampling data.. Rolling back to single reducer task");
   rWork.setNumReduceTasks(1);
   job.setNumReduceTasks(1);
  } catch (Exception e) {
   // Any sampling failure is logged and also falls back to a single reducer.
   LOG.error("Sampling error", e);
     "\n" + org.apache.hadoop.util.StringUtils.stringifyException(e));
   rWork.setNumReduceTasks(1);
   job.setNumReduceTasks(1);

// NOTE(review): excerpt from a larger method; `job` and `work` are defined outside this view.
// Map-only merge job: input format comes from `work`, MergeFileMapper is the mapper, and both
// the map output and the job output use NullWritable key/value.
job.setInputFormat(work.getInputformatClass());
job.setOutputFormat(HiveOutputFormatImpl.class);
job.setMapperClass(MergeFileMapper.class);
job.setMapOutputKeyClass(NullWritable.class);
job.setMapOutputValueClass(NullWritable.class);
job.setOutputKeyClass(NullWritable.class);
job.setOutputValueClass(NullWritable.class);
job.setNumReduceTasks(0);

// NOTE(review): excerpt from a larger method; lines have been dropped between the statements
// below (the `try` matching the `catch`, and the start of the call that ends on the "\n" line).
job.setMapperClass(ExecMapper.class);
// Reducer count comes from rWork when it is present; otherwise the job has no reduce tasks.
job.setNumReduceTasks(rWork != null ? rWork.getNumReduceTasks().intValue() : 0);
job.setReducerClass(ExecReducer.class);
job.setOutputKeyClass(Text.class);
job.setOutputValueClass(Text.class);
   // Fallback when sampling data is insufficient: use a single reducer on both rWork and the job.
   console.printInfo("Not enough sampling data.. Rolling back to single reducer task");
   rWork.setNumReduceTasks(1);
   job.setNumReduceTasks(1);
  } catch (Exception e) {
   // Any sampling failure is logged and also falls back to a single reducer.
   LOG.error("Sampling error", e);
     "\n" + org.apache.hadoop.util.StringUtils.stringifyException(e));
   rWork.setNumReduceTasks(1);
   job.setNumReduceTasks(1);

// NOTE(review): excerpt from a larger method; `job` and `work` are defined outside this view.
// Map-only job: the mapper class comes from `work`, output goes through HiveOutputFormatImpl
// with NullWritable output key/value.
HiveFileFormatUtils.prepareJobOutput(job);
job.setOutputFormat(HiveOutputFormatImpl.class);
job.setMapperClass(work.getMapperClass());
job.setNumReduceTasks(0);
job.setOutputKeyClass(NullWritable.class);
job.setOutputValueClass(NullWritable.class);

/**
 * Builds the configuration for the "ConCmptBlock_pass2" job: {@code MapStage2} and
 * {@code RedStage2} read from {@code tempbm_path} and write to {@code nextbm_path},
 * using {@code nreducers} reduce tasks and IntWritable/Text output.
 *
 * @return the populated job configuration
 * @throws Exception if the configuration cannot be built
 */
protected JobConf configStage2() throws Exception
{
  final JobConf stage2 = new JobConf(getConf(), ConCmptBlock.class);

  // Identity and tunables.
  stage2.setJobName("ConCmptBlock_pass2");
  stage2.set("block_width", "" + block_width);

  // Processing classes and parallelism.
  stage2.setMapperClass(MapStage2.class);
  stage2.setReducerClass(RedStage2.class);
  stage2.setNumReduceTasks(nreducers);

  // Input and output locations and types.
  FileInputFormat.setInputPaths(stage2, tempbm_path);
  FileOutputFormat.setOutputPath(stage2, nextbm_path);
  stage2.setOutputKeyClass(IntWritable.class);
  stage2.setOutputValueClass(Text.class);

  return stage2;
}

/**
 * Builds the configuration for the "ConCmptBlock_pass1" job: {@code MapStage1} and
 * {@code RedStage1} read from {@code edge_path} and {@code curbm_path} and write to
 * {@code tempbm_path}, using {@code nreducers} reduce tasks and IntWritable/Text output.
 *
 * @return the populated job configuration
 * @throws Exception if the configuration cannot be built
 */
protected JobConf configStage1() throws Exception
{
  final JobConf stage1 = new JobConf(getConf(), ConCmptBlock.class);

  // Identity and tunables.
  stage1.setJobName("ConCmptBlock_pass1");
  stage1.set("block_width", "" + block_width);
  stage1.set("recursive_diagmult", "" + recursive_diagmult);

  // Processing classes and parallelism.
  stage1.setMapperClass(MapStage1.class);
  stage1.setReducerClass(RedStage1.class);
  stage1.setNumReduceTasks(nreducers);

  // Input and output locations and types.
  FileInputFormat.setInputPaths(stage1, edge_path, curbm_path);
  FileOutputFormat.setOutputPath(stage1, tempbm_path);
  stage1.setOutputKeyClass(IntWritable.class);
  stage1.setOutputValueClass(Text.class);

  return stage1;
}

Javadoc

Set the requisite number of reduce tasks for this job.

How many reduces?

The right number of reduces seems to be 0.95 or 1.75 multiplied by (<no. of nodes> * mapred.tasktracker.reduce.tasks.maximum).

With 0.95, all of the reduces can launch immediately and start transferring map outputs as the maps finish. With 1.75, the faster nodes will finish their first round of reduces and launch a second wave of reduces, doing a much better job of load balancing.

Increasing the number of reduces increases the framework overhead, but increases load balancing and lowers the cost of failures.

The scaling factors above are slightly less than whole numbers to reserve a few reduce slots in the framework for speculative-tasks, failures etc.

Reducer NONE

It is legal to set the number of reduce-tasks to zero.

In this case the output of the map-tasks goes directly to the distributed file-system, to the path set by FileOutputFormat#setOutputPath(JobConf,Path). Also, the framework doesn't sort the map-outputs before writing them out to HDFS.

Popular methods of JobConf

<init>
A new map/reduce configuration where the behavior of reading from the default resources can be turned off.
set
get
setInputFormat
Set the InputFormat implementation for the map-reduce job.
setOutputFormat
Set the OutputFormat implementation for the map-reduce job.
getInt
setMapperClass
Set the Mapper class for the job.
setOutputKeyClass
Set the key class for the job output data.
setOutputValueClass
Set the value class for job outputs.
setReducerClass
Set the Reducer class for the job.
setBoolean
setJobName
Set the user-specified job name.

Popular in Java

Making http requests using okhttp
requestLocationUpdates (LocationManager)
orElseThrow (Optional)
Return the contained value, if present, otherwise throw an exception to be created by the provided s
getOriginalFilename (MultipartFile)
Return the original filename in the client's filesystem.This may contain path information depending
Timestamp (java.sql)
A Java representation of the SQL TIMESTAMP type. It provides the capability of representing the SQL
MessageFormat (java.text)
Produces concatenated messages in language-neutral way. New code should probably use java.util.Forma
AtomicInteger (java.util.concurrent.atomic)
An int value that may be updated atomically. See the java.util.concurrent.atomic package specificati
Stream (java.util.stream)
A sequence of elements supporting sequential and parallel aggregate operations. The following exampl
Window (java.awt)
A Window object is a top-level window with no borders and no menubar. The default layout for a windo
Reference (javax.naming)
Top Sublime Text plugins

How to use the setNumReduceTasks method in org.apache.hadoop.mapred.JobConf

Best Java code snippets using org.apache.hadoop.mapred.JobConf.setNumReduceTasks (Showing top 20 results out of 648)

Refine search

How to use
setNumReduceTasks
method
in
org.apache.hadoop.mapred.JobConf