org.apache.hadoop.mapred.JobConf.setInt java code examples

/**
 * Configures how many rows each scanner iteration returns and caches.
 * The value is stored in the job configuration under
 * {@code hbase.client.scanner.caching}. Larger values make mapreduce jobs
 * faster, but more heap is needed to hold the cached rows.
 *
 * @param job The current job configuration to adjust.
 * @param batchSize The number of rows to return in batch with each scanner
 * iteration.
 */
public static void setScannerCaching(JobConf job, int batchSize) {
 final String scannerCachingKey = "hbase.client.scanner.caching";
 job.setInt(scannerCachingKey, batchSize);
}

/**
 * Set the sync interval to be used by the underlying {@link DataFileWriter}.
 *
 * @param job the job configuration that receives the setting
 * @param syncIntervalInBytes the value stored under {@code SYNC_INTERVAL_KEY}
 */
public static void setSyncInterval(JobConf job, int syncIntervalInBytes) {
 final int intervalBytes = syncIntervalInBytes;
 job.setInt(SYNC_INTERVAL_KEY, intervalBytes);
}

/**
 * Keeps a reference to {@code conf} and constructs a {@link MiniMRCluster}
 * from a {@link JobConf} copy of it that carries a single "default" capacity
 * scheduler queue and small, fixed memory limits.
 *
 * @param conf base configuration; stored as-is and copied into the cluster's JobConf
 * @param numberOfTaskTrackers passed through to {@link MiniMRCluster}
 * @param nameNode passed through to {@link MiniMRCluster}
 * @param numDir passed through to {@link MiniMRCluster}
 * @throws IOException declared by this constructor
 */
public MiniMrShim(Configuration conf, int numberOfTaskTrackers,
         String nameNode, int numDir) throws IOException {
 this.conf = conf;

 final JobConf clusterConf = new JobConf(conf);

 // Capacity scheduler: one queue named "default" with 100% capacity.
 clusterConf.set("yarn.scheduler.capacity.root.queues", "default");
 clusterConf.set("yarn.scheduler.capacity.root.default.capacity", "100");

 // Memory settings for map tasks, reduce tasks and the MR application master.
 clusterConf.setInt(MRJobConfig.MAP_MEMORY_MB, 512);
 clusterConf.setInt(MRJobConfig.REDUCE_MEMORY_MB, 512);
 clusterConf.setInt(MRJobConfig.MR_AM_VMEM_MB, 128);

 // Node manager memory and scheduler allocation bounds.
 clusterConf.setInt(YarnConfiguration.YARN_MINICLUSTER_NM_PMEM_MB, 512);
 clusterConf.setInt(YarnConfiguration.RM_SCHEDULER_MINIMUM_ALLOCATION_MB, 128);
 clusterConf.setInt(YarnConfiguration.RM_SCHEDULER_MAXIMUM_ALLOCATION_MB, 512);

 mr = new MiniMRCluster(numberOfTaskTrackers, nameNode, numDir, null, null, clusterConf);
}

/**
 * Copies the first two ints of each event's payload into the job
 * configuration as the LLAP bucket count and bucket id.
 *
 * @param arg0 events to process; each one is cast to {@link CustomProcessorEvent}
 */
@Override
public void handleEvents(List<Event> arg0) {
 // Currently only Bucket MapJoin sends events here, so at most one is expected.
 assert arg0.size() <= 1;
 for (Event rawEvent : arg0) {
  ByteBuffer payload = ((CustomProcessorEvent) rawEvent).getPayload();
  // View the payload as ints: index 0 is the bucket count, index 1 the bucket id.
  IntBuffer payloadInts = payload.asIntBuffer();
  jobConf.setInt(Constants.LLAP_NUM_BUCKETS, payloadInts.get(0));
  jobConf.setInt(Constants.LLAP_BUCKET_ID, payloadInts.get(1));
 }
}

/**
 * Turns on output compression for the job and records the deflate level to use.
 *
 * @param job the job configuration to modify
 * @param level the value stored under {@code AvroOutputFormat.DEFLATE_LEVEL_KEY}
 */
public static void setDeflateLevel(JobConf job, int level) {
 final boolean compressOutput = true;
 FileOutputFormat.setCompressOutput(job, compressOutput);
 job.setInt(AvroOutputFormat.DEFLATE_LEVEL_KEY, level);
}

/**
 * Turns on output compression for the job and records the deflate level to use.
 *
 * @param job the job configuration to modify
 * @param level the value stored under {@code DEFLATE_LEVEL_KEY}
 */
public static void setDeflateLevel(JobConf job, int level) {
 final boolean compressOutput = true;
 FileOutputFormat.setCompressOutput(job, compressOutput);
 job.setInt(DEFLATE_LEVEL_KEY, level);
}

/**
 * Publishes an MR-style task attempt id and the task partition in the job
 * configuration.
 *
 * <p>The id has the form
 * {@code attempt_<clusterTimestamp>_<jobId>_<m|r>_<taskIndex>_<attemptNumber>}.
 *
 * @param processorContext source of the application id, task index and attempt number
 */
private void setupMRLegacyConfigs(ProcessorContext processorContext) {
 // Hive "insert overwrite local directory" names directories after the task id,
 // so exposing the id in the jobconf yields directory names similar to MR's.
 String taskAttemptIdStr = "attempt_"
   + processorContext.getApplicationId().getClusterTimestamp()
   + "_"
   + jobIdFormat.format(processorContext.getApplicationId().getId())
   + "_"
   + (isMap ? "m_" : "r_")
   + taskIdFormat.format(processorContext.getTaskIndex())
   + "_"
   + processorContext.getTaskAttemptNumber();

 // In MR, mapreduce.task.attempt.id is same as mapred.task.id. Go figure.
 this.jobConf.set("mapred.task.id", taskAttemptIdStr);
 this.jobConf.set("mapreduce.task.attempt.id", taskAttemptIdStr);
 this.jobConf.setInt("mapred.task.partition", processorContext.getTaskIndex());
}

/**
 * Creates the temporary output file for the Hadoop RecordWriter.
 *
 * <p>Builds a task attempt id of the form {@code attempt__0000_r_NNNNNN_0}, where
 * {@code NNNNNN} is {@code taskNumber + 1} left-padded with zeros to six characters,
 * publishes it together with the partition number in the job configuration, sets up
 * the job through the output committer and obtains the record writer.
 *
 * @param taskNumber The number of the parallel instance.
 * @param numTasks The number of parallel tasks.
 * @throws java.io.IOException if {@code taskNumber + 1} needs more than six
 *     characters, or if one of the Hadoop calls made here fails
 */
@Override
public void open(int taskNumber, int numTasks) throws IOException {
  // enforce sequential open() calls
  synchronized (OPEN_MUTEX) {
    final int partition = taskNumber + 1;
    final String taskId = Integer.toString(partition);
    if (taskId.length() > 6) {
      throw new IOException("Task id too large.");
    }

    // Left-pad by hand. Building the padding with String.format("%" + width + "s", " ")
    // yields the pattern "%0s" when width is 0 (six-character ids), which
    // java.util.Formatter rejects with an IllegalFormatException even though the
    // length check above allows such ids.
    final StringBuilder paddedTaskId = new StringBuilder(6);
    for (int i = taskId.length(); i < 6; i++) {
      paddedTaskId.append('0');
    }
    paddedTaskId.append(taskId);

    TaskAttemptID taskAttemptID =
        TaskAttemptID.forName("attempt__0000_r_" + paddedTaskId + "_0");

    this.jobConf.set("mapred.task.id", taskAttemptID.toString());
    this.jobConf.setInt("mapred.task.partition", partition);
    // for hadoop 2.2
    this.jobConf.set("mapreduce.task.attempt.id", taskAttemptID.toString());
    this.jobConf.setInt("mapreduce.task.partition", partition);

    this.context = new TaskAttemptContextImpl(this.jobConf, taskAttemptID);
    this.outputCommitter = this.jobConf.getOutputCommitter();

    JobContext jobContext = new JobContextImpl(this.jobConf, new JobID());
    this.outputCommitter.setupJob(jobContext);

    this.recordWriter = this.mapredOutputFormat.getRecordWriter(
        null, this.jobConf, taskId, new HadoopDummyProgressable());
  }
}

/**
 * Stores an MR-compatible task attempt id and the task partition in the job
 * configuration.
 *
 * <p>The generated id looks like
 * {@code attempt_<clusterTimestamp>_<jobId>_<m|r>_<taskIndex>_<attemptNumber>}.
 *
 * @param processorContext provides the application id, task index and attempt number
 */
private void setupMRLegacyConfigs(ProcessorContext processorContext) {
 // Hive "insert overwrite local directory" uses the task id as directory name;
 // putting the id into the jobconf keeps those names similar to MR's.
 final String taskKind = isMap ? "m_" : "r_";

 String taskAttemptIdStr = new StringBuilder("attempt_")
   .append(processorContext.getApplicationId().getClusterTimestamp())
   .append("_")
   .append(jobIdFormat.format(processorContext.getApplicationId().getId()))
   .append("_")
   .append(taskKind)
   .append(taskIdFormat.format(processorContext.getTaskIndex()))
   .append("_")
   .append(processorContext.getTaskAttemptNumber())
   .toString();

 // In MR, mapreduce.task.attempt.id is same as mapred.task.id. Go figure.
 this.jobConf.set("mapred.task.id", taskAttemptIdStr);
 this.jobConf.set("mapreduce.task.attempt.id", taskAttemptIdStr);
 this.jobConf.setInt("mapred.task.partition", processorContext.getTaskIndex());
}

 /**
  * Stores an MR-style task attempt id and the task partition in the job
  * configuration, using values from the current Spark {@code TaskContext}.
  */
 private void setupMRLegacyConfigs() {
  // Hive requires this TaskAttemptId to be unique. MR's TaskAttemptId is composed
  // of "attempt_timestamp_jobNum_m/r_taskNum_attemptNum". The counterpart for
  // Spark should be "attempt_timestamp_stageNum_m/r_partitionId_attemptNum".
  // When there're multiple attempts for a task, Hive will rely on the partitionId
  // to figure out if the data are duplicate or not when collecting the final outputs
  // (see org.apache.hadoop.hive.ql.exec.Utils.removeTempOrDuplicateFiles)
  String taskAttemptIdStr = "attempt_"
    + System.currentTimeMillis()
    + "_"
    + stageIdFormat.format(TaskContext.get().stageId())
    + "_"
    + (isMap() ? "m_" : "r_")
    + taskIdFormat.format(TaskContext.get().partitionId())
    + "_"
    + TaskContext.get().attemptNumber();

  jobConf.set("mapred.task.id", taskAttemptIdStr);
  jobConf.set("mapreduce.task.attempt.id", taskAttemptIdStr);
  jobConf.setInt("mapred.task.partition", TaskContext.get().partitionId());
 }
}

/**
 * Test for {@link Utilities#getInputPaths(JobConf, MapWork, Path, Context, boolean)} with multiple
 * threads: the input listing thread limit is set to 2.
 */
@Test
public void testGetInputPathsWithMultipleThreads() throws Exception {
 final int partitionCount = 5;
 final int listingThreads = 2;
 JobConf jobConf = new JobConf();
 jobConf.setInt(HiveConf.ConfVars.HIVE_EXEC_INPUT_LISTING_MAX_THREADS.varname, listingThreads);
 runTestGetInputPaths(jobConf, partitionCount);
}

/**
 * Test for {@link Utilities#getInputPaths(JobConf, MapWork, Path, Context, boolean)} with a single
 * thread: the input listing thread limit is set to 1.
 */
@Test
public void testGetInputPathsWithASingleThread() throws Exception {
 final int partitionCount = 5;
 final int listingThreads = 1;
 JobConf jobConf = new JobConf();
 jobConf.setInt(HiveConf.ConfVars.HIVE_EXEC_INPUT_LISTING_MAX_THREADS.varname, listingThreads);
 runTestGetInputPaths(jobConf, partitionCount);
}

/**
 * Configures and runs the "generate-data" job with zero reduce tasks.
 *
 * <p>Expects exactly three arguments: the input file, the output directory and
 * the value size. An existing output directory is deleted recursively before
 * the job is submitted.
 *
 * @param args {@code input-file output-dir value-size}
 * @return 0 once {@code JobClient.runJob} has returned
 * @throws NumberFormatException if {@code value-size} is not an integer; this is
 *     detected before the output directory is touched
 * @throws Exception if configuring or running the job fails
 */
public int run(String[] args) throws Exception {
  if (args.length != 3) {
    Utils.croak("USAGE: GenerateData input-file output-dir value-size");
  }

  // Validate the numeric argument before anything destructive happens, so a bad
  // value-size cannot cause the existing output directory to be deleted first.
  final int valueSize = Integer.parseInt(args[2]);

  JobConf conf = new JobConf(getConf(), GenerateData.class);
  conf.setJobName("generate-data");
  conf.setMapperClass(GenerateDataMapper.class);
  conf.setReducerClass(IdentityReducer.class);
  conf.setNumReduceTasks(0);
  conf.setInputFormat(TextInputFormat.class);
  conf.setOutputFormat(SequenceFileOutputFormat.class);
  // The output key/value classes are set once here; earlier assignments to
  // Text/IntWritable were overwritten by these and have been dropped.
  conf.setOutputKeyClass(BytesWritable.class);
  conf.setOutputValueClass(BytesWritable.class);

  Path inputPath = new Path(args[0]);
  FileInputFormat.setInputPaths(conf, inputPath);

  Path outputPath = new Path(args[1]);
  // delete output path if it already exists
  FileSystem fs = outputPath.getFileSystem(conf);
  if (fs.exists(outputPath)) {
    fs.delete(outputPath, true);
  }
  FileOutputFormat.setOutputPath(conf, outputPath);

  conf.setInt("value.size", valueSize);
  JobClient.runJob(conf);
  return 0;
}

/**
 * Runs an MR job over two input families and checks that the number of
 * mappers equals {@code expectedNumOfSplits}.
 *
 * @param splitsPerRegion value stored under
 *     {@code hbase.mapreduce.tableinput.mappers.per.region}
 * @param expectedNumOfSplits expected value of the {@code SHUFFLED_MAPS} counter
 */
protected void testNumOfSplitsMR(int splitsPerRegion, int expectedNumOfSplits)
  throws IOException, InterruptedException, ClassNotFoundException {
 final String jobName = "TestJobForNumOfSplits-MR";
 LOG.info("Before map/reduce startup - job " + jobName);

 JobConf jobConf = new JobConf(TEST_UTIL.getConfiguration());

 Scan scan = new Scan();
 scan.addFamily(INPUT_FAMILYS[0]);
 scan.addFamily(INPUT_FAMILYS[1]);

 jobConf.setInt("hbase.mapreduce.tableinput.mappers.per.region", splitsPerRegion);
 jobConf.set(KEY_STARTROW, "");
 jobConf.set(KEY_LASTROW, "");

 Job job = Job.getInstance(jobConf, jobName);
 TableMapReduceUtil.initTableMapperJob(
  TABLE_NAME.getNameAsString(),
  scan,
  ScanMapper.class,
  ImmutableBytesWritable.class,
  ImmutableBytesWritable.class,
  job);
 job.setReducerClass(ScanReducer.class);
 job.setNumReduceTasks(1);
 job.setOutputFormatClass(NullOutputFormat.class);

 boolean succeeded = job.waitForCompletion(true);
 assertTrue("job failed!", succeeded);

 // for some reason, hbase does not expose JobCounter.TOTAL_LAUNCHED_MAPS,
 // we use TaskCounter.SHUFFLED_MAPS to get total launched maps
 long shuffledMaps = job.getCounters().findCounter(TaskCounter.SHUFFLED_MAPS).getValue();
 assertEquals("Saw the wrong count of mappers per region", expectedNumOfSplits, shuffledMaps);
}

/**
 * Runs the input summary helper with the input listing thread limit set to 0
 * and checks the total length, file count and directory count.
 */
@Test
public void testGetInputSummaryWithASingleThread() throws IOException {
 final int partitionCount = 5;
 final int bytesPerFile = 5;

 JobConf jobConf = new JobConf();
 Properties properties = new Properties();
 jobConf.setInt(HiveConf.ConfVars.HIVE_EXEC_INPUT_LISTING_MAX_THREADS.varname, 0);

 ContentSummary summary = runTestGetInputSummary(
   jobConf, properties, partitionCount, bytesPerFile, HiveInputFormat.class);

 assertEquals(partitionCount * bytesPerFile, summary.getLength());
 assertEquals(partitionCount, summary.getFileCount());
 assertEquals(partitionCount, summary.getDirectoryCount());
}

/**
 * Runs the input summary helper twice with a thread limit of 2: first through
 * the current listing-threads key, then through the deprecated
 * {@code mapred.dfsclient.parallelism.max} key with the current key set to 0.
 * Both runs must report the same length, file count and directory count.
 */
@Test
public void testGetInputSummaryWithMultipleThreads() throws IOException {
 final int partitionCount = 5;
 final int bytesPerFile = 5;
 final int expectedLength = partitionCount * bytesPerFile;

 JobConf jobConf = new JobConf();
 Properties properties = new Properties();

 // First run: thread limit supplied through the current key.
 jobConf.setInt(HiveConf.ConfVars.HIVE_EXEC_INPUT_LISTING_MAX_THREADS.varname, 2);
 ContentSummary summary = runTestGetInputSummary(
   jobConf, properties, partitionCount, bytesPerFile, HiveInputFormat.class);
 assertEquals(expectedLength, summary.getLength());
 assertEquals(partitionCount, summary.getFileCount());
 assertEquals(partitionCount, summary.getDirectoryCount());

 // Test deprecated mapred.dfsclient.parallelism.max
 jobConf.setInt(HiveConf.ConfVars.HIVE_EXEC_INPUT_LISTING_MAX_THREADS.varname, 0);
 jobConf.setInt(Utilities.DEPRECATED_MAPRED_DFSCLIENT_PARALLELISM_MAX, 2);
 summary = runTestGetInputSummary(
   jobConf, properties, partitionCount, bytesPerFile, HiveInputFormat.class);
 assertEquals(expectedLength, summary.getLength());
 assertEquals(partitionCount, summary.getFileCount());
 assertEquals(partitionCount, summary.getDirectoryCount());
}

/**
 * Checks that the input summary comes from the configured input estimator
 * rather than from the file system.
 *
 * <p>The files are written with twice {@code BYTES_PER_FILE} bytes while the
 * estimator reports {@code BYTES_PER_FILE}, so the expected total length only
 * matches when the estimator is used. The check runs twice: first with only the
 * deprecated {@code mapred.dfsclient.parallelism.max} key set, then with the
 * current listing-threads key set as well.
 */
@Test
public void testGetInputSummaryWithInputEstimator() throws IOException, HiveException {
 final int NUM_PARTITIONS = 5;
 final int BYTES_PER_FILE = 10;
 final int NUM_OF_ROWS = 5;
 JobConf jobConf = new JobConf();
 Properties properties = new Properties();

 // First run: thread limit supplied through the deprecated key only.
 jobConf.setInt(Utilities.DEPRECATED_MAPRED_DFSCLIENT_PARALLELISM_MAX, 2);
 properties.setProperty(hive_metastoreConstants.META_TABLE_STORAGE, InputEstimatorTestClass.class.getName());
 InputEstimatorTestClass.setEstimation(new InputEstimator.Estimation(NUM_OF_ROWS, BYTES_PER_FILE));
 /* Let's write more bytes to the files to test that Estimator is actually working returning the file size not from the filesystem */
 ContentSummary summary = runTestGetInputSummary(jobConf, properties, NUM_PARTITIONS, BYTES_PER_FILE * 2, HiveInputFormat.class);
 assertEquals(NUM_PARTITIONS * BYTES_PER_FILE, summary.getLength());
 assertEquals(NUM_PARTITIONS * -1, summary.getFileCount());        // Current getInputSummary() returns -1 for each file found
 assertEquals(NUM_PARTITIONS * -1, summary.getDirectoryCount());   // Current getInputSummary() returns -1 for each file found

 // Second run: additionally set the current listing-threads key. The deprecated
 // key keeps its value from the first run. (A preceding setInt(..., 0) on this
 // same key was immediately overwritten and has been removed.)
 jobConf.setInt(HiveConf.ConfVars.HIVE_EXEC_INPUT_LISTING_MAX_THREADS.varname, 2);
 properties.setProperty(hive_metastoreConstants.META_TABLE_STORAGE, InputEstimatorTestClass.class.getName());
 InputEstimatorTestClass.setEstimation(new InputEstimator.Estimation(NUM_OF_ROWS, BYTES_PER_FILE));
 /* Let's write more bytes to the files to test that Estimator is actually working returning the file size not from the filesystem */
 summary = runTestGetInputSummary(jobConf, properties, NUM_PARTITIONS, BYTES_PER_FILE * 2, HiveInputFormat.class);
 assertEquals(NUM_PARTITIONS * BYTES_PER_FILE, summary.getLength());
 assertEquals(NUM_PARTITIONS * -1, summary.getFileCount());        // Current getInputSummary() returns -1 for each file found
 assertEquals(NUM_PARTITIONS * -1, summary.getDirectoryCount());   // Current getInputSummary() returns -1 for each file found
}

 /**
  * Sets the default ACID operational properties on {@code conf}, runs an
  * {@code OrcInputFormat.FileGenerator} over {@code root} and returns the split
  * strategies determined for the resulting directory info.
  *
  * @return the strategies returned by {@code OrcInputFormat.determineSplitStrategies}
  * @throws Exception if generating the directory info or the strategies fails
  */
 private List<OrcInputFormat.SplitStrategy<?>> getSplitStrategies() throws Exception {
  final int defaultAcidProps = AcidUtils.AcidOperationalProperties.getDefault().toInt();
  conf.setInt(HiveConf.ConfVars.HIVE_TXN_OPERATIONAL_PROPERTIES.varname, defaultAcidProps);

  OrcInputFormat.Context context = new OrcInputFormat.Context(conf);
  OrcInputFormat.AcidDirInfo dirInfo =
    new OrcInputFormat.FileGenerator(context, fs, root, false, null).call();

  List<OrcInputFormat.SplitStrategy<?>> strategies = OrcInputFormat.determineSplitStrategies(
    null, context, dirInfo.fs, dirInfo.splitPath, dirInfo.baseFiles, dirInfo.deleteEvents,
    null, null, true);
  return strategies;
 }
}

/**
 * Runs the actual TeraSort test job through the Ignite API.
 *
 * <p>Clears the sort output directory, builds a job configuration that forces
 * the split size to {@code dataSizeBytes() / numMaps()}, submits the job to
 * grid node 0 and waits for it to finish.
 *
 * @param gzip Whether to use GZIP.
 * @throws Exception if preparing, submitting or running the job fails
 */
protected final void teraSort(boolean gzip) throws Exception {
  System.out.println("TeraSort ===============================================================");

  getFileSystem().delete(new Path(sortOutDir), true);

  final JobConf jobConf = new JobConf();

  jobConf.setUser(getUser());
  jobConf.set("fs.defaultFS", getFsBase());

  log().info("Desired number of reduces: " + numReduces());
  jobConf.set("mapreduce.job.reduces", String.valueOf(numReduces()));

  log().info("Desired number of maps: " + numMaps());

  final long splitSize = dataSizeBytes() / numMaps();
  log().info("Desired split size: " + splitSize);

  // Pin both the minimum and the maximum split size so splits have exactly the desired size.
  final String splitSizeText = String.valueOf(splitSize);
  jobConf.set("mapred.min.split.size", splitSizeText);
  jobConf.set("mapred.max.split.size", splitSizeText);

  jobConf.setBoolean(HadoopJobProperty.SHUFFLE_MAPPER_STRIPED_OUTPUT.propertyName(), true);
  jobConf.setInt(HadoopJobProperty.SHUFFLE_MSG_SIZE.propertyName(), 4096);

  if (gzip) {
    jobConf.setBoolean(HadoopJobProperty.SHUFFLE_MSG_GZIP.propertyName(), true);
  }

  jobConf.set(HadoopJobProperty.JOB_PARTIALLY_RAW_COMPARATOR.propertyName(),
    TextPartiallyRawComparator.class.getName());

  Job job = setupConfig(jobConf);

  HadoopJobId jobId = new HadoopJobId(UUID.randomUUID(), 1);

  IgniteInternalFuture<?> submission =
    grid(0).hadoop().submit(jobId, createJobInfo(job.getConfiguration(), null));

  // Block until the submitted job completes.
  submission.get();
}

/**
 * Checks that the input summary is taken from the content-summary-aware input
 * format rather than from the file system: files are written with twice
 * {@code bytesPerFile} bytes while the test input format reports
 * {@code bytesPerFile}, 2 files and 1 directory per partition.
 */
@Test
public void testGetInputSummaryWithContentSummaryInputFormat() throws IOException {
 final int partitionCount = 5;
 final int bytesPerFile = 10;

 JobConf jobConf = new JobConf();
 Properties properties = new Properties();
 jobConf.setInt(Utilities.DEPRECATED_MAPRED_DFSCLIENT_PARALLELISM_MAX, 2);

 ContentSummary reportedSummary =
   new ContentSummary.Builder().length(bytesPerFile).fileCount(2).directoryCount(1).build();
 ContentSummaryInputFormatTestClass.setContentSummary(reportedSummary);

 /* Let's write more bytes to the files to test that ContentSummaryInputFormat is actually working returning the file size not from the filesystem */
 ContentSummary summary = runTestGetInputSummary(
   jobConf, properties, partitionCount, bytesPerFile * 2, ContentSummaryInputFormatTestClass.class);

 assertEquals(partitionCount * bytesPerFile, summary.getLength());
 assertEquals(partitionCount * 2, summary.getFileCount());
 assertEquals(partitionCount, summary.getDirectoryCount());
}

Popular methods of JobConf

<init>
A new map/reduce configuration where the behavior of reading from the default resources can be turned off.
set
get
setInputFormat
Set the InputFormat implementation for the map-reduce job.
setOutputFormat
Set the OutputFormat implementation for the map-reduce job.
getInt
setMapperClass
Set the Mapper class for the job.
setOutputKeyClass
Set the key class for the job output data.
setOutputValueClass
Set the value class for job outputs.
setReducerClass
Set the Reducer class for the job.
setNumReduceTasks
Set the requisite number of reduce tasks for this job. How many reduces? The right number of reduces…
setBoolean

Popular in Java

Updating database using SQL prepared statement
getContentResolver (Context)
addToBackStack (FragmentTransaction)
getExternalFilesDir (Context)
RandomAccessFile (java.io)
Allows reading from and writing to a file in a random-access manner. This is different from the uni-
SimpleDateFormat (java.text)
Formats and parses dates in a locale-sensitive manner. Formatting turns a Date into a String, and pa
Manifest (java.util.jar)
The Manifest class is used to obtain attribute information for a JarFile and its entries.
BoxLayout (javax.swing)
JLabel (javax.swing)
BasicDataSource (org.apache.commons.dbcp)
Basic implementation of javax.sql.DataSource that is configured via JavaBeans properties. This is no
Top plugins for Android Studio

How to use the setInt method in org.apache.hadoop.mapred.JobConf

Best Java code snippets using org.apache.hadoop.mapred.JobConf.setInt (Showing top 20 results out of 585)

How to use
setInt
method
in
org.apache.hadoop.mapred.JobConf