/**
 * Configures how many rows are returned and cached on each scanner iteration.
 * Larger caching values make mapreduce jobs faster, at the price of needing
 * more heap to hold the cached rows.
 *
 * @param job The current job configuration to adjust.
 * @param batchSize The number of rows to return in batch with each scanner
 * iteration.
 */
public static void setScannerCaching(JobConf job, int batchSize) {
  final String scannerCachingKey = "hbase.client.scanner.caching";
  job.setInt(scannerCachingKey, batchSize);
}
/**
 * Records the sync interval that the underlying {@link DataFileWriter} should use.
 * The value is stored in the job configuration under {@code SYNC_INTERVAL_KEY}.
 *
 * @param job the job configuration to update
 * @param syncIntervalInBytes the sync interval to store
 */
public static void setSyncInterval(JobConf job, int syncIntervalInBytes) {
  final int interval = syncIntervalInBytes;
  job.setInt(SYNC_INTERVAL_KEY, interval);
}
/**
 * Creates the shim: remembers {@code conf}, derives a {@link JobConf} from it with a
 * single capacity-scheduler queue and fixed memory settings, and starts a
 * {@link MiniMRCluster} with that configuration.
 *
 * @param conf base configuration; stored on this instance and copied into the cluster config
 * @param numberOfTaskTrackers passed through to {@link MiniMRCluster}
 * @param nameNode passed through to {@link MiniMRCluster}
 * @param numDir passed through to {@link MiniMRCluster}
 * @throws IOException declared by this constructor; nothing in this body throws it explicitly
 */
public MiniMrShim(Configuration conf, int numberOfTaskTrackers, String nameNode, int numDir)
    throws IOException {
  this.conf = conf;

  final JobConf clusterConf = new JobConf(conf);

  // One queue named "default" that is given a capacity of 100.
  clusterConf.set("yarn.scheduler.capacity.root.queues", "default");
  clusterConf.set("yarn.scheduler.capacity.root.default.capacity", "100");

  // Values written to the *_MB memory keys below.
  final int largeMb = 512;
  final int smallMb = 128;
  clusterConf.setInt(MRJobConfig.MAP_MEMORY_MB, largeMb);
  clusterConf.setInt(MRJobConfig.REDUCE_MEMORY_MB, largeMb);
  clusterConf.setInt(MRJobConfig.MR_AM_VMEM_MB, smallMb);
  clusterConf.setInt(YarnConfiguration.YARN_MINICLUSTER_NM_PMEM_MB, largeMb);
  clusterConf.setInt(YarnConfiguration.RM_SCHEDULER_MINIMUM_ALLOCATION_MB, smallMb);
  clusterConf.setInt(YarnConfiguration.RM_SCHEDULER_MAXIMUM_ALLOCATION_MB, largeMb);

  mr = new MiniMRCluster(numberOfTaskTrackers, nameNode, numDir, null, null, clusterConf);
}
@Override public void handleEvents(List<Event> arg0) { // As of now only used for Bucket MapJoin, there is exactly one event in the list. assert arg0.size() <= 1; for (Event event : arg0) { CustomProcessorEvent cpEvent = (CustomProcessorEvent) event; ByteBuffer buffer = cpEvent.getPayload(); // Get int view of the buffer IntBuffer intBuffer = buffer.asIntBuffer(); jobConf.setInt(Constants.LLAP_NUM_BUCKETS, intBuffer.get(0)); jobConf.setInt(Constants.LLAP_BUCKET_ID, intBuffer.get(1)); } }
/**
 * Turns on output compression for the job and stores the deflate codec level
 * under {@code AvroOutputFormat.DEFLATE_LEVEL_KEY}.
 *
 * @param job the job configuration to update
 * @param level the deflate level to store
 */
public static void setDeflateLevel(JobConf job, int level) {
  final boolean compressOutput = true;
  FileOutputFormat.setCompressOutput(job, compressOutput);
  job.setInt(AvroOutputFormat.DEFLATE_LEVEL_KEY, level);
}
/**
 * Switches on output compression for the job and stores the deflate codec level
 * under {@code DEFLATE_LEVEL_KEY}.
 *
 * @param job the job configuration to update
 * @param level the deflate level to store
 */
public static void setDeflateLevel(JobConf job, int level) {
  final boolean enableCompression = true;
  FileOutputFormat.setCompressOutput(job, enableCompression);
  job.setInt(DEFLATE_LEVEL_KEY, level);
}
private void setupMRLegacyConfigs(ProcessorContext processorContext) { // Hive "insert overwrite local directory" uses task id as dir name // Setting the id in jobconf helps to have the similar dir name as MR StringBuilder taskAttemptIdBuilder = new StringBuilder("attempt_"); taskAttemptIdBuilder.append(processorContext.getApplicationId().getClusterTimestamp()) .append("_") .append(jobIdFormat.format(processorContext.getApplicationId().getId())) .append("_"); if (isMap) { taskAttemptIdBuilder.append("m_"); } else { taskAttemptIdBuilder.append("r_"); } taskAttemptIdBuilder.append(taskIdFormat.format(processorContext.getTaskIndex())) .append("_") .append(processorContext.getTaskAttemptNumber()); // In MR, mapreduce.task.attempt.id is same as mapred.task.id. Go figure. String taskAttemptIdStr = taskAttemptIdBuilder.toString(); this.jobConf.set("mapred.task.id", taskAttemptIdStr); this.jobConf.set("mapreduce.task.attempt.id", taskAttemptIdStr); this.jobConf.setInt("mapred.task.partition", processorContext.getTaskIndex()); }
/** * create the temporary output file for hadoop RecordWriter. * @param taskNumber The number of the parallel instance. * @param numTasks The number of parallel tasks. * @throws java.io.IOException */ @Override public void open(int taskNumber, int numTasks) throws IOException { // enforce sequential open() calls synchronized (OPEN_MUTEX) { if (Integer.toString(taskNumber + 1).length() > 6) { throw new IOException("Task id too large."); } TaskAttemptID taskAttemptID = TaskAttemptID.forName("attempt__0000_r_" + String.format("%" + (6 - Integer.toString(taskNumber + 1).length()) + "s", " ").replace(" ", "0") + Integer.toString(taskNumber + 1) + "_0"); this.jobConf.set("mapred.task.id", taskAttemptID.toString()); this.jobConf.setInt("mapred.task.partition", taskNumber + 1); // for hadoop 2.2 this.jobConf.set("mapreduce.task.attempt.id", taskAttemptID.toString()); this.jobConf.setInt("mapreduce.task.partition", taskNumber + 1); this.context = new TaskAttemptContextImpl(this.jobConf, taskAttemptID); this.outputCommitter = this.jobConf.getOutputCommitter(); JobContext jobContext = new JobContextImpl(this.jobConf, new JobID()); this.outputCommitter.setupJob(jobContext); this.recordWriter = this.mapredOutputFormat.getRecordWriter(null, this.jobConf, Integer.toString(taskNumber + 1), new HadoopDummyProgressable()); } }
private void setupMRLegacyConfigs(ProcessorContext processorContext) { // Hive "insert overwrite local directory" uses task id as dir name // Setting the id in jobconf helps to have the similar dir name as MR StringBuilder taskAttemptIdBuilder = new StringBuilder("attempt_"); taskAttemptIdBuilder.append(processorContext.getApplicationId().getClusterTimestamp()) .append("_") .append(jobIdFormat.format(processorContext.getApplicationId().getId())) .append("_"); if (isMap) { taskAttemptIdBuilder.append("m_"); } else { taskAttemptIdBuilder.append("r_"); } taskAttemptIdBuilder.append(taskIdFormat.format(processorContext.getTaskIndex())) .append("_") .append(processorContext.getTaskAttemptNumber()); // In MR, mapreduce.task.attempt.id is same as mapred.task.id. Go figure. String taskAttemptIdStr = taskAttemptIdBuilder.toString(); this.jobConf.set("mapred.task.id", taskAttemptIdStr); this.jobConf.set("mapreduce.task.attempt.id", taskAttemptIdStr); this.jobConf.setInt("mapred.task.partition", processorContext.getTaskIndex()); }
private void setupMRLegacyConfigs() { StringBuilder taskAttemptIdBuilder = new StringBuilder("attempt_"); taskAttemptIdBuilder.append(System.currentTimeMillis()) .append("_") .append(stageIdFormat.format(TaskContext.get().stageId())) .append("_"); if (isMap()) { taskAttemptIdBuilder.append("m_"); } else { taskAttemptIdBuilder.append("r_"); } // Hive requires this TaskAttemptId to be unique. MR's TaskAttemptId is composed // of "attempt_timestamp_jobNum_m/r_taskNum_attemptNum". The counterpart for // Spark should be "attempt_timestamp_stageNum_m/r_partitionId_attemptNum". // When there're multiple attempts for a task, Hive will rely on the partitionId // to figure out if the data are duplicate or not when collecting the final outputs // (see org.apache.hadoop.hive.ql.exec.Utils.removeTempOrDuplicateFiles) taskAttemptIdBuilder.append(taskIdFormat.format(TaskContext.get().partitionId())) .append("_").append(TaskContext.get().attemptNumber()); String taskAttemptIdStr = taskAttemptIdBuilder.toString(); jobConf.set("mapred.task.id", taskAttemptIdStr); jobConf.set("mapreduce.task.attempt.id", taskAttemptIdStr); jobConf.setInt("mapred.task.partition", TaskContext.get().partitionId()); } }
/**
 * Exercises {@link Utilities#getInputPaths(JobConf, MapWork, Path, Context, boolean)} with the
 * input-listing thread limit set to 2, i.e. with multiple threads.
 */
@Test
public void testGetInputPathsWithMultipleThreads() throws Exception {
  final int partitionCount = 5;
  final int listingThreads = 2;

  JobConf conf = new JobConf();
  conf.setInt(HiveConf.ConfVars.HIVE_EXEC_INPUT_LISTING_MAX_THREADS.varname, listingThreads);

  runTestGetInputPaths(conf, partitionCount);
}
/**
 * Exercises {@link Utilities#getInputPaths(JobConf, MapWork, Path, Context, boolean)} with the
 * input-listing thread limit set to 1, i.e. with a single thread.
 */
@Test
public void testGetInputPathsWithASingleThread() throws Exception {
  final int partitionCount = 5;
  final int listingThreads = 1;

  JobConf conf = new JobConf();
  conf.setInt(HiveConf.ConfVars.HIVE_EXEC_INPUT_LISTING_MAX_THREADS.varname, listingThreads);

  runTestGetInputPaths(conf, partitionCount);
}
public int run(String[] args) throws Exception { if(args.length != 3) Utils.croak("USAGE: GenerateData input-file output-dir value-size"); JobConf conf = new JobConf(getConf(), GenerateData.class); conf.setJobName("generate-data"); conf.setOutputKeyClass(Text.class); conf.setOutputValueClass(IntWritable.class); conf.setMapperClass(GenerateDataMapper.class); conf.setReducerClass(IdentityReducer.class); conf.setNumReduceTasks(0); conf.setInputFormat(TextInputFormat.class); conf.setOutputFormat(SequenceFileOutputFormat.class); conf.setOutputKeyClass(BytesWritable.class); conf.setOutputValueClass(BytesWritable.class); Path inputPath = new Path(args[0]); FileInputFormat.setInputPaths(conf, inputPath); Path outputPath = new Path(args[1]); // delete output path if it already exists FileSystem fs = outputPath.getFileSystem(conf); if(fs.exists(outputPath)) fs.delete(outputPath, true); FileOutputFormat.setOutputPath(conf, outputPath); conf.setInt("value.size", Integer.parseInt(args[2])); JobClient.runJob(conf); return 0; }
/** * Run MR job to check the number of mapper = expectedNumOfSplits */ protected void testNumOfSplitsMR(int splitsPerRegion, int expectedNumOfSplits) throws IOException, InterruptedException, ClassNotFoundException { String jobName = "TestJobForNumOfSplits-MR"; LOG.info("Before map/reduce startup - job " + jobName); JobConf c = new JobConf(TEST_UTIL.getConfiguration()); Scan scan = new Scan(); scan.addFamily(INPUT_FAMILYS[0]); scan.addFamily(INPUT_FAMILYS[1]); c.setInt("hbase.mapreduce.tableinput.mappers.per.region", splitsPerRegion); c.set(KEY_STARTROW, ""); c.set(KEY_LASTROW, ""); Job job = Job.getInstance(c, jobName); TableMapReduceUtil.initTableMapperJob(TABLE_NAME.getNameAsString(), scan, ScanMapper.class, ImmutableBytesWritable.class, ImmutableBytesWritable.class, job); job.setReducerClass(ScanReducer.class); job.setNumReduceTasks(1); job.setOutputFormatClass(NullOutputFormat.class); assertTrue("job failed!", job.waitForCompletion(true)); // for some reason, hbase does not expose JobCounter.TOTAL_LAUNCHED_MAPS, // we use TaskCounter.SHUFFLED_MAPS to get total launched maps assertEquals("Saw the wrong count of mappers per region", expectedNumOfSplits, job.getCounters().findCounter(TaskCounter.SHUFFLED_MAPS).getValue()); }
/**
 * Runs the input-summary helper with {@code HIVE_EXEC_INPUT_LISTING_MAX_THREADS} set to 0 and
 * checks the reported length, file count and directory count.
 */
@Test
public void testGetInputSummaryWithASingleThread() throws IOException {
  final int partitionCount = 5;
  final int bytesPerFile = 5;

  JobConf conf = new JobConf();
  Properties tableProps = new Properties();
  conf.setInt(HiveConf.ConfVars.HIVE_EXEC_INPUT_LISTING_MAX_THREADS.varname, 0);

  ContentSummary result = runTestGetInputSummary(
      conf, tableProps, partitionCount, bytesPerFile, HiveInputFormat.class);

  assertEquals(partitionCount * bytesPerFile, result.getLength());
  assertEquals(partitionCount, result.getFileCount());
  assertEquals(partitionCount, result.getDirectoryCount());
}
@Test public void testGetInputSummaryWithMultipleThreads() throws IOException { final int NUM_PARTITIONS = 5; final int BYTES_PER_FILE = 5; JobConf jobConf = new JobConf(); Properties properties = new Properties(); jobConf.setInt(HiveConf.ConfVars.HIVE_EXEC_INPUT_LISTING_MAX_THREADS.varname, 2); ContentSummary summary = runTestGetInputSummary(jobConf, properties, NUM_PARTITIONS, BYTES_PER_FILE, HiveInputFormat.class); assertEquals(NUM_PARTITIONS * BYTES_PER_FILE, summary.getLength()); assertEquals(NUM_PARTITIONS, summary.getFileCount()); assertEquals(NUM_PARTITIONS, summary.getDirectoryCount()); // Test deprecated mapred.dfsclient.parallelism.max jobConf.setInt(HiveConf.ConfVars.HIVE_EXEC_INPUT_LISTING_MAX_THREADS.varname, 0); jobConf.setInt(Utilities.DEPRECATED_MAPRED_DFSCLIENT_PARALLELISM_MAX, 2); summary = runTestGetInputSummary(jobConf, properties, NUM_PARTITIONS, BYTES_PER_FILE, HiveInputFormat.class); assertEquals(NUM_PARTITIONS * BYTES_PER_FILE, summary.getLength()); assertEquals(NUM_PARTITIONS, summary.getFileCount()); assertEquals(NUM_PARTITIONS, summary.getDirectoryCount()); }
@Test public void testGetInputSummaryWithInputEstimator() throws IOException, HiveException { final int NUM_PARTITIONS = 5; final int BYTES_PER_FILE = 10; final int NUM_OF_ROWS = 5; JobConf jobConf = new JobConf(); Properties properties = new Properties(); jobConf.setInt(Utilities.DEPRECATED_MAPRED_DFSCLIENT_PARALLELISM_MAX, 2); properties.setProperty(hive_metastoreConstants.META_TABLE_STORAGE, InputEstimatorTestClass.class.getName()); InputEstimatorTestClass.setEstimation(new InputEstimator.Estimation(NUM_OF_ROWS, BYTES_PER_FILE)); /* Let's write more bytes to the files to test that Estimator is actually working returning the file size not from the filesystem */ ContentSummary summary = runTestGetInputSummary(jobConf, properties, NUM_PARTITIONS, BYTES_PER_FILE * 2, HiveInputFormat.class); assertEquals(NUM_PARTITIONS * BYTES_PER_FILE, summary.getLength()); assertEquals(NUM_PARTITIONS * -1, summary.getFileCount()); // Current getInputSummary() returns -1 for each file found assertEquals(NUM_PARTITIONS * -1, summary.getDirectoryCount()); // Current getInputSummary() returns -1 for each file found // Test deprecated mapred.dfsclient.parallelism.max jobConf.setInt(HiveConf.ConfVars.HIVE_EXEC_INPUT_LISTING_MAX_THREADS.varname, 0); jobConf.setInt(HiveConf.ConfVars.HIVE_EXEC_INPUT_LISTING_MAX_THREADS.varname, 2); properties.setProperty(hive_metastoreConstants.META_TABLE_STORAGE, InputEstimatorTestClass.class.getName()); InputEstimatorTestClass.setEstimation(new InputEstimator.Estimation(NUM_OF_ROWS, BYTES_PER_FILE)); /* Let's write more bytes to the files to test that Estimator is actually working returning the file size not from the filesystem */ summary = runTestGetInputSummary(jobConf, properties, NUM_PARTITIONS, BYTES_PER_FILE * 2, HiveInputFormat.class); assertEquals(NUM_PARTITIONS * BYTES_PER_FILE, summary.getLength()); assertEquals(NUM_PARTITIONS * -1, summary.getFileCount()); // Current getInputSummary() returns -1 for each file found assertEquals(NUM_PARTITIONS * 
-1, summary.getDirectoryCount()); // Current getInputSummary() returns -1 for each file found }
/**
 * Sets the default ACID operational properties on {@code conf}, runs an
 * {@code OrcInputFormat.FileGenerator} over {@code root} and returns the split strategies
 * that {@code OrcInputFormat.determineSplitStrategies} derives from the resulting
 * directory info.
 *
 * @return the split strategies determined for the generated directory info
 * @throws Exception if the file generator call or strategy determination fails
 */
private List<OrcInputFormat.SplitStrategy<?>> getSplitStrategies() throws Exception {
  conf.setInt(
      HiveConf.ConfVars.HIVE_TXN_OPERATIONAL_PROPERTIES.varname,
      AcidUtils.AcidOperationalProperties.getDefault().toInt());

  final OrcInputFormat.Context ctx = new OrcInputFormat.Context(conf);
  final OrcInputFormat.FileGenerator generator =
      new OrcInputFormat.FileGenerator(ctx, fs, root, false, null);
  final OrcInputFormat.AcidDirInfo dirInfo = generator.call();

  return OrcInputFormat.determineSplitStrategies(
      null,
      ctx,
      dirInfo.fs,
      dirInfo.splitPath,
      dirInfo.baseFiles,
      dirInfo.deleteEvents,
      null,
      null,
      true);
}
}
/** * Does actual test TeraSort job Through Ignite API * * @param gzip Whether to use GZIP. */ protected final void teraSort(boolean gzip) throws Exception { System.out.println("TeraSort ==============================================================="); getFileSystem().delete(new Path(sortOutDir), true); final JobConf jobConf = new JobConf(); jobConf.setUser(getUser()); jobConf.set("fs.defaultFS", getFsBase()); log().info("Desired number of reduces: " + numReduces()); jobConf.set("mapreduce.job.reduces", String.valueOf(numReduces())); log().info("Desired number of maps: " + numMaps()); final long splitSize = dataSizeBytes() / numMaps(); log().info("Desired split size: " + splitSize); // Force the split to be of the desired size: jobConf.set("mapred.min.split.size", String.valueOf(splitSize)); jobConf.set("mapred.max.split.size", String.valueOf(splitSize)); jobConf.setBoolean(HadoopJobProperty.SHUFFLE_MAPPER_STRIPED_OUTPUT.propertyName(), true); jobConf.setInt(HadoopJobProperty.SHUFFLE_MSG_SIZE.propertyName(), 4096); if (gzip) jobConf.setBoolean(HadoopJobProperty.SHUFFLE_MSG_GZIP.propertyName(), true); jobConf.set(HadoopJobProperty.JOB_PARTIALLY_RAW_COMPARATOR.propertyName(), TextPartiallyRawComparator.class.getName()); Job job = setupConfig(jobConf); HadoopJobId jobId = new HadoopJobId(UUID.randomUUID(), 1); IgniteInternalFuture<?> fut = grid(0).hadoop().submit(jobId, createJobInfo(job.getConfiguration(), null)); fut.get(); }
/**
 * Checks that the input summary is taken from the content-summary input format rather than
 * from the file system: the test input format reports a fixed summary per partition while
 * the files are written with twice as many bytes.
 */
@Test
public void testGetInputSummaryWithContentSummaryInputFormat() throws IOException {
  final int partitionCount = 5;
  final int bytesPerFile = 10;

  JobConf conf = new JobConf();
  Properties tableProps = new Properties();

  conf.setInt(Utilities.DEPRECATED_MAPRED_DFSCLIENT_PARALLELISM_MAX, 2);

  ContentSummary fixedSummary = new ContentSummary.Builder()
      .length(bytesPerFile)
      .fileCount(2)
      .directoryCount(1)
      .build();
  ContentSummaryInputFormatTestClass.setContentSummary(fixedSummary);

  /* Let's write more bytes to the files to test that ContentSummaryInputFormat is actually
     working returning the file size not from the filesystem */
  ContentSummary result = runTestGetInputSummary(
      conf, tableProps, partitionCount, bytesPerFile * 2,
      ContentSummaryInputFormatTestClass.class);

  assertEquals(partitionCount * bytesPerFile, result.getLength());
  assertEquals(partitionCount * 2, result.getFileCount());
  assertEquals(partitionCount, result.getDirectoryCount());
}