public int run(String[] args) throws Exception { if(args.length != 3) Utils.croak("USAGE: GenerateData input-file output-dir value-size"); JobConf conf = new JobConf(getConf(), GenerateData.class); conf.setJobName("generate-data"); conf.setOutputKeyClass(Text.class); conf.setOutputValueClass(IntWritable.class); conf.setMapperClass(GenerateDataMapper.class); conf.setReducerClass(IdentityReducer.class); conf.setNumReduceTasks(0); conf.setInputFormat(TextInputFormat.class); conf.setOutputFormat(SequenceFileOutputFormat.class); conf.setOutputKeyClass(BytesWritable.class); conf.setOutputValueClass(BytesWritable.class); Path inputPath = new Path(args[0]); FileInputFormat.setInputPaths(conf, inputPath); Path outputPath = new Path(args[1]); // delete output path if it already exists FileSystem fs = outputPath.getFileSystem(conf); if(fs.exists(outputPath)) fs.delete(outputPath, true); FileOutputFormat.setOutputPath(conf, outputPath); conf.setInt("value.size", Integer.parseInt(args[2])); JobClient.runJob(conf); return 0; }
/** * Add a {@link Path} with a custom {@link Schema} to the list of * inputs for the map-reduce job. * * @param conf The configuration of the job * @param path {@link Path} to be added to the list of inputs for the job * @param inputSchema {@link Schema} class to use for this path */ private static void addInputPath(JobConf conf, Path path, Schema inputSchema) { String schemaMapping = path.toString() + ";" + toBase64(inputSchema.toString()); String schemas = conf.get(schemaKey); conf.set(schemaKey, schemas == null ? schemaMapping : schemas + "," + schemaMapping); conf.setInputFormat(DelegatingInputFormat.class); }
public void configure(JobConf conf) { this.cluster = new ClusterMapper().readCluster(new StringReader(conf.get("cluster.xml"))); List<StoreDefinition> storeDefs = new StoreDefinitionsMapper().readStoreList(new StringReader(conf.get("stores.xml"))); if(storeDefs.size() != 1) throw new IllegalStateException("Expected to find only a single store, but found multiple!"); this.storeDef = storeDefs.get(0); this.numChunks = conf.getInt(NUM_CHUNKS, -1); if(this.numChunks < 1) { // A bit of defensive code for good measure, but should never happen anymore, now that the config cannot // be overridden by the user. throw new VoldemortException(NUM_CHUNKS + " not specified in the MapReduce JobConf (should NEVER happen)"); } this.saveKeys = conf.getBoolean(VoldemortBuildAndPushJob.SAVE_KEYS, true); this.reducerPerBucket = conf.getBoolean(VoldemortBuildAndPushJob.REDUCER_PER_BUCKET, true); this.buildPrimaryReplicasOnly = conf.getBoolean(VoldemortBuildAndPushJob.BUILD_PRIMARY_REPLICAS_ONLY, false); if (buildPrimaryReplicasOnly && !saveKeys) { throw new IllegalStateException(VoldemortBuildAndPushJob.BUILD_PRIMARY_REPLICAS_ONLY + " can only be true if " + VoldemortBuildAndPushJob.SAVE_KEYS + " is also true."); } }
public static JobConf createConfForMmOriginalsSplit( JobConf conf, List<Path> dirsWithFileOriginals) { JobConf nonRecConf = new JobConf(conf); FileInputFormat.setInputPaths(nonRecConf, dirsWithFileOriginals.toArray(new Path[dirsWithFileOriginals.size()])); nonRecConf.setBoolean(FileInputFormat.INPUT_DIR_RECURSIVE, false); nonRecConf.setBoolean("mapred.input.dir.recursive", false); // TODO: change to FileInputFormat.... field after MAPREDUCE-7086. nonRecConf.setBoolean("mapreduce.input.fileinputformat.input.dir.nonrecursive.ignore.subdirs", true); return nonRecConf; }
private static void configureAvroOutput(JobConf job) { if (job.get("mapred.output.format.class") == null) job.setOutputFormat(AvroOutputFormat.class); if (job.getReducerClass() == IdentityReducer.class) job.setReducerClass(HadoopReducer.class); job.setOutputKeyClass(AvroWrapper.class); configureAvroShuffle(job); }
@Test public void testJob() throws Exception { JobConf job = new JobConf(); Path outputPath = new Path(DIR.getRoot().getPath() + "/out"); outputPath.getFileSystem(job).delete(outputPath); job.setInputFormat(TextInputFormat.class); FileInputFormat.setInputPaths(job, DIR.getRoot().getPath() + "/in"); job.setMapperClass(AvroTestConverter.class); job.setNumReduceTasks(0); FileOutputFormat.setOutputPath(job, outputPath); System.out.println(createSchema()); AvroJob.setOutputSchema(job, Pair.getPairSchema(Schema.create(Schema.Type.LONG), createSchema())); job.setOutputFormat(AvroOutputFormat.class); JobClient.runJob(job); } }
conf.setInt("io.file.buffer.size", DEFAULT_BUFFER_SIZE); conf.set("cluster.xml", new ClusterMapper().writeCluster(cluster)); conf.set("stores.xml", new StoreDefinitionsMapper().writeStoreList(Collections.singletonList(storeDef))); conf.setBoolean(VoldemortBuildAndPushJob.SAVE_KEYS, saveKeys); conf.setBoolean(VoldemortBuildAndPushJob.REDUCER_PER_BUCKET, reducerPerBucket); conf.setBoolean(VoldemortBuildAndPushJob.BUILD_PRIMARY_REPLICAS_ONLY, buildPrimaryReplicasOnly); if(!isAvro) { conf.setPartitionerClass(HadoopStoreBuilderPartitioner.class); conf.setMapperClass(mapperClass); conf.setMapOutputKeyClass(BytesWritable.class); conf.setMapOutputValueClass(BytesWritable.class); conf.setReducerClass(HadoopStoreBuilderReducer.class); conf.setInputFormat(inputFormatClass); conf.setOutputFormat(SequenceFileOutputFormat.class); conf.setOutputKeyClass(BytesWritable.class); conf.setOutputValueClass(BytesWritable.class); conf.setJarByClass(getClass()); conf.setReduceSpeculativeExecution(false); FileInputFormat.setInputPaths(conf, inputPath); conf.set("final.output.dir", outputDir.toString()); conf.set(VoldemortBuildAndPushJob.CHECKSUM_TYPE, CheckSum.toString(checkSumType)); conf.set("dfs.umaskmode", "002"); FileOutputFormat.setOutputPath(conf, tempDir); conf.setInt(AbstractStoreBuilderConfigurable.NUM_CHUNKS, numChunks); conf.setNumReduceTasks(numReducers);
@Override public void abortJob(JobContext context, int status) throws IOException { JobConf conf = ShimLoader.getHadoopShims().getJobConf(context); Path tmpLocation = new Path(conf.get(TMP_LOCATION)); FileSystem fs = tmpLocation.getFileSystem(conf); LOG.debug("Removing " + tmpLocation.toString()); fs.delete(tmpLocation, true); } }
@Test /** * Run the identity job on a "bytes" Avro file using AvroAsTextInputFormat * and AvroTextOutputFormat to produce a sorted "bytes" Avro file. */ public void testSort() throws Exception { JobConf job = new JobConf(); String inputPath = INPUT_DIR.getRoot().getPath(); Path outputPath = new Path(OUTPUT_DIR.getRoot().getPath()); outputPath.getFileSystem(job).delete(outputPath); WordCountUtil.writeLinesBytesFile(inputPath); job.setInputFormat(AvroAsTextInputFormat.class); job.setOutputFormat(AvroTextOutputFormat.class); job.setOutputKeyClass(Text.class); FileInputFormat.setInputPaths(job, new Path(inputPath)); FileOutputFormat.setOutputPath(job, outputPath); JobClient.runJob(job); WordCountUtil.validateSortedFile(outputPath.toString() + "/part-00000.avro"); }
reader = new BufferedReader(new InputStreamReader(xlearningProcess.getInputStream())); List<OutputInfo> outputs = Arrays.asList(amClient.getOutputLocation()); JobConf jobConf = new JobConf(conf); jobConf.setOutputKeyClass(Text.class); jobConf.setOutputValueClass(Text.class); jobConf.setBoolean("mapred.output.compress", true); jobConf.set("mapred.output.compression.codec", "org.apache.hadoop.io.compress.GzipCodec"); jobConf.setOutputFormat(TextMultiOutputFormat.class); Path remotePath = new Path(outputs.get(0).getDfsLocation() + "/_temporary/" + containerId.toString()); FileSystem dfs = remotePath.getFileSystem(jobConf); jobConf.set(XLearningConstants.STREAM_OUTPUT_DIR, remotePath.makeQualified(dfs).toString()); OutputFormat outputFormat = ReflectionUtils.newInstance(conf.getClass(XLearningConfiguration.XLEARNING_OUTPUTFORMAT_CLASS, XLearningConfiguration.DEFAULT_XLEARNING_OUTPUTF0RMAT_CLASS, OutputFormat.class), jobConf); JobID jobID = new JobID(new SimpleDateFormat("yyyyMMddHHmm").format(new Date()), 0); TaskAttemptID taId = new TaskAttemptID(new TaskID(jobID, true, 0), 0); jobConf.set("mapred.tip.id", taId.getTaskID().toString()); jobConf.set("mapred.task.id", taId.toString()); jobConf.set("mapred.job.id", jobID.toString()); amClient.reportMapedTaskID(containerId, taId.toString()); RecordWriter writer = outputFormat.getRecordWriter(dfs, jobConf, "part-r", Reporter.NULL); dfs.close(); } catch (Exception e) { LOG.warn("Exception in thread stdoutRedirectThread");
job.setInputFormat(work.getInputformatClass()); job.setOutputFormat(HiveOutputFormatImpl.class); job.setMapperClass(MergeFileMapper.class); job.setMapOutputKeyClass(NullWritable.class); job.setMapOutputValueClass(NullWritable.class); job.setOutputKeyClass(NullWritable.class); job.setOutputValueClass(NullWritable.class); job.setNumReduceTasks(0); FileSystem fs = tempOutPath.getFileSystem(job); if (!fs.exists(tempOutPath)) { fs.mkdirs(tempOutPath); boolean noName = StringUtils.isEmpty(job.get(MRJobConfig.JOB_NAME)); job.set(MRJobConfig.JOB_NAME, jobName != null ? jobName : "JOB" + Utilities.randGen.nextInt()); JobClient jc = new JobClient(job); job.set("tmpjars", addedJars); rj = jc.submitJob(job); this.jobID = rj.getJobID(); returnVal = jobExecHelper.progress(rj, jc, ctx);
@Test public void testJob() throws Exception { JobConf job = new JobConf(); Path inputPath1 = new Path(INPUT_DIR_1.getRoot().getPath()); Path inputPath2 = new Path(INPUT_DIR_2.getRoot().getPath()); Path outputPath = new Path(OUTPUT_DIR.getRoot().getPath()); outputPath.getFileSystem(job).delete(outputPath); writeNamesFiles(new File(inputPath1.toUri().getPath())); writeBalancesFiles(new File(inputPath2.toUri().getPath())); job.setJobName("multiple-inputs-join"); AvroMultipleInputs.addInputPath(job, inputPath1, NamesMapImpl.class, ReflectData.get().getSchema(NamesRecord.class)); AvroMultipleInputs.addInputPath(job, inputPath2, BalancesMapImpl.class, ReflectData.get().getSchema(BalancesRecord.class)); Schema keySchema = ReflectData.get().getSchema(KeyRecord.class); Schema valueSchema = ReflectData.get().getSchema(JoinableRecord.class); AvroJob.setMapOutputSchema(job, Pair.getPairSchema(keySchema, valueSchema)); AvroJob.setOutputSchema(job, ReflectData.get().getSchema(CompleteRecord.class)); AvroJob.setReducerClass(job, ReduceImpl.class); job.setNumReduceTasks(1); FileOutputFormat.setOutputPath(job, outputPath); AvroJob.setReflect(job); JobClient.runJob(job); validateCompleteFile(new File(OUTPUT_DIR.getRoot(), "part-00000.avro")); }
private static FileSinkOperator.RecordWriter createOrcRecordWriter(File outputFile, Format format, CompressionKind compression, ObjectInspector columnObjectInspector) throws IOException { JobConf jobConf = new JobConf(); jobConf.set("hive.exec.orc.write.format", format == ORC_12 ? "0.12" : "0.11"); jobConf.set("hive.exec.orc.default.compress", compression.name()); Properties tableProperties = new Properties(); tableProperties.setProperty("columns", "test"); tableProperties.setProperty("columns.types", columnObjectInspector.getTypeName()); tableProperties.setProperty("orc.stripe.size", "1200000"); return new OrcOutputFormat().getHiveRecordWriter( jobConf, new Path(outputFile.toURI()), Text.class, compression != NONE, tableProperties, () -> {}); }
public LogRetriever(String statusDir, JobType jobType, Configuration conf) throws IOException { this.statusDir = statusDir; this.jobType = jobType; attemptDetailPattern = Pattern.compile(attemptDetailPatternInString); attemptLogPattern = Pattern.compile(attemptLogPatternInString); attemptIDPattern = Pattern.compile(attemptIDPatternInString); attemptStartTimePattern = Pattern.compile(attemptStartTimePatternInString); attemptEndTimePattern = Pattern.compile(attemptEndTimePatternInString); Path statusPath = new Path(statusDir); fs = statusPath.getFileSystem(conf); jobClient = new JobClient(new JobConf(conf)); this.conf = conf; }
/** * On Tez we're not creating dummy files when getting/setting input paths. * We let Tez handle the situation. We're also setting the paths in the AM * so we don't want to depend on scratch dir and context. */ public static List<Path> getInputPathsTez(JobConf job, MapWork work) throws Exception { String scratchDir = job.get(DagUtils.TEZ_TMP_DIR_KEY); List<Path> paths = getInputPaths(job, work, new Path(scratchDir), null, true); return paths; }
TupleDomain<HiveColumnHandle> effectivePredicate = (TupleDomain<HiveColumnHandle>) compactEffectivePredicate; Path path = new Path(getPartitionLocation(table, partition.getPartition())); Configuration configuration = hdfsEnvironment.getConfiguration(hdfsContext, path); InputFormat<?, ?> inputFormat = getInputFormat(configuration, schema, false); FileSystem fs = hdfsEnvironment.getFileSystem(hdfsContext, path); boolean s3SelectPushdownEnabled = shouldEnablePushdownForTable(session, table, path.toString(), partition.getPartition()); JobConf targetJob = toJobConf(targetFilesystem.getConf()); targetJob.setInputFormat(TextInputFormat.class); targetInputFormat.configure(targetJob); FileInputFormat.setInputPaths(targetJob, targetPath); InputSplit[] targetSplits = targetInputFormat.getSplits(targetJob, 0); FileInputFormat.setInputPaths(jobConf, path); InputSplit[] splits = inputFormat.getSplits(jobConf, 0);
@Override protected void runJob(String jobName, Configuration c, List<Scan> scans) throws IOException, InterruptedException, ClassNotFoundException { JobConf job = new JobConf(TEST_UTIL.getConfiguration()); job.setJobName(jobName); job.setMapperClass(Mapper.class); job.setReducerClass(Reducer.class); TableMapReduceUtil.initMultiTableSnapshotMapperJob(getSnapshotScanMapping(scans), Mapper.class, ImmutableBytesWritable.class, ImmutableBytesWritable.class, job, true, restoreDir); TableMapReduceUtil.addDependencyJars(job); job.setReducerClass(Reducer.class); job.setNumReduceTasks(1); // one to get final "first" and "last" key FileOutputFormat.setOutputPath(job, new Path(job.getJobName())); LOG.info("Started " + job.getJobName()); RunningJob runningJob = JobClient.runJob(job); runningJob.waitForCompletion(); assertTrue(runningJob.isSuccessful()); LOG.info("After map/reduce completion - job " + jobName); }
/** * Gets fully configured JobConf instance. * * @param input input file name. * @param output output directory name. * @return Job configuration */ public static JobConf getJob(String input, String output) { JobConf conf = new JobConf(HadoopWordCount1.class); conf.setJobName("wordcount"); conf.setOutputKeyClass(Text.class); conf.setOutputValueClass(IntWritable.class); setTasksClasses(conf, true, true, true); FileInputFormat.setInputPaths(conf, new Path(input)); FileOutputFormat.setOutputPath(conf, new Path(output)); return conf; }