@Test
public void testAvroSpecificOutput() throws Exception {
  Job job = new Job();
  FileInputFormat.setInputPaths(job, new Path(getClass()
      .getResource("/org/apache/avro/mapreduce/mapreduce-test-input.txt")
      .toURI().toString()));
  job.setInputFormatClass(TextInputFormat.class);
  job.setMapperClass(LineCountMapper.class);
  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(IntWritable.class);
  job.setOutputFormatClass(AvroKeyOutputFormat.class);
  Path outputPath = new Path(DIR.getRoot().getPath() + "/testAvroSpecificOutput");
  outputPath.getFileSystem(job.getConfiguration()).delete(outputPath);
  FileOutputFormat.setOutputPath(job, outputPath);
  Assert.assertTrue(job.waitForCompletion(true));
  FileSystem fileSystem = FileSystem.get(job.getConfiguration());
  FileStatus[] outputFiles = fileSystem.globStatus(outputPath.suffix("/myavro3-*"));
  Assert.assertEquals(1, outputFiles.length);
  Map<String, Integer> counts = new HashMap<>();
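This test clears the output directory with the deprecated single-argument FileSystem.delete. A minimal sketch of the guarded recursive-delete idiom that several of the other results below use; the class and method names here are illustrative, not part of any snippet:

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public final class OutputDirs {
  private OutputDirs() {}

  /** Deletes the job output directory recursively if it already exists. */
  public static void clean(Configuration conf, Path outputPath) throws IOException {
    FileSystem fs = outputPath.getFileSystem(conf);
    if (fs.exists(outputPath)) {
      fs.delete(outputPath, true); // 'true' = recursive
    }
  }
}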
this.isSpeculativeEnabled = isSpeculativeExecutionEnabled(
    HadoopUtils.getStateFromConf(context.getConfiguration()).getProperties());
this.fs = FileSystem.get(context.getConfiguration());
this.taskStateStore = new FsStateStore<>(this.fs,
    FileOutputFormat.getOutputPath(context).toUri().getPath(), TaskState.class);
String jobStateFileName = context.getConfiguration().get(ConfigurationKeys.JOB_STATE_DISTRIBUTED_CACHE_NAME);
boolean foundStateFile = false;
for (Path dcPath : DistributedCache.getLocalCacheFiles(context.getConfiguration())) {
  if (dcPath.getName().equals(jobStateFileName)) {
    SerializationUtils.deserializeStateFromInputStream(
        closer.register(new FileInputStream(dcPath.toUri().getPath())), this.jobState);
    foundStateFile = true;
    break;
  }
}
// ...
configuration.set(entry.getKey(), entry.getValue().unwrapped().toString());
// ...
    configuration.get(ConfigurationKeys.METRICS_ENABLED_KEY,
        ConfigurationKeys.DEFAULT_METRICS_ENABLED))) {
  this.jobMetrics = Optional.of(JobMetrics.get(this.jobState));
  this.jobMetrics.get()
parseOptions(options, args);
job = Job.getInstance(getConf(), getOptionValue(OPTION_JOB_NAME));
String cubeName = getOptionValue(OPTION_CUBE_NAME);
Path output = new Path(getOptionValue(OPTION_OUTPUT_PATH));
CubeInstance cube = cubeMgr.getCube(cubeName);
job.getConfiguration().set(BatchConstants.CFG_CUBE_NAME, cubeName);
job.getConfiguration().set(BatchConstants.CFG_CUBE_SEGMENT_ID, segmentId);
logger.info("Starting: " + job.getJobName());
job.getConfiguration().addResource(new Path(jobEngineConfig.getHadoopJobConfFilePath(null)));
KafkaConsumerProperties kafkaConsumerProperties = KafkaConsumerProperties.getInstanceFromEnv();
job.getConfiguration().addResource(new Path(kafkaConsumerProperties.getKafkaConsumerHadoopJobConf()));
appendKafkaOverrideProperties(cube.getConfig(), job.getConfiguration());
job.getConfiguration().set(CONFIG_KAFKA_BROKERS, brokers);
setupMapper(cube.getSegmentById(segmentId));
job.setNumReduceTasks(0);
FileOutputFormat.setOutputPath(job, output);
FileOutputFormat.setCompressOutput(job, true);
org.apache.log4j.Logger.getRootLogger().info("Output hdfs location: " + output);
org.apache.log4j.Logger.getRootLogger().info("Output hdfs compression: " + true);
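The snippet above enables output compression but leaves the codec to the cluster default. A minimal sketch of pinning the codec explicitly; the SnappyCodec choice and the helper class are assumptions for illustration, not part of the snippet:

import org.apache.hadoop.io.compress.SnappyCodec;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

class CompressionSetup {
  /** Compresses job output and pins the codec rather than relying on
   *  mapreduce.output.fileoutputformat.compress.codec from the cluster config. */
  static void enableSnappyOutput(Job job) {
    FileOutputFormat.setCompressOutput(job, true);
    FileOutputFormat.setOutputCompressorClass(job, SnappyCodec.class);
  }
}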
/**
 * Constructor.
 * @param context the TaskAttemptContext supplying the writer with information from the job configuration
 */
public AvroTrevniRecordWriterBase(TaskAttemptContext context) throws IOException {
  schema = initSchema(context);
  meta = filterMetadata(context.getConfiguration());
  writer = new AvroColumnWriter<>(schema, meta, ReflectData.get());

  Path outputPath = FileOutputFormat.getOutputPath(context);
  String dir = FileOutputFormat.getUniqueFile(context, "part", "");
  dirPath = new Path(outputPath.toString() + "/" + dir);
  fs = dirPath.getFileSystem(context.getConfiguration());
  fs.mkdirs(dirPath);

  blockSize = fs.getDefaultBlockSize();
}
public void checkOutputFormat() throws Exception {
  Job job = new Job();
  WordCountUtil wordCountUtil = new WordCountUtil("trevniMapReduceKeyTest", "part-r-00000");
  wordCountUtil.writeLinesFile();
  AvroJob.setInputKeySchema(job, STRING);
  AvroJob.setOutputKeySchema(job, Pair.getPairSchema(STRING, LONG));
  job.setMapperClass(WordCountMapper.class);
  job.setReducerClass(WordCountReducer.class);
  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(LongWritable.class);
  FileInputFormat.setInputPaths(job, new Path(wordCountUtil.getDir().toString() + "/in"));
  FileOutputFormat.setOutputPath(job, new Path(wordCountUtil.getDir().toString() + "/out"));
  FileOutputFormat.setCompressOutput(job, true);
  job.setInputFormatClass(AvroKeyInputFormat.class);
  job.setOutputFormatClass(AvroTrevniKeyOutputFormat.class);
  job.waitForCompletion(true);
  wordCountUtil.validateCountsFile();
}
LOGGER.info("Starting {}", getClass().getSimpleName()); if (fs.exists(outputDir)) { LOGGER.warn("Found the output folder {}, deleting it", _outputDir); fs.delete(outputDir, true); fs.mkdirs(outputDir); Job job = Job.getInstance(getConf()); job.setJarByClass(SegmentCreationJob.class); job.setJobName(_jobName); job.getConfiguration().set("mapreduce.job.credentials.binary", System.getenv("HADOOP_TOKEN_FILE_LOCATION")); job.setMapOutputValueClass(Text.class); FileInputFormat.addInputPath(job, new Path(_stagingDir + "/input/")); FileOutputFormat.setOutputPath(job, new Path(_stagingDir + "/output/")); job.getConfiguration().setInt(JobContext.NUM_MAPS, inputDataFiles.size()); if (_dataSchema != null) { job.getConfiguration().set(JobConfigConstants.SCHEMA, _dataSchema.toString()); setOutputPath(job.getConfiguration());
/**
 * Prepare job with mappers to cancel.
 * @return Fully configured job.
 * @throws Exception If fails.
 */
private Configuration prepareJobForCancelling() throws Exception {
  prepareFile("/testFile", 1500);
  executedTasks.set(0);
  cancelledTasks.set(0);
  failMapperId.set(0);
  splitsCount.set(0);
  Configuration cfg = new Configuration();
  setupFileSystems(cfg);
  Job job = Job.getInstance(cfg);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(IntWritable.class);
  job.setMapperClass(CancellingTestMapper.class);
  job.setNumReduceTasks(0);
  job.setInputFormatClass(InFormat.class);
  FileInputFormat.setInputPaths(job, new Path("igfs://" + igfsName + "@/"));
  FileOutputFormat.setOutputPath(job, new Path("igfs://" + igfsName + "@/output/"));
  job.setJarByClass(getClass());
  return job.getConfiguration();
}
job.setMapperClass(CellSortImporter.class);
job.setReducerClass(CellReducer.class);
Path outputDir = new Path(hfileOutPath);
FileOutputFormat.setOutputPath(job, outputDir);
job.setMapOutputKeyClass(CellWritableComparable.class);
job.setMapOutputValueClass(MapReduceExtendedCell.class);
// ...
    RawComparator.class);
Path partitionsPath = new Path(TotalOrderPartitioner.getPartitionFile(job.getConfiguration()));
FileSystem fs = FileSystem.get(job.getConfiguration());
fs.deleteOnExit(partitionsPath);
job.setPartitionerClass(CellWritableComparablePartitioner.class);
job.setNumReduceTasks(regionLocator.getStartKeys().length);
// ...
job.setReducerClass(CellSortReducer.class);
Path outputDir = new Path(hfileOutPath);
FileOutputFormat.setOutputPath(job, outputDir);
job.setMapOutputKeyClass(ImmutableBytesWritable.class);
job.setMapOutputValueClass(MapReduceExtendedCell.class);
@Test conf.set("hive.io.file.read.all.columns", "false"); conf.set("hive.io.file.readcolumn.ids", "1,3"); Job job = new Job(conf, "orc test"); job.setInputFormatClass(OrcNewInputFormat.class); job.setJarByClass(TestNewInputOutputFormat.class); job.setMapperClass(OrcTestMapper1.class); job.setNumReduceTasks(0); job.setOutputKeyClass(Text.class); job.setOutputValueClass(IntWritable.class); FileInputFormat.addInputPath(job, new Path(HiveTestUtils .getFileFromClasspath("orc-file-11-format.orc"))); Path outputPath = new Path(workDir, "TestOrcFile." + testCaseName.getMethodName() + ".txt"); localFs.delete(outputPath, true); FileOutputFormat.setOutputPath(job, outputPath); boolean result = job.waitForCompletion(true); assertTrue(result); Path outputFilePath = new Path(outputPath, "part-m-00000"); new InputStreamReader(localFs.open(outputFilePath))); String line=reader.readLine(); "null, null, null, null, null, null, null}"); localFs.delete(outputPath, true);
private void runLinkedListMRJob(int iteration) throws Exception {
  String jobName = IntegrationTestBulkLoad.class.getSimpleName() + " - "
      + EnvironmentEdgeManager.currentTime();
  Configuration conf = new Configuration(util.getConfiguration());
  Path p = null;
  if (conf.get(ImportTsv.BULK_OUTPUT_CONF_KEY) == null) {
    p = util.getDataTestDirOnTestFS(getTablename() + "-" + iteration);
  } else {
    p = new Path(conf.get(ImportTsv.BULK_OUTPUT_CONF_KEY));
  }
  conf.setInt(ROUND_NUM_KEY, iteration);
  Job job = new Job(conf);
  job.setJobName(jobName);
  job.setInputFormatClass(ITBulkLoadInputFormat.class);
  FileOutputFormat.setOutputPath(job, p);
  try (Connection conn = ConnectionFactory.createConnection(conf);
      Admin admin = conn.getAdmin()) {
    // ...
  }
  util.getTestFileSystem().delete(p, true);
Configuration conf = new Configuration();
String numMaps = new GenericOptionsParser(conf, args).getRemainingArgs()[0];
conf.set(MRJobConfig.NUM_MAPS, numMaps);
createHdfsFilesystem(conf);
Job job = Job.getInstance(conf, "MapReduceIntegrationChecker");
job.setJarByClass(MapReduceIntegrationChecker.class);
job.setMapperClass(CheckerMapper.class);
job.setCombinerClass(CheckerReducer.class);
job.setReducerClass(CheckerReducer.class);
job.setOutputValueClass(Text.class);
job.setInputFormatClass(EmptyInputFormat.class);
FileOutputFormat.setOutputPath(job, mOutputFilePath);
// ...
      : (resultStatus.equals(Status.FAIL_TO_FIND_CLASS) ? 2 : 1);
} finally {
  if (mFileSystem.exists(mOutputFilePath)) {
    mFileSystem.delete(mOutputFilePath, true);
  }
  mFileSystem.close();
}
job = Job.getInstance(getConf(), getOptionValue(OPTION_JOB_NAME));
logger.info("Starting: " + job.getJobName());
attachSegmentMetadataWithAll(cubeSeg, job.getConfiguration());
job.getConfiguration().set(BatchConstants.CFG_CUBE_NAME, cubeName);
job.getConfiguration().set(BatchConstants.CFG_CUBE_SEGMENT_ID, segmentID);
job.getConfiguration().set(BatchConstants.CFG_CUBOID_MODE, cuboidModeName);
job.getConfiguration().set(BatchConstants.CFG_UPDATE_SHARD, ifNeedUpdateBaseCuboidShard);
FileInputFormat.setInputPaths(job, new Path(input));
job.setInputFormatClass(SequenceFileInputFormat.class);
job.setOutputValueClass(Text.class);
Path outputPath = new Path(output);
FileOutputFormat.setOutputPath(job, outputPath);
HadoopUtil.deletePath(job.getConfiguration(), outputPath);
/**
 * Sets up the actual job.
 *
 * @param conf The current configuration.
 * @param args The command line parameters.
 * @return The newly created job.
 * @throws IOException When setting up the job fails.
 */
public static Job createSubmittableJob(Configuration conf, String[] args) throws IOException {
  String tableName = args[0];
  Path outputDir = new Path(args[1]);
  String reportSeparatorString = (args.length > 2) ? args[2] : ":";
  conf.set("ReportSeparator", reportSeparatorString);
  Job job = Job.getInstance(conf, conf.get(JOB_NAME_CONF_KEY, NAME + "_" + tableName));
  job.setJarByClass(CellCounter.class);
  Scan scan = getConfiguredScanForJob(conf, args);
  TableMapReduceUtil.initTableMapperJob(tableName, scan,
      CellCounterMapper.class, ImmutableBytesWritable.class, Result.class, job);
  job.setNumReduceTasks(1);
  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(IntWritable.class);
  job.setOutputFormatClass(TextOutputFormat.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(IntWritable.class);
  FileOutputFormat.setOutputPath(job, outputDir);
  job.setReducerClass(IntSumReducer.class);
  return job;
}
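A factory method like this is typically launched from a small driver. A minimal sketch; the standalone driver class is an assumption for illustration (the real CellCounter runs through ToolRunner):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hbase.HBaseConfiguration;
import org.apache.hadoop.mapreduce.Job;

public class CellCounterDriver {
  public static void main(String[] args) throws Exception {
    Configuration conf = HBaseConfiguration.create();
    // Build the fully configured job, run it, and map success to exit code 0.
    Job job = CellCounter.createSubmittableJob(conf, args);
    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}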
/**
 * @throws Exception If failed.
 */
@Test
public void testSimpleTaskSubmit() throws Exception {
  String testInputFile = "/test";
  prepareTestFile(testInputFile);
  Configuration cfg = new Configuration();
  setupFileSystems(cfg);
  Job job = Job.getInstance(cfg);
  job.setMapperClass(TestMapper.class);
  job.setCombinerClass(TestReducer.class);
  job.setReducerClass(TestReducer.class);
  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(IntWritable.class);
  job.setOutputKeyClass(Text.class);
  job.setOutputValueClass(IntWritable.class);
  job.setNumReduceTasks(1);
  FileInputFormat.setInputPaths(job, new Path("igfs://:" + getTestIgniteInstanceName(0) + "@/" + testInputFile));
  FileOutputFormat.setOutputPath(job, new Path("igfs://:" + getTestIgniteInstanceName(0) + "@/output"));
  job.setJarByClass(getClass());
  IgniteInternalFuture<?> fut = grid(0).hadoop().submit(new HadoopJobId(UUID.randomUUID(), 1),
      createJobInfo(job.getConfiguration(), null));
  fut.get();
}
groupByJob = Job.getInstance(
    new Configuration(),
    StringUtils.format("%s-determine_partitions_groupby-%s", config.getDataSource(), config.getIntervals())
);
config.addJobProperties(groupByJob);
groupByJob.setMapperClass(DeterminePartitionsGroupByMapper.class);
groupByJob.setMapOutputKeyClass(BytesWritable.class);
groupByJob.setMapOutputValueClass(NullWritable.class);
groupByJob.setCombinerClass(DeterminePartitionsGroupByReducer.class);
FileOutputFormat.setOutputPath(groupByJob, config.makeGroupedDataDir());
groupByJob.submit();
// ...
dimSelectionJob.getConfiguration().set("io.sort.record.percent", "0.19");
FileOutputFormat.setOutputPath(dimSelectionJob, config.makeIntermediatePath());
dimSelectionJob.submit();
// ...
final Path partitionInfoPath = config.makeSegmentPartitionInfoPath(segmentGranularity);
if (fileSystem == null) {
  fileSystem = partitionInfoPath.getFileSystem(dimSelectionJob.getConfiguration());
}
@Test conf.set("hive.exec.orc.default.compress", "SNAPPY"); Job job = new Job(conf, "orc test"); job.setOutputFormatClass(OrcNewOutputFormat.class); job.setJarByClass(TestNewInputOutputFormat.class); job.setMapperClass(OrcTestMapper2.class); job.setNumReduceTasks(0); job.setOutputKeyClass(NullWritable.class); job.setOutputValueClass(OrcSerdeRow.class); FileInputFormat.addInputPath(job, inputPath); FileOutputFormat.setOutputPath(job, outputPath); boolean result = job.waitForCompletion(true); assertTrue(result); Path outputFilePath = new Path(outputPath, "part-m-00000"); Reader reader = OrcFile.createReader(outputFilePath, OrcFile.readerOptions(conf).filesystem(localFs)); assertEquals(reader.getCompression(), CompressionKind.SNAPPY); localFs.delete(outputPath, true);
/**
 * Tests an MR Scan initialized from properties set in the Configuration.
 */
protected void testScanFromConfiguration(String start, String stop, String last)
    throws IOException, InterruptedException, ClassNotFoundException {
  String jobName = "ScanFromConfig" + (start != null ? start.toUpperCase(Locale.ROOT) : "Empty")
      + "To" + (stop != null ? stop.toUpperCase(Locale.ROOT) : "Empty");
  Configuration c = new Configuration(TEST_UTIL.getConfiguration());
  c.set(TableInputFormat.INPUT_TABLE, TABLE_NAME.getNameAsString());
  c.set(TableInputFormat.SCAN_COLUMN_FAMILY,
      Bytes.toString(INPUT_FAMILYS[0]) + ", " + Bytes.toString(INPUT_FAMILYS[1]));
  c.set(KEY_STARTROW, start != null ? start : "");
  c.set(KEY_LASTROW, last != null ? last : "");
  if (start != null) {
    c.set(TableInputFormat.SCAN_ROW_START, start);
  }
  if (stop != null) {
    c.set(TableInputFormat.SCAN_ROW_STOP, stop);
  }
  Job job = Job.getInstance(c, jobName);
  job.setMapperClass(ScanMapper.class);
  job.setReducerClass(ScanReducer.class);
  job.setMapOutputKeyClass(ImmutableBytesWritable.class);
  job.setMapOutputValueClass(ImmutableBytesWritable.class);
  job.setInputFormatClass(TableInputFormat.class);
  job.setNumReduceTasks(1);
  FileOutputFormat.setOutputPath(job, new Path(job.getJobName()));
  TableMapReduceUtil.addDependencyJars(job);
  assertTrue(job.waitForCompletion(true));
}
parseOptions(options, args);
job = Job.getInstance(getConf(), getOptionValue(OPTION_JOB_NAME));
String cubeName = getOptionValue(OPTION_CUBE_NAME).toUpperCase(Locale.ROOT);
String segmentID = getOptionValue(OPTION_SEGMENT_ID);
Path input = new Path(getOptionValue(OPTION_INPUT_PATH));
Path output = new Path(getOptionValue(OPTION_OUTPUT_PATH));
CubeSegment originalSegment = cube.getOriginalSegmentToOptimize(optSegment);
logger.info("Starting: " + job.getJobName());
job.setMapperClass(UpdateOldCuboidShardMapper.class);
job.setInputFormatClass(SequenceFileInputFormat.class);
FileInputFormat.setInputPaths(job, input);
FileOutputFormat.setOutputPath(job, output);
job.getConfiguration().set(BatchConstants.CFG_CUBE_NAME, cubeName);
job.getConfiguration().set(BatchConstants.CFG_CUBE_SEGMENT_ID, segmentID);
attachSegmentsMetadataWithDict(Lists.newArrayList(optSegment, originalSegment), job.getConfiguration());
public Job createSubmittableJob(String[] args) throws IOException {
  Path partitionsPath = new Path(destPath, PARTITIONS_FILE_NAME);
  generatePartitions(partitionsPath);

  Job job = Job.getInstance(getConf(),
      getConf().get("mapreduce.job.name", "hashTable_" + tableHash.tableName));
  Configuration jobConf = job.getConfiguration();
  jobConf.setLong(HASH_BATCH_SIZE_CONF_KEY, tableHash.batchSize);
  job.setJarByClass(HashTable.class);
  TableMapReduceUtil.initTableMapperJob(tableHash.tableName, tableHash.initScan(),
      HashMapper.class, ImmutableBytesWritable.class, ImmutableBytesWritable.class, job);

  // use a TotalOrderPartitioner and reducers to group region output into hash files
  job.setPartitionerClass(TotalOrderPartitioner.class);
  TotalOrderPartitioner.setPartitionFile(jobConf, partitionsPath);
  job.setReducerClass(Reducer.class); // identity reducer
  job.setNumReduceTasks(tableHash.numHashFiles);
  job.setOutputKeyClass(ImmutableBytesWritable.class);
  job.setOutputValueClass(ImmutableBytesWritable.class);
  job.setOutputFormatClass(MapFileOutputFormat.class);
  FileOutputFormat.setOutputPath(job, new Path(destPath, HASH_DATA_DIR));

  return job;
}
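Because the job writes through MapFileOutputFormat, its output can be read back with the matching reader API, one MapFile per reduce partition. A minimal sketch; the class name and traversal loop are illustrative, not part of HashTable:

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hbase.io.ImmutableBytesWritable;
import org.apache.hadoop.io.MapFile;
import org.apache.hadoop.mapreduce.lib.output.MapFileOutputFormat;

class HashFileScanner {
  /** Opens one MapFile.Reader per reduce partition under the job's output dir. */
  static void scan(Configuration conf, Path hashDataDir) throws IOException {
    MapFile.Reader[] readers = MapFileOutputFormat.getReaders(hashDataDir, conf);
    try {
      ImmutableBytesWritable key = new ImmutableBytesWritable();
      ImmutableBytesWritable value = new ImmutableBytesWritable();
      for (MapFile.Reader reader : readers) {
        while (reader.next(key, value)) {
          // inspect key/value here ...
        }
      }
    } finally {
      for (MapFile.Reader reader : readers) {
        reader.close();
      }
    }
  }
}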
@Override
public List<WorkUnit> getWorkunits(SourceState state) {
  List<String> dirs = Splitter.on(",").splitToList(state.getProp(INPUT_DIRECTORIES_KEY));
  String outputBase = state.getProp(OUTPUT_LOCATION);
  List<WorkUnit> workUnits = Lists.newArrayList();
  for (String dir : dirs) {
    try {
      Path input = new Path(dir);
      Path output = new Path(outputBase, input.getName());
      WorkUnit workUnit = new WorkUnit();
      TaskUtils.setTaskFactoryClass(workUnit, MRTaskFactory.class);
      Configuration conf = new Configuration();
      Job job = Job.getInstance(conf, "WordCount_" + input.getName());
      job.setJarByClass(MRTaskFactoryTest.class);
      job.setMapperClass(TokenizerMapper.class);
      job.setCombinerClass(IntSumReducer.class);
      job.setReducerClass(IntSumReducer.class);
      job.setOutputKeyClass(Text.class);
      job.setOutputValueClass(IntWritable.class);
      job.setNumReduceTasks(1);
      FileInputFormat.addInputPath(job, input);
      FileOutputFormat.setOutputPath(job, output);
      MRTask.serializeJobToState(workUnit, job);
      workUnits.add(workUnit);
    } catch (IOException ioe) {
      log.error("Failed to create MR job for " + dir, ioe);
    }
  }
  return workUnits;
}