@Test
public void testAvroSpecificOutput() throws Exception {
  Job job = new Job();
  FileInputFormat.setInputPaths(job, new Path(getClass()
      .getResource("/org/apache/avro/mapreduce/mapreduce-test-input.txt")
      .toURI().toString()));
  job.setInputFormatClass(TextInputFormat.class);
  job.setMapperClass(LineCountMapper.class);
  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(IntWritable.class);
  job.setOutputFormatClass(AvroKeyOutputFormat.class);

  Path outputPath = new Path(DIR.getRoot().getPath() + "/testAvroSpecificOutput");
  outputPath.getFileSystem(job.getConfiguration()).delete(outputPath);
  FileOutputFormat.setOutputPath(job, outputPath);

  Assert.assertTrue(job.waitForCompletion(true));

  FileSystem fileSystem = FileSystem.get(job.getConfiguration());
  FileStatus[] outputFiles = fileSystem.globStatus(outputPath.suffix("/myavro3-*"));
  Assert.assertEquals(1, outputFiles.length);
  Map<String, Integer> counts = new HashMap<>();
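The test above deletes the output directory before submitting the job. This is the usual workaround for FileOutputFormat's output check: if the configured output directory already exists, checkOutputSpecs fails and the job never starts. A minimal sketch of that pattern, using a hypothetical path and the non-deprecated two-argument delete:

// Hypothetical output location; FileOutputFormat requires that it not exist yet.
Path outputPath = new Path("/tmp/example-output");
FileSystem fs = outputPath.getFileSystem(job.getConfiguration());
if (fs.exists(outputPath)) {
  fs.delete(outputPath, true); // true = delete recursively
}
FileOutputFormat.setOutputPath(job, outputPath);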
LOGGER.info("Starting {}", getClass().getSimpleName()); if (fs.exists(outputDir)) { LOGGER.warn("Found the output folder {}, deleting it", _outputDir); fs.delete(outputDir, true); fs.mkdirs(outputDir); Job job = Job.getInstance(getConf()); job.setJarByClass(SegmentCreationJob.class); job.setJobName(_jobName); job.getConfiguration().set("mapreduce.job.credentials.binary", System.getenv("HADOOP_TOKEN_FILE_LOCATION")); job.setMapOutputValueClass(Text.class); FileInputFormat.addInputPath(job, new Path(_stagingDir + "/input/")); FileOutputFormat.setOutputPath(job, new Path(_stagingDir + "/output/")); job.getConfiguration().setInt(JobContext.NUM_MAPS, inputDataFiles.size()); if (_dataSchema != null) { job.getConfiguration().set(JobConfigConstants.SCHEMA, _dataSchema.toString()); setOutputPath(job.getConfiguration());
throw new IOException("Druid broker address not specified in configuration"); String druidQuery = StringEscapeUtils.unescapeJava(conf.get(Constants.DRUID_QUERY_JSON)); LOG.warn("Druid query is empty; creating Select query"); String dataSource = conf.get(Constants.DRUID_DATA_SOURCE); if (dataSource == null || dataSource.isEmpty()) { throw new IOException("Druid data source cannot be empty or null"); conf.set(Constants.DRUID_QUERY_TYPE, druidQueryType); } else { druidQueryType = conf.get(Constants.DRUID_QUERY_TYPE); Job job = Job.getInstance(conf); JobContext jobContext = ShimLoader.getHadoopShims().newJobContext(job); Path[] paths = FileInputFormat.getInputPaths(jobContext);
/**
 * Configures the Hadoop MapReduce job.
 *
 * @return Instance of the Hadoop MapReduce job.
 * @throws IOException If failed.
 */
@SuppressWarnings("deprecation")
private Job createConfigBasedHadoopJob() throws IOException {
    Job jobCfg = new Job();
    Configuration cfg = jobCfg.getConfiguration();

    // Use explicit configuration of distributed file system, if provided.
    cfg.addResource(U.resolveIgniteUrl(DFS_CFG));

    jobCfg.setJobName("HadoopPopularWordExample");
    jobCfg.setJarByClass(HadoopPopularWords.class);
    jobCfg.setInputFormatClass(TextInputFormat.class);
    jobCfg.setOutputKeyClass(Text.class);
    jobCfg.setOutputValueClass(IntWritable.class);
    jobCfg.setMapperClass(TokenizingMapper.class);
    jobCfg.setReducerClass(TopNWordsReducer.class);

    FileInputFormat.setInputPaths(jobCfg, BOOKS_DFS_DIR);
    FileOutputFormat.setOutputPath(jobCfg, RESULT_DFS_DIR);

    // The local job tracker allows only one task per wave, but the text input format
    // replaces that with a value calculated from the input split size option.
    if ("local".equals(cfg.get("mapred.job.tracker", "local"))) {
        // Split job into tasks using 32MB split size.
        FileInputFormat.setMinInputSplitSize(jobCfg, 32L * 1024 * 1024);
        FileInputFormat.setMaxInputSplitSize(jobCfg, Long.MAX_VALUE);
    }

    return jobCfg;
}
@Override
public List<WorkUnit> getWorkunits(SourceState state) {
  try {
    Job job = Job.getInstance(new Configuration());
    FileInputFormat.addInputPath(job, new Path(inputPath));
    FileInputFormat<K, V> fileInputFormat = getFileInputFormat(state, job.getConfiguration());
    List<InputSplit> fileSplits = fileInputFormat.getSplits(job);
    if (fileSplits == null || fileSplits.isEmpty()) {
      return ImmutableList.of();
    }

    WorkUnit workUnit = WorkUnit.create(extract);
    workUnit.setProp(FILE_SPLIT_BYTES_STRING_KEY, HadoopUtils.serializeToString(fileSplit));
    workUnit.setProp(FILE_SPLIT_PATH_KEY, fileSplit.getPath().toString());
    workUnits.add(workUnit);
/**
 * Prepare job with mappers to cancel.
 *
 * @return Configuration of the fully configured job.
 * @throws Exception If fails.
 */
private Configuration prepareJobForCancelling() throws Exception {
    prepareFile("/testFile", 1500);

    executedTasks.set(0);
    cancelledTasks.set(0);
    failMapperId.set(0);
    splitsCount.set(0);

    Configuration cfg = new Configuration();
    setupFileSystems(cfg);

    Job job = Job.getInstance(cfg);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    job.setMapperClass(CancellingTestMapper.class);
    job.setNumReduceTasks(0);
    job.setInputFormatClass(InFormat.class);

    FileInputFormat.setInputPaths(job, new Path("igfs://" + igfsName + "@/"));
    FileOutputFormat.setOutputPath(job, new Path("igfs://" + igfsName + "@/output/"));

    job.setJarByClass(getClass());

    return job.getConfiguration();
}
    throws IOException {
  TableName tableName = TableName.valueOf(args[0]);
  conf.set(TABLE_NAME, tableName.getNameAsString());
  Path inputDir = new Path(args[1]);
  Job job = Job.getInstance(conf, conf.get(JOB_NAME_CONF_KEY, NAME + "_" + tableName));
  job.setJarByClass(Importer.class);
  FileInputFormat.setInputPaths(job, inputDir);
  job.setInputFormatClass(SequenceFileInputFormat.class);
  String hfileOutPath = conf.get(BULK_OUTPUT_CONF_KEY);

  // Bulk-output variant: map output keyed by CellWritableComparable.
  job.setMapperClass(CellSortImporter.class);
  job.setReducerClass(CellReducer.class);
  Path outputDir = new Path(hfileOutPath);
  FileOutputFormat.setOutputPath(job, outputDir);
  job.setMapOutputKeyClass(CellWritableComparable.class);
  job.setMapOutputValueClass(MapReduceExtendedCell.class);

  // Alternative bulk-output variant: map output keyed by row (ImmutableBytesWritable).
  job.setReducerClass(CellSortReducer.class);
  Path outputDir = new Path(hfileOutPath);
  FileOutputFormat.setOutputPath(job, outputDir);
  job.setMapOutputKeyClass(ImmutableBytesWritable.class);
  job.setMapOutputValueClass(MapReduceExtendedCell.class);
@Test conf.set("hive.io.file.read.all.columns", "false"); conf.set("hive.io.file.readcolumn.ids", "1,3"); Job job = new Job(conf, "orc test"); job.setInputFormatClass(OrcNewInputFormat.class); job.setJarByClass(TestNewInputOutputFormat.class); job.setMapperClass(OrcTestMapper1.class); job.setNumReduceTasks(0); job.setOutputKeyClass(Text.class); job.setOutputValueClass(IntWritable.class); FileInputFormat.addInputPath(job, new Path(HiveTestUtils .getFileFromClasspath("orc-file-11-format.orc"))); Path outputPath = new Path(workDir, "TestOrcFile." + testCaseName.getMethodName() + ".txt"); localFs.delete(outputPath, true); FileOutputFormat.setOutputPath(job, outputPath); boolean result = job.waitForCompletion(true); assertTrue(result); Path outputFilePath = new Path(outputPath, "part-m-00000"); new InputStreamReader(localFs.open(outputFilePath))); String line=reader.readLine(); "null, null, null, null, null, null, null}"); localFs.delete(outputPath, true);
@Test conf.set("hive.exec.orc.default.compress", "SNAPPY"); Job job = new Job(conf, "orc test"); job.setOutputFormatClass(OrcNewOutputFormat.class); job.setJarByClass(TestNewInputOutputFormat.class); job.setMapperClass(OrcTestMapper2.class); job.setNumReduceTasks(0); job.setOutputKeyClass(NullWritable.class); job.setOutputValueClass(OrcSerdeRow.class); FileInputFormat.addInputPath(job, inputPath); FileOutputFormat.setOutputPath(job, outputPath); boolean result = job.waitForCompletion(true); assertTrue(result); Path outputFilePath = new Path(outputPath, "part-m-00000"); Reader reader = OrcFile.createReader(outputFilePath, OrcFile.readerOptions(conf).filesystem(localFs)); assertEquals(reader.getCompression(), CompressionKind.SNAPPY); localFs.delete(outputPath, true);
job = Job.getInstance(getConf(), getOptionValue(OPTION_JOB_NAME));
logger.info("Starting: " + job.getJobName());

attachSegmentMetadataWithAll(cubeSeg, job.getConfiguration());
job.getConfiguration().set(BatchConstants.CFG_CUBE_NAME, cubeName);
job.getConfiguration().set(BatchConstants.CFG_CUBE_SEGMENT_ID, segmentID);
job.getConfiguration().set(BatchConstants.CFG_CUBOID_MODE, cuboidModeName);
job.getConfiguration().set(BatchConstants.CFG_UPDATE_SHARD, ifNeedUpdateBaseCuboidShard);

FileInputFormat.setInputPaths(job, new Path(input));
job.setInputFormatClass(SequenceFileInputFormat.class);
job.setOutputValueClass(Text.class);

Path outputPath = new Path(output);
FileOutputFormat.setOutputPath(job, outputPath);
HadoopUtil.deletePath(job.getConfiguration(), outputPath);
/**
 * @throws Exception If failed.
 */
@Test
public void testSimpleTaskSubmit() throws Exception {
    String testInputFile = "/test";

    prepareTestFile(testInputFile);

    Configuration cfg = new Configuration();
    setupFileSystems(cfg);

    Job job = Job.getInstance(cfg);
    job.setMapperClass(TestMapper.class);
    job.setCombinerClass(TestReducer.class);
    job.setReducerClass(TestReducer.class);

    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(IntWritable.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    job.setNumReduceTasks(1);

    FileInputFormat.setInputPaths(job, new Path("igfs://:" + getTestIgniteInstanceName(0) + "@/" + testInputFile));
    FileOutputFormat.setOutputPath(job, new Path("igfs://:" + getTestIgniteInstanceName(0) + "@/output"));

    job.setJarByClass(getClass());

    IgniteInternalFuture<?> fut = grid(0).hadoop().submit(new HadoopJobId(UUID.randomUUID(), 1),
        createJobInfo(job.getConfiguration(), null));

    fut.get();
}
groupByJob = Job.getInstance(
    new Configuration(),
    StringUtils.format("%s-determine_partitions_groupby-%s", config.getDataSource(), config.getIntervals())
);
config.addJobProperties(groupByJob);

FileOutputFormat.setOutputPath(groupByJob, config.makeGroupedDataDir());
groupByJob.submit();

dimSelectionJob.getConfiguration().set("io.sort.record.percent", "0.19");
dimSelectionJob.setMapperClass(DeterminePartitionsDimSelectionPostGroupByMapper.class);
dimSelectionJob.setInputFormatClass(SequenceFileInputFormat.class);
FileInputFormat.addInputPath(dimSelectionJob, config.makeGroupedDataDir());
} else {

FileOutputFormat.setOutputPath(dimSelectionJob, config.makeIntermediatePath());
dimSelectionJob.submit();

final Path partitionInfoPath = config.makeSegmentPartitionInfoPath(segmentGranularity);
if (fileSystem == null) {
  fileSystem = partitionInfoPath.getFileSystem(dimSelectionJob.getConfiguration());
}
parseOptions(options, args);

job = Job.getInstance(getConf(), getOptionValue(OPTION_JOB_NAME));
String cubeName = getOptionValue(OPTION_CUBE_NAME).toUpperCase(Locale.ROOT);
String segmentID = getOptionValue(OPTION_SEGMENT_ID);
Path input = new Path(getOptionValue(OPTION_INPUT_PATH));
Path output = new Path(getOptionValue(OPTION_OUTPUT_PATH));

CubeSegment originalSegment = cube.getOriginalSegmentToOptimize(optSegment);

logger.info("Starting: " + job.getJobName());

job.setMapperClass(UpdateOldCuboidShardMapper.class);
job.setInputFormatClass(SequenceFileInputFormat.class);
FileInputFormat.setInputPaths(job, input);
FileOutputFormat.setOutputPath(job, output);

job.getConfiguration().set(BatchConstants.CFG_CUBE_NAME, cubeName);
job.getConfiguration().set(BatchConstants.CFG_CUBE_SEGMENT_ID, segmentID);
attachSegmentsMetadataWithDict(Lists.newArrayList(optSegment, originalSegment), job.getConfiguration());
@Override
public List<WorkUnit> getWorkunits(SourceState state) {
  List<String> dirs = Splitter.on(",").splitToList(state.getProp(INPUT_DIRECTORIES_KEY));
  String outputBase = state.getProp(OUTPUT_LOCATION);

  List<WorkUnit> workUnits = Lists.newArrayList();
  for (String dir : dirs) {
    try {
      Path input = new Path(dir);
      Path output = new Path(outputBase, input.getName());

      WorkUnit workUnit = new WorkUnit();
      TaskUtils.setTaskFactoryClass(workUnit, MRTaskFactory.class);

      Configuration conf = new Configuration();
      Job job = Job.getInstance(conf, "WordCount_" + input.getName());
      job.setJarByClass(MRTaskFactoryTest.class);
      job.setMapperClass(TokenizerMapper.class);
      job.setCombinerClass(IntSumReducer.class);
      job.setReducerClass(IntSumReducer.class);
      job.setOutputKeyClass(Text.class);
      job.setOutputValueClass(IntWritable.class);
      job.setNumReduceTasks(1);
      FileInputFormat.addInputPath(job, input);
      FileOutputFormat.setOutputPath(job, output);

      MRTask.serializeJobToState(workUnit, job);
      workUnits.add(workUnit);
    } catch (IOException ioe) {
      log.error("Failed to create MR job for " + dir, ioe);
    }
  }
  return workUnits;
}
@Test
public void testGetSplits() throws Exception {
  URI baseUri = new URI(GobblinWorkUnitsInputFormatTest.class.getSimpleName() + "://testGetSplits");
  Configuration configuration = new Configuration();
  Path workUnitsDir = new Path(new Path(baseUri), "/workUnits");

  FileSystem fs = Mockito.mock(FileSystem.class);
  FileStatus[] statuses = createFileStatuses(20, workUnitsDir);
  Mockito.when(fs.listStatus(workUnitsDir)).thenReturn(statuses);
  Mockito.when(fs.makeQualified(Mockito.any(Path.class))).thenAnswer(new Answer<Path>() {
    @Override
    public Path answer(InvocationOnMock invocation) throws Throwable {
      return (Path) invocation.getArguments()[0];
    }
  });
  FileSystemTestUtils.addFileSystemForTest(baseUri, configuration, fs);

  GobblinWorkUnitsInputFormat inputFormat = new GobblinWorkUnitsInputFormat();
  Job job = Job.getInstance(configuration);
  FileInputFormat.addInputPath(job, workUnitsDir);

  List<InputSplit> splits = inputFormat.getSplits(job);
  Assert.assertEquals(splits.size(), 20);
  verifyPaths(splits, statuses);
}
@Test
public void testAddInputPath() throws IOException {
  final Configuration conf = new Configuration();
  conf.set("fs.defaultFS", "file:///abc/");
  final Job j = Job.getInstance(conf);

  //setup default fs
  final FileSystem defaultfs = FileSystem.get(conf);
  System.out.println("defaultfs.getUri() = " + defaultfs.getUri());

  {
    //test addInputPath
    final Path original = new Path("file:/foo");
    System.out.println("original = " + original);
    FileInputFormat.addInputPath(j, original);

    final Path[] results = FileInputFormat.getInputPaths(j);
    System.out.println("results = " + Arrays.asList(results));
    assertEquals(1, results.length);
    assertEquals(original, results[0]);
  }

  {
    //test setInputPaths
    final Path original = new Path("file:/bar");
    System.out.println("original = " + original);
    FileInputFormat.setInputPaths(j, original);

    final Path[] results = FileInputFormat.getInputPaths(j);
    System.out.println("results = " + Arrays.asList(results));
    assertEquals(1, results.length);
    assertEquals(original, results[0]);
  }
}
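The distinction this test exercises: FileInputFormat.addInputPath appends to the job's list of input paths, while setInputPaths replaces the whole list, which is why the second block still sees exactly one path after the first block added file:/foo. A minimal sketch of the difference, with hypothetical paths:

Job j = Job.getInstance(new Configuration());

FileInputFormat.addInputPath(j, new Path("file:/first"));
FileInputFormat.addInputPath(j, new Path("file:/second"));
// FileInputFormat.getInputPaths(j) now returns both paths, in the order they were added.

FileInputFormat.setInputPaths(j, new Path("file:/only"));
// getInputPaths(j) now returns just file:/only; the earlier list was replaced.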
public void checkOutputFormat() throws Exception {
  Job job = new Job();

  WordCountUtil wordCountUtil = new WordCountUtil("trevniMapReduceKeyTest", "part-r-00000");
  wordCountUtil.writeLinesFile();

  AvroJob.setInputKeySchema(job, STRING);
  AvroJob.setOutputKeySchema(job, Pair.getPairSchema(STRING, LONG));

  job.setMapperClass(WordCountMapper.class);
  job.setReducerClass(WordCountReducer.class);

  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(LongWritable.class);

  FileInputFormat.setInputPaths(job, new Path(wordCountUtil.getDir().toString() + "/in"));
  FileOutputFormat.setOutputPath(job, new Path(wordCountUtil.getDir().toString() + "/out"));
  FileOutputFormat.setCompressOutput(job, true);

  job.setInputFormatClass(AvroKeyInputFormat.class);
  job.setOutputFormatClass(AvroTrevniKeyOutputFormat.class);

  job.waitForCompletion(true);

  wordCountUtil.validateCountsFile();
}
parseOptions(options, args);

job = Job.getInstance(getConf(), getOptionValue(OPTION_JOB_NAME));
String job_id = getOptionValue(OPTION_CUBING_JOB_ID);
String cubeName = getOptionValue(OPTION_CUBE_NAME);
Path output = new Path(getOptionValue(OPTION_OUTPUT_PATH));
Path input = new Path(getOptionValue(OPTION_INPUT_PATH));

attachCubeMetadata(cube, job.getConfiguration());

Path path = new Path(input.toString() + "/" + tblColRef.getIdentity());
if (HadoopUtil.getFileSystem(path).exists(path)) {
  FileInputFormat.addInputPath(job, path);
  hasUHCValue = true;
}

setupReducer(output, reducerCount);

job.getConfiguration().set(BatchConstants.CFG_CUBE_NAME, cubeName);
job.getConfiguration().set(BatchConstants.ARG_CUBING_JOB_ID, job_id);
job.getConfiguration().set(BatchConstants.CFG_GLOBAL_DICT_BASE_DIR, KylinConfig.getInstanceFromEnv().getHdfsWorkingDirectory());
job.getConfiguration().set(BatchConstants.CFG_MAPRED_OUTPUT_COMPRESS, "false");
if (inp.endsWith("/*")) { inp = inp.substring(0, inp.length() - 2); FileSystem fs = HadoopUtil.getWorkingFileSystem(job.getConfiguration()); Path path = new Path(inp); logger.warn("Path not exist:" + path.toString()); continue; FileStatus[] fileStatuses = fs.listStatus(path); boolean hasDir = false; for (FileStatus stat : fileStatuses) { if (stat.isDirectory() && !stat.getPath().getName().startsWith("_")) { hasDir = true; ret += addInputDirs(new String[] { stat.getPath().toString() }, job); FileInputFormat.addInputPath(job, new Path(inp)); ret++;
@Test
public void testListStatusErrorOnNonExistantDir() throws IOException {
  Configuration conf = new Configuration();
  conf.setInt(FileInputFormat.LIST_STATUS_NUM_THREADS, numThreads);
  configureTestErrorOnNonExistantDir(conf, localFs);

  Job job = Job.getInstance(conf);
  FileInputFormat<?, ?> fif = new TextInputFormat();
  try {
    fif.listStatus(job);
    Assert.fail("Expecting an IOException for a missing Input path");
  } catch (IOException e) {
    Path expectedExceptionPath = new Path(TEST_ROOT_DIR, "input2");
    expectedExceptionPath = localFs.makeQualified(expectedExceptionPath);
    Assert.assertTrue(e instanceof InvalidInputException);
    Assert.assertEquals("Input path does not exist: " + expectedExceptionPath.toString(),
        e.getMessage());
  }
}
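Across these results the same driver skeleton recurs: create a Job, wire the mapper, reducer, and key/value classes, point FileInputFormat and FileOutputFormat at the input and output paths, then submit and wait. A minimal, self-contained sketch of that pattern; the class name, mapper, reducer, and command-line paths are placeholders, not taken from any of the snippets above:

import java.io.IOException;
import java.util.StringTokenizer;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.mapreduce.lib.output.TextOutputFormat;

public class WordCountDriver {

  /** Splits each input line into tokens and emits (token, 1). */
  public static class TokenizerMapper extends Mapper<LongWritable, Text, Text, IntWritable> {
    private static final IntWritable ONE = new IntWritable(1);
    private final Text word = new Text();

    @Override
    protected void map(LongWritable key, Text value, Context context)
        throws IOException, InterruptedException {
      StringTokenizer tokenizer = new StringTokenizer(value.toString());
      while (tokenizer.hasMoreTokens()) {
        word.set(tokenizer.nextToken());
        context.write(word, ONE);
      }
    }
  }

  /** Sums the counts emitted for each token; also usable as the combiner. */
  public static class IntSumReducer extends Reducer<Text, IntWritable, Text, IntWritable> {
    @Override
    protected void reduce(Text key, Iterable<IntWritable> values, Context context)
        throws IOException, InterruptedException {
      int sum = 0;
      for (IntWritable value : values) {
        sum += value.get();
      }
      context.write(key, new IntWritable(sum));
    }
  }

  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf, "word count");
    job.setJarByClass(WordCountDriver.class);

    job.setMapperClass(TokenizerMapper.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(TextOutputFormat.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);

    // Input and output locations come from the command line; the output
    // directory must not already exist when the job is submitted.
    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));

    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}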