// Excerpt: the input DataSet is built either via the HadoopInputs helper or the
// readHadoopFile(...) shortcut; the surrounding if-condition is not part of this excerpt,
// so the flag below is only a stand-in for the elided condition.
DataSet<Tuple2<LongWritable, Text>> input;
if (useHadoopInputsHelper) { // hypothetical flag
    input = env.createInput(HadoopInputs.readHadoopFile(
            new TextInputFormat(), LongWritable.class, Text.class, textPath));
} else {
    input = env.createInput(readHadoopFile(
            new TextInputFormat(), LongWritable.class, Text.class, textPath));
}

Job job = Job.getInstance();
HadoopOutputFormat<Text, LongWritable> hadoopOutputFormat =
        new HadoopOutputFormat<Text, LongWritable>(new TextOutputFormat<Text, LongWritable>(), job);
job.getConfiguration().set("mapred.textoutputformat.separator", " ");
TextOutputFormat.setOutputPath(job, new Path(resultPath));
@Override
public List<InputSplit> getSplits(JobContext ctx) throws IOException {
    List<InputSplit> res = super.getSplits(ctx);
    splitsCount.set(res.size());
    X.println("___ split of input: " + splitsCount.get());
    return res;
}
void runMRCreateFail(String dbName, String tableName,
        Map<String, String> partitionValues, List<HCatFieldSchema> columns) throws Exception {
    Job job = new Job(mrConf, "hcat mapreduce write fail test");
    job.setJarByClass(this.getClass());
    job.setMapperClass(TestHCatPartitionPublish.MapFail.class);

    // input/output settings
    job.setInputFormatClass(TextInputFormat.class);
    Path path = new Path(fs.getWorkingDirectory(), "mapred/testHCatMapReduceInput");
    // The write count does not matter, as the map will fail in its first call.
    createInputFile(path, 5);
    TextInputFormat.setInputPaths(job, path);

    job.setOutputFormatClass(HCatOutputFormat.class);
    OutputJobInfo outputJobInfo = OutputJobInfo.create(dbName, tableName, partitionValues);
    HCatOutputFormat.setOutput(job, outputJobInfo);

    job.setMapOutputKeyClass(BytesWritable.class);
    job.setMapOutputValueClass(DefaultHCatRecord.class);
    job.setNumReduceTasks(0);
    HCatOutputFormat.setSchema(job, new HCatSchema(columns));

    boolean success = job.waitForCompletion(true);
    Assert.assertFalse(success);
}
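The MapFail mapper referenced above is not shown in this snippet. A minimal sketch of a mapper that forces the job to fail on its first record, assuming the map output types configured above, could look like this (illustrative only, not the test's actual implementation):

// Hypothetical failing mapper: throws as soon as the first record arrives, so
// waitForCompletion(true) returns false regardless of how many input lines were written.
public static class MapFail extends Mapper<LongWritable, Text, BytesWritable, DefaultHCatRecord> {
    @Override
    protected void map(LongWritable key, Text value, Context context)
            throws IOException, InterruptedException {
        throw new IOException("Deliberate failure to exercise the abort/cleanup path");
    }
}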
public static void main(String[] args) throws Exception {
    if (args.length < 2) {
        System.err.println("Usage: WordCount <input path> <result path>");
        return;
    }
    final String inputPath = args[0];
    final String outputPath = args[1];

    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    // Set up the Hadoop Input Format
    Job job = Job.getInstance();
    HadoopInputFormat<LongWritable, Text> hadoopInputFormat =
            new HadoopInputFormat<LongWritable, Text>(new TextInputFormat(), LongWritable.class, Text.class, job);
    TextInputFormat.addInputPath(job, new Path(inputPath));

    // Create a Flink job with it
    DataSet<Tuple2<LongWritable, Text>> text = env.createInput(hadoopInputFormat);

    // Tokenize the line and convert from Writable "Text" to String for better handling
    DataSet<Tuple2<String, Integer>> words = text.flatMap(new Tokenizer());

    // Sum up the words
    DataSet<Tuple2<String, Integer>> result = words.groupBy(0).aggregate(Aggregations.SUM, 1);

    // Convert String back to Writable "Text" for use with Hadoop Output Format
    DataSet<Tuple2<Text, IntWritable>> hadoopResult = result.map(new HadoopDatatypeMapper());

    // Set up Hadoop Output Format
    HadoopOutputFormat<Text, IntWritable> hadoopOutputFormat =
            new HadoopOutputFormat<Text, IntWritable>(new TextOutputFormat<Text, IntWritable>(), job);
    // Set the separator under both the new and the old property name, since this
    // example may run against either Hadoop configuration.
    hadoopOutputFormat.getConfiguration().set("mapreduce.output.textoutputformat.separator", " ");
    hadoopOutputFormat.getConfiguration().set("mapred.textoutputformat.separator", " ");
    TextOutputFormat.setOutputPath(job, new Path(outputPath));

    // Output & Execute
    hadoopResult.output(hadoopOutputFormat);
    env.execute("Word Count");
}
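The Tokenizer and HadoopDatatypeMapper classes used above are not part of this snippet. A minimal sketch of what they could look like, assuming Flink's FlatMapFunction and MapFunction interfaces, is shown below; the real classes in the example may differ:

// Illustrative sketch only.
public static final class Tokenizer
        implements FlatMapFunction<Tuple2<LongWritable, Text>, Tuple2<String, Integer>> {
    @Override
    public void flatMap(Tuple2<LongWritable, Text> value, Collector<Tuple2<String, Integer>> out) {
        // split the line into words and emit (word, 1) pairs
        for (String token : value.f1.toString().toLowerCase().split("\\W+")) {
            if (token.length() > 0) {
                out.collect(new Tuple2<String, Integer>(token, 1));
            }
        }
    }
}

public static final class HadoopDatatypeMapper
        implements MapFunction<Tuple2<String, Integer>, Tuple2<Text, IntWritable>> {
    @Override
    public Tuple2<Text, IntWritable> map(Tuple2<String, Integer> value) {
        // convert back to Hadoop Writable types before handing off to the output format
        return new Tuple2<Text, IntWritable>(new Text(value.f0), new IntWritable(value.f1));
    }
}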
Configuration conf = new Configuration();
conf.set("hive.metastore.uris", "thrift://no.such.machine:10888");

Job job = new Job(conf, "Write-hcat-seq-table");
job.setJarByClass(TestPassProperties.class);
job.setMapperClass(Map.class);
job.setOutputKeyClass(NullWritable.class);
job.setOutputValueClass(DefaultHCatRecord.class);

job.setInputFormatClass(TextInputFormat.class);
TextInputFormat.setInputPaths(job, INPUT_FILE_NAME);
public int run(String[] args) throws IOException {
    Configuration conf = getConf();
    Job job = new Job(conf);
    job.setJobName("MySQLBulkLoading");
    job.setMapperClass(DelimitedLoadMapper.class);
    job.setJarByClass(DelimitedLoadMapper.class);
    job.setNumReduceTasks(0);

    job.setInputFormatClass(TextInputFormat.class);
    TextInputFormat.addInputPath(job, new Path(args[0]));

    job.setMapOutputKeyClass(NullWritable.class);
    job.setMapOutputValueClass(NullWritable.class);
    job.setOutputFormatClass(DBOutputFormat.class);

    int ret = 0;
    try {
        ret = job.waitForCompletion(true) ? 0 : 1;
    } catch (Exception e) {
        e.printStackTrace();
    }
    return ret;
}
Job countingJob = new Job(conf, "JobChaining-Counting");
countingJob.setJarByClass(BasicJobChaining.class);

countingJob.setMapperClass(UserIdCountMapper.class);
countingJob.setCombinerClass(LongSumReducer.class);
countingJob.setReducerClass(UserIdSumReducer.class);

countingJob.setInputFormatClass(TextInputFormat.class);
TextInputFormat.addInputPath(countingJob, postInput);
countingJob.setOutputFormatClass(TextOutputFormat.class);

// The second (binning) job, created elsewhere in the driver, reads the
// intermediate output of the counting job.
binningJob.setInputFormatClass(TextInputFormat.class);
TextInputFormat.addInputPath(binningJob, outputDirIntermediate);
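Since binningJob is only referenced here, a minimal sketch of how the two jobs could be chained in the driver follows; the job name, mapper class, and completion handling are assumptions, not the original driver's code:

// Sketch of the chaining step; outputDirIntermediate is assumed to be the counting job's output.
TextOutputFormat.setOutputPath(countingJob, outputDirIntermediate);

if (countingJob.waitForCompletion(true)) {
    // only start the second job once the first has produced its intermediate output
    Job binningJob = new Job(new Configuration(), "JobChaining-Binning");
    binningJob.setJarByClass(BasicJobChaining.class);
    binningJob.setMapperClass(UserIdBinningMapper.class); // hypothetical mapper name
    binningJob.setNumReduceTasks(0);

    binningJob.setInputFormatClass(TextInputFormat.class);
    TextInputFormat.addInputPath(binningJob, outputDirIntermediate);
    binningJob.waitForCompletion(true);
}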
@Override
protected void configureJob(Job job) throws IOException {
    Configuration conf = job.getConfiguration();

    job.setJarByClass(PartialBuilder.class);

    FileInputFormat.setInputPaths(job, getDataPath());
    FileOutputFormat.setOutputPath(job, getOutputPath(conf));

    job.setOutputKeyClass(TreeID.class);
    job.setOutputValueClass(MapredOutput.class);

    job.setMapperClass(Step1Mapper.class);
    job.setNumReduceTasks(0); // no reducers

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    // For this implementation to work, mapred.map.tasks needs to be set to the actual
    // number of mappers Hadoop will use:
    TextInputFormat inputFormat = new TextInputFormat();
    List<?> splits = inputFormat.getSplits(job);
    if (splits == null || splits.isEmpty()) {
        log.warn("Unable to compute number of splits?");
    } else {
        int numSplits = splits.size();
        log.info("Setting mapred.map.tasks = {}", numSplits);
        conf.setInt("mapred.map.tasks", numSplits);
    }
}
final Job job = new Job(getConf(), "Write HDFS Index to Accumulo");
job.setJarByClass(this.getClass());

final Configuration jobConf = job.getConfiguration();
jobConf.setBoolean("mapred.map.tasks.speculative.execution", false);
setVarOrders(sparql, jobConf);

TextInputFormat.setInputPaths(job, inputDir);
job.setInputFormatClass(TextInputFormat.class);
JobConf conf = new JobConf();
conf.set("mapreduce.framework.name", "local");
Job job = new Job(conf);

TextInputFormat.setInputPaths(job, new Path(in.getPath()));
TextOutputFormat.setOutputPath(job, new Path(out.getPath()));

job.setMapperClass(Map.class);
job.setReducerClass(Reduce.class);
job.setInputFormatClass(TextInputFormat.class);
job.setMapOutputKeyClass(Text.class);
conf.set(LilyJythonMapper.BULK_MODE, Boolean.toString(bulkMode));

Job job = new Job(conf);
job.setJarByClass(BulkImportTool.class);
job.setMapperClass(LilyJythonMapper.class);
job.setInputFormatClass(TextInputFormat.class);
job.setOutputFormatClass(HFileOutputFormat.class);
job.setOutputValueClass(KeyValue.class);
job.setJobName(formatJobName());

TextInputFormat.addInputPath(job, new Path(inputPath));
HFileOutputFormat.setOutputPath(job, tmpDir);
conf.set(HFILE_PATH, tmpDir.toUri().toString());
public static void main(String[] args) throws Exception {
    Path outDir = new Path("output");
    Configuration conf = new Configuration();
    Job job = Job.getInstance(conf, "user name check");

    job.setJarByClass(UserNamePermission.class);
    job.setMapperClass(UserNamePermission.UserNameMapper.class);
    job.setCombinerClass(UserNamePermission.UserNameReducer.class);
    job.setMapOutputKeyClass(Text.class);
    job.setMapOutputValueClass(Text.class);
    job.setReducerClass(UserNamePermission.UserNameReducer.class);
    job.setNumReduceTasks(1);

    job.setInputFormatClass(TextInputFormat.class);
    TextInputFormat.addInputPath(job, new Path("input"));
    FileOutputFormat.setOutputPath(job, outDir);

    System.exit(job.waitForCompletion(true) ? 0 : 1);
}
private int createParitionFile(String sequenceFileInput, String outputPath, float frequency)
        throws IOException, ClassNotFoundException, InterruptedException {
    Configuration config = getConf();
    config.setFloat(SAMPLE_FREQUENCY, frequency);

    Job sampler = new Job(config);
    sampler.setInputFormatClass(TextInputFormat.class);
    sampler.setOutputFormatClass(TextOutputFormat.class);
    sampler.setOutputKeyClass(Text.class);
    sampler.setOutputValueClass(NullWritable.class);
    sampler.setNumReduceTasks(0);
    sampler.setMapperClass(Map.class);

    TextInputFormat.addInputPath(sampler, new Path(sequenceFileInput));
    TextOutputFormat.setOutputPath(sampler, new Path(outputPath));

    sampler.waitForCompletion(true);
    return 0;
}
public int run(String[] args) throws Exception {
    Configuration conf = getConf();
    populateConfiguration(args, conf);
    try {
        checkMandatoryConfs(conf);
    } catch (HIHOException e1) {
        e1.printStackTrace();
        throw new Exception(e1);
    }

    Job job = new Job(conf);
    job.getConfiguration().setInt(MRJobConfig.NUM_MAPS, conf.getInt(HIHOConf.NUMBER_MAPPERS, 1));
    job.setJobName("HihoDBExport");
    job.setMapperClass(GenericDBLoadDataMapper.class);
    job.setJarByClass(ExportToDB.class);
    job.setNumReduceTasks(0);

    job.setInputFormatClass(TextInputFormat.class);
    TextInputFormat.addInputPath(job, new Path(inputPath));
    GenericDBOutputFormat.setOutput(job, tableName, columnNames);

    int ret = 0;
    try {
        ret = job.waitForCompletion(true) ? 0 : 1;
    } catch (Exception e) {
        e.printStackTrace();
    }
    return ret;
}
@Test
public void testListStatusSimple() throws IOException {
    Configuration conf = new Configuration();
    conf.setInt(FileInputFormat.LIST_STATUS_NUM_THREADS, numThreads);

    List<Path> expectedPaths = configureTestSimple(conf, localFs);

    Job job = Job.getInstance(conf);
    FileInputFormat<?, ?> fif = new TextInputFormat();
    List<FileStatus> statuses = fif.listStatus(job);

    verifyFileStatuses(expectedPaths, statuses, localFs);
}
private void initialiseInput(final Job job, final MapReduce operation) throws IOException {
    job.setInputFormatClass(TextInputFormat.class);
    for (final Map.Entry<String, String> entry : operation.getInputMapperPairs().entrySet()) {
        if (entry.getValue().contains(job.getConfiguration().get(MAPPER_GENERATOR))) {
            TextInputFormat.addInputPath(job, new Path(entry.getKey()));
        }
    }
}
private int createParitionFile(String inputPath, String outputFile, float frequency, int samplesCnt)
        throws IOException, ClassNotFoundException, InterruptedException {
    Path input = new Path(inputPath);

    Job sampler = new Job(getConf());
    TextInputFormat.addInputPath(sampler, input);

    InputSampler.Sampler<LongWritable, Text> inputSampler =
            new InputSampler.RandomSampler<LongWritable, Text>(frequency, samplesCnt);
    Path partitionFile = new Path(outputFile);
    TotalOrderPartitioner.setPartitionFile(sampler.getConfiguration(), partitionFile);
    InputSampler.writePartitionFile(sampler, inputSampler);
    return 0;
}
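The partition file written above only takes effect once a follow-up job is configured to use it. A minimal sketch of that wiring, with assumed job and variable names (sortJob, numPartitions), is:

// Sketch only: a later job that consumes the partition file written above.
Job sortJob = new Job(getConf(), "total-order-sort");
sortJob.setInputFormatClass(TextInputFormat.class);
TextInputFormat.addInputPath(sortJob, new Path(inputPath));

sortJob.setPartitionerClass(TotalOrderPartitioner.class);
TotalOrderPartitioner.setPartitionFile(sortJob.getConfiguration(), new Path(outputFile));

// one reducer per key range defined by the sampled partition boundaries;
// the map output key type must match the key type that was sampled
sortJob.setNumReduceTasks(numPartitions); // numPartitions is an assumed value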
public static Path compressAndIndex(Path file, Configuration conf) throws IOException {
    Configuration tmpConfig = new Configuration(conf);
    tmpConfig.setLong("dfs.block.size", 512);
    tmpConfig.setInt(LzoCodec.LZO_BUFFER_SIZE_KEY, 512);

    Path compressedFile = LzopFileReadWrite.compress(file, tmpConfig);
    compressedFile.getFileSystem(tmpConfig)
            .delete(new Path(compressedFile.toString() + LzoIndex.LZO_INDEX_SUFFIX), false);
    new LzoIndexer(tmpConfig).index(compressedFile);

    LzoIndex index = LzoIndex.readIndex(compressedFile.getFileSystem(tmpConfig), compressedFile);
    for (int i = 0; i < index.getNumberOfBlocks(); i++) {
        System.out.println("block[" + i + "] = " + index.getPosition(i));
    }

    Job job = new Job(conf);
    job.setInputFormatClass(LzoTextInputFormat.class);
    LzoTextInputFormat inputFormat = new LzoTextInputFormat();
    TextInputFormat.setInputPaths(job, compressedFile);

    List<InputSplit> is = inputFormat.getSplits(job);
    System.out.println("input splits = " + is.size());
    return compressedFile;
}
@Test
public void testNumInputFiles() throws Exception {
    Configuration conf = spy(new Configuration());
    Job job = make(stub(Job.class).returning(conf).from.getConfiguration());
    FileStatus stat = make(stub(FileStatus.class).returning(0L).from.getLen());

    TextInputFormat ispy = spy(new TextInputFormat());
    doReturn(Arrays.asList(stat)).when(ispy).listStatus(job);

    ispy.getSplits(job);
    verify(conf).setLong(FileInputFormat.NUM_INPUT_FILES, 1);
}
@Test
public void testSplitLocationInfo() throws Exception {
    Configuration conf = getConfiguration();
    conf.set(org.apache.hadoop.mapreduce.lib.input.FileInputFormat.INPUT_DIR, "test:///a1/a2");

    Job job = Job.getInstance(conf);
    TextInputFormat fileInputFormat = new TextInputFormat();
    List<InputSplit> splits = fileInputFormat.getSplits(job);
    String[] locations = splits.get(0).getLocations();
    Assert.assertEquals(2, locations.length);

    SplitLocationInfo[] locationInfo = splits.get(0).getLocationInfo();
    Assert.assertEquals(2, locationInfo.length);
    SplitLocationInfo localhostInfo =
            locations[0].equals("localhost") ? locationInfo[0] : locationInfo[1];
    SplitLocationInfo otherhostInfo =
            locations[0].equals("otherhost") ? locationInfo[0] : locationInfo[1];

    Assert.assertTrue(localhostInfo.isOnDisk());
    Assert.assertTrue(localhostInfo.isInMemory());
    Assert.assertTrue(otherhostInfo.isOnDisk());
    Assert.assertFalse(otherhostInfo.isInMemory());
}