public static void main(String[] args) throws Exception {
    if (args.length < 2) {
        System.err.println("Usage: WordCount <input path> <result path>");
        return;
    }

    final String inputPath = args[0];
    final String outputPath = args[1];

    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    // Set up the Hadoop input format
    Job job = Job.getInstance();
    HadoopInputFormat<LongWritable, Text> hadoopInputFormat =
            new HadoopInputFormat<LongWritable, Text>(new TextInputFormat(), LongWritable.class, Text.class, job);
    TextInputFormat.addInputPath(job, new Path(inputPath));

    // Create a Flink job with it
    DataSet<Tuple2<LongWritable, Text>> text = env.createInput(hadoopInputFormat);

    // Tokenize the lines and convert from Writable "Text" to String for better handling
    DataSet<Tuple2<String, Integer>> words = text.flatMap(new Tokenizer());

    // Sum up the words
    DataSet<Tuple2<String, Integer>> result = words.groupBy(0).aggregate(Aggregations.SUM, 1);

    // Convert String back to Writable "Text" for use with the Hadoop output format
    DataSet<Tuple2<Text, IntWritable>> hadoopResult = result.map(new HadoopDatatypeMapper());

    // Set up the Hadoop output format
    HadoopOutputFormat<Text, IntWritable> hadoopOutputFormat =
            new HadoopOutputFormat<Text, IntWritable>(new TextOutputFormat<Text, IntWritable>(), job);
    // Set the separator under both the new and the deprecated configuration key
    hadoopOutputFormat.getConfiguration().set("mapreduce.output.textoutputformat.separator", " ");
    hadoopOutputFormat.getConfiguration().set("mapred.textoutputformat.separator", " ");
    TextOutputFormat.setOutputPath(job, new Path(outputPath));

    // Output & execute
    hadoopResult.output(hadoopOutputFormat);
    env.execute("Word Count");
}
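// The job above references two user functions, Tokenizer and HadoopDatatypeMapper,
// whose bodies are not part of this snippet. The following is a minimal sketch of
// what they might look like, assuming Flink's standard FlatMapFunction and
// MapFunction interfaces; treat it as illustrative, not as the original code.
public static final class Tokenizer
        implements FlatMapFunction<Tuple2<LongWritable, Text>, Tuple2<String, Integer>> {
    @Override
    public void flatMap(Tuple2<LongWritable, Text> value, Collector<Tuple2<String, Integer>> out) {
        // Normalize the line, split it into words, and emit each word with a count of 1
        for (String token : value.f1.toString().toLowerCase().split("\\W+")) {
            if (!token.isEmpty()) {
                out.collect(new Tuple2<String, Integer>(token, 1));
            }
        }
    }
}

public static final class HadoopDatatypeMapper
        implements MapFunction<Tuple2<String, Integer>, Tuple2<Text, IntWritable>> {
    @Override
    public Tuple2<Text, IntWritable> map(Tuple2<String, Integer> value) {
        // Wrap the plain Java types back into Hadoop Writables
        return new Tuple2<Text, IntWritable>(new Text(value.f0), new IntWritable(value.f1));
    }
}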
// Read the file through the HadoopInputs helper...
input = env.createInput(
        HadoopInputs.readHadoopFile(new TextInputFormat(), LongWritable.class, Text.class, textPath));

// ...or, equivalently, with readHadoopFile statically imported:
input = env.createInput(
        readHadoopFile(new TextInputFormat(), LongWritable.class, Text.class, textPath));
public TextRecordReaderWrapper(org.apache.hadoop.mapreduce.lib.input.CombineFileSplit split,
                               TaskAttemptContext context, Integer idx)
        throws IOException, InterruptedException {
    super(new TextInputFormat(), split, context, idx);
}
@Test
public void testNumInputFilesIgnoreDirs() throws Exception {
    Configuration conf = getConfiguration();
    conf.setInt(FileInputFormat.LIST_STATUS_NUM_THREADS, numThreads);
    conf.setBoolean(FileInputFormat.INPUT_DIR_NONRECURSIVE_IGNORE_SUBDIRS, true);
    Job job = Job.getInstance(conf);
    FileInputFormat<?, ?> fileInputFormat = new TextInputFormat();
    List<InputSplit> splits = fileInputFormat.getSplits(job);
    Assert.assertEquals("Input splits are not correct", 1, splits.size());
    verifySplits(Lists.newArrayList("test:/a1/file1"), splits);
}
@Test
public void testListStatusSimple() throws IOException {
    Configuration conf = new Configuration();
    conf.setInt(FileInputFormat.LIST_STATUS_NUM_THREADS, numThreads);
    List<Path> expectedPaths = configureTestSimple(conf, localFs);
    Job job = Job.getInstance(conf);
    FileInputFormat<?, ?> fif = new TextInputFormat();
    List<FileStatus> statuses = fif.listStatus(job);
    verifyFileStatuses(expectedPaths, statuses, localFs);
}
@Test
public void testListStatusNestedNonRecursive() throws IOException {
    Configuration conf = new Configuration();
    conf.setInt(FileInputFormat.LIST_STATUS_NUM_THREADS, numThreads);
    List<Path> expectedPaths = configureTestNestedNonRecursive(conf, localFs);
    Job job = Job.getInstance(conf);
    FileInputFormat<?, ?> fif = new TextInputFormat();
    List<FileStatus> statuses = fif.listStatus(job);
    verifyFileStatuses(expectedPaths, statuses, localFs);
}
@Test
public void testListStatusNestedRecursive() throws IOException {
    Configuration conf = new Configuration();
    conf.setInt(FileInputFormat.LIST_STATUS_NUM_THREADS, numThreads);
    List<Path> expectedPaths = configureTestNestedRecursive(conf, localFs);
    Job job = Job.getInstance(conf);
    FileInputFormat<?, ?> fif = new TextInputFormat();
    List<FileStatus> statuses = fif.listStatus(job);
    verifyFileStatuses(expectedPaths, statuses, localFs);
}
@Test
public void testNumInputFilesWithoutRecursively() throws Exception {
    Configuration conf = getConfiguration();
    conf.setInt(FileInputFormat.LIST_STATUS_NUM_THREADS, numThreads);
    Job job = Job.getInstance(conf);
    FileInputFormat<?, ?> fileInputFormat = new TextInputFormat();
    List<InputSplit> splits = fileInputFormat.getSplits(job);
    Assert.assertEquals("Input splits are not correct", 2, splits.size());
    verifySplits(Lists.newArrayList("test:/a1/a2", "test:/a1/file1"), splits);
}
@Test
public void testListStatusErrorOnNonExistantDir() throws IOException {
    Configuration conf = new Configuration();
    conf.setInt(FileInputFormat.LIST_STATUS_NUM_THREADS, numThreads);
    configureTestErrorOnNonExistantDir(conf, localFs);
    Job job = Job.getInstance(conf);
    FileInputFormat<?, ?> fif = new TextInputFormat();
    try {
        fif.listStatus(job);
        Assert.fail("Expecting an IOException for a missing Input path");
    } catch (IOException e) {
        Path expectedExceptionPath = new Path(TEST_ROOT_DIR, "input2");
        expectedExceptionPath = localFs.makeQualified(expectedExceptionPath);
        Assert.assertTrue(e instanceof InvalidInputException);
        Assert.assertEquals("Input path does not exist: " + expectedExceptionPath.toString(),
                e.getMessage());
    }
}
@Test
public void testMaxBlockLocationsNewSplitsWithErasureCoding() throws Exception {
    Job job = Job.getInstance(conf);
    final FileInputFormat<?, ?> fileInputFormat = new TextInputFormat();
    final List<InputSplit> splits = fileInputFormat.getSplits(job);
    JobSplitWriter.createSplitFiles(submitDir, conf, fs, splits);
    validateSplitMetaInfo();
}
@Test
public void testListLocatedStatus() throws Exception {
    Configuration conf = getConfiguration();
    conf.setInt(FileInputFormat.LIST_STATUS_NUM_THREADS, numThreads);
    conf.setBoolean("fs.test.impl.disable.cache", false);
    conf.set(FileInputFormat.INPUT_DIR, "test:///a1/a2");
    MockFileSystem mockFs = (MockFileSystem) new Path("test:///").getFileSystem(conf);
    Assert.assertEquals("listLocatedStatus already called", 0, mockFs.numListLocatedStatusCalls);
    Job job = Job.getInstance(conf);
    FileInputFormat<?, ?> fileInputFormat = new TextInputFormat();
    List<InputSplit> splits = fileInputFormat.getSplits(job);
    Assert.assertEquals("Input splits are not correct", 2, splits.size());
    Assert.assertEquals("listLocatedStatus calls", 1, mockFs.numListLocatedStatusCalls);
    FileSystem.closeAll();
}
@Test
public void testNumInputFilesRecursively() throws Exception {
    Configuration conf = getConfiguration();
    conf.set(FileInputFormat.INPUT_DIR_RECURSIVE, "true");
    conf.setInt(FileInputFormat.LIST_STATUS_NUM_THREADS, numThreads);
    Job job = Job.getInstance(conf);
    FileInputFormat<?, ?> fileInputFormat = new TextInputFormat();
    List<InputSplit> splits = fileInputFormat.getSplits(job);
    Assert.assertEquals("Input splits are not correct", 3, splits.size());
    verifySplits(Lists.newArrayList("test:/a1/a2/file2", "test:/a1/a2/file3", "test:/a1/file1"), splits);

    // Repeat using the deprecated configuration key
    conf = getConfiguration();
    conf.set("mapred.input.dir.recursive", "true");
    job = Job.getInstance(conf);
    splits = fileInputFormat.getSplits(job);
    verifySplits(Lists.newArrayList("test:/a1/a2/file2", "test:/a1/a2/file3", "test:/a1/file1"), splits);
}
public TextRecordReaderWrapper(CombineFileSplit split, TaskAttemptContext context, Integer idx)
        throws IOException, InterruptedException {
    super(new TextInputFormat(), split, context, idx);
}
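// A wrapper like the one above is only used indirectly: CombineFileRecordReader
// instantiates it once per chunk of a combined split. A minimal sketch of the
// wiring, assuming a hypothetical CombineFileInputFormat subclass named
// CombinedTextInputFormat (not part of the original snippet):
public class CombinedTextInputFormat extends CombineFileInputFormat<LongWritable, Text> {
    @Override
    public RecordReader<LongWritable, Text> createRecordReader(InputSplit split, TaskAttemptContext context)
            throws IOException {
        // CombineFileRecordReader calls the (CombineFileSplit, TaskAttemptContext, Integer)
        // constructor of TextRecordReaderWrapper for each file in the combined split
        return new CombineFileRecordReader<LongWritable, Text>(
                (CombineFileSplit) split, context, TextRecordReaderWrapper.class);
    }
}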
@Override @SuppressWarnings("unchecked") public InputFormat getInputFormat() throws IOException { // We will use TextInputFormat, the default Hadoop input format for // text. It has a LongWritable key that we will ignore, and the value // is a Text (a string writable) that the JSON data is in. return new TextInputFormat(); }
@Override @SuppressWarnings("rawtypes") public InputFormat getInputFormat() throws IOException { return new TextInputFormat(); }
@Test
public void testSplitLocationInfo() throws Exception {
    Configuration conf = getConfiguration();
    conf.set(org.apache.hadoop.mapreduce.lib.input.FileInputFormat.INPUT_DIR, "test:///a1/a2");
    Job job = Job.getInstance(conf);
    TextInputFormat fileInputFormat = new TextInputFormat();
    List<InputSplit> splits = fileInputFormat.getSplits(job);
    String[] locations = splits.get(0).getLocations();
    Assert.assertEquals(2, locations.length);
    SplitLocationInfo[] locationInfo = splits.get(0).getLocationInfo();
    Assert.assertEquals(2, locationInfo.length);
    SplitLocationInfo localhostInfo = locations[0].equals("localhost")
            ? locationInfo[0] : locationInfo[1];
    SplitLocationInfo otherhostInfo = locations[0].equals("otherhost")
            ? locationInfo[0] : locationInfo[1];
    Assert.assertTrue(localhostInfo.isOnDisk());
    Assert.assertTrue(localhostInfo.isInMemory());
    Assert.assertTrue(otherhostInfo.isOnDisk());
    Assert.assertFalse(otherhostInfo.isInMemory());
}
@Override
public List<InputSplit> getSplits(JobContext context) throws IOException {
    // Create the delegate input format lazily on first use
    if (input == null) {
        input = new TextInputFormat();
    }
    return input.getSplits(context);
}
@Test
public void testNumInputFiles() throws Exception {
    Configuration conf = spy(new Configuration());
    Job job = make(stub(Job.class).returning(conf).from.getConfiguration());
    FileStatus stat = make(stub(FileStatus.class).returning(0L).from.getLen());
    TextInputFormat ispy = spy(new TextInputFormat());
    doReturn(Arrays.asList(stat)).when(ispy).listStatus(job);
    ispy.getSplits(job);
    verify(conf).setLong(FileInputFormat.NUM_INPUT_FILES, 1);
}