public static void main(String[] args) throws Exception {
    if (args.length < 2) {
        System.err.println("Usage: WordCount <input path> <result path>");
        return;
    }

    final String inputPath = args[0];
    final String outputPath = args[1];

    final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();

    // Set up the Hadoop input format
    Job job = Job.getInstance();
    HadoopInputFormat<LongWritable, Text> hadoopInputFormat =
            new HadoopInputFormat<LongWritable, Text>(new TextInputFormat(), LongWritable.class, Text.class, job);
    TextInputFormat.addInputPath(job, new Path(inputPath));

    // Create a Flink job with it
    DataSet<Tuple2<LongWritable, Text>> text = env.createInput(hadoopInputFormat);

    // Tokenize the lines and convert from Writable "Text" to String for better handling
    DataSet<Tuple2<String, Integer>> words = text.flatMap(new Tokenizer());

    // Sum up the words
    DataSet<Tuple2<String, Integer>> result = words.groupBy(0).aggregate(Aggregations.SUM, 1);

    // Convert String back to Writable "Text" for use with the Hadoop output format
    DataSet<Tuple2<Text, IntWritable>> hadoopResult = result.map(new HadoopDatatypeMapper());

    // Set up the Hadoop output format
    HadoopOutputFormat<Text, IntWritable> hadoopOutputFormat =
            new HadoopOutputFormat<Text, IntWritable>(new TextOutputFormat<Text, IntWritable>(), job);
    // Set the separator under both the new and the deprecated configuration key
    hadoopOutputFormat.getConfiguration().set("mapreduce.output.textoutputformat.separator", " ");
    hadoopOutputFormat.getConfiguration().set("mapred.textoutputformat.separator", " ");
    TextOutputFormat.setOutputPath(job, new Path(outputPath));

    // Output & execute
    hadoopResult.output(hadoopOutputFormat);
    env.execute("Word Count");
}
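// The job above references two user functions, Tokenizer and HadoopDatatypeMapper,
// whose bodies are not part of this snippet. The following is a minimal sketch of
// what they might look like, assuming Flink's standard FlatMapFunction and
// MapFunction interfaces; treat it as illustrative, not as the original code.
public static final class Tokenizer
        implements FlatMapFunction<Tuple2<LongWritable, Text>, Tuple2<String, Integer>> {
    @Override
    public void flatMap(Tuple2<LongWritable, Text> value, Collector<Tuple2<String, Integer>> out) {
        // Normalize the line, split it into words, and emit each word with a count of 1
        for (String token : value.f1.toString().toLowerCase().split("\\W+")) {
            if (!token.isEmpty()) {
                out.collect(new Tuple2<String, Integer>(token, 1));
            }
        }
    }
}

public static final class HadoopDatatypeMapper
        implements MapFunction<Tuple2<String, Integer>, Tuple2<Text, IntWritable>> {
    @Override
    public Tuple2<Text, IntWritable> map(Tuple2<String, Integer> value) {
        // Wrap the plain Java types back into Hadoop Writables
        return new Tuple2<Text, IntWritable>(new Text(value.f0), new IntWritable(value.f1));
    }
}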
// Read the file through the HadoopInputs helper...
input = env.createInput(
        HadoopInputs.readHadoopFile(new TextInputFormat(), LongWritable.class, Text.class, textPath));

// ...or, equivalently, with readHadoopFile statically imported:
input = env.createInput(
        readHadoopFile(new TextInputFormat(), LongWritable.class, Text.class, textPath));
public TextRecordReaderWrapper(org.apache.hadoop.mapreduce.lib.input.CombineFileSplit split,
                               TaskAttemptContext context, Integer idx)
        throws IOException, InterruptedException {
    super(new TextInputFormat(), split, context, idx);
}
@Test
public void testNumInputFilesIgnoreDirs() throws Exception {
    Configuration conf = getConfiguration();
    conf.setInt(FileInputFormat.LIST_STATUS_NUM_THREADS, numThreads);
    conf.setBoolean(FileInputFormat.INPUT_DIR_NONRECURSIVE_IGNORE_SUBDIRS, true);
    Job job = Job.getInstance(conf);
    FileInputFormat<?, ?> fileInputFormat = new TextInputFormat();
    List<InputSplit> splits = fileInputFormat.getSplits(job);
    Assert.assertEquals("Input splits are not correct", 1, splits.size());
    verifySplits(Lists.newArrayList("test:/a1/file1"), splits);
}
@Test
public void testListStatusSimple() throws IOException {
    Configuration conf = new Configuration();
    conf.setInt(FileInputFormat.LIST_STATUS_NUM_THREADS, numThreads);
    List<Path> expectedPaths = configureTestSimple(conf, localFs);
    Job job = Job.getInstance(conf);
    FileInputFormat<?, ?> fif = new TextInputFormat();
    List<FileStatus> statuses = fif.listStatus(job);
    verifyFileStatuses(expectedPaths, statuses, localFs);
}
@Test
public void testListStatusNestedNonRecursive() throws IOException {
    Configuration conf = new Configuration();
    conf.setInt(FileInputFormat.LIST_STATUS_NUM_THREADS, numThreads);
    List<Path> expectedPaths = configureTestNestedNonRecursive(conf, localFs);
    Job job = Job.getInstance(conf);
    FileInputFormat<?, ?> fif = new TextInputFormat();
    List<FileStatus> statuses = fif.listStatus(job);
    verifyFileStatuses(expectedPaths, statuses, localFs);
}
@Test
public void testListStatusNestedRecursive() throws IOException {
    Configuration conf = new Configuration();
    conf.setInt(FileInputFormat.LIST_STATUS_NUM_THREADS, numThreads);
    List<Path> expectedPaths = configureTestNestedRecursive(conf, localFs);
    Job job = Job.getInstance(conf);
    FileInputFormat<?, ?> fif = new TextInputFormat();
    List<FileStatus> statuses = fif.listStatus(job);
    verifyFileStatuses(expectedPaths, statuses, localFs);
}
@Test
public void testNumInputFilesWithoutRecursively() throws Exception {
    Configuration conf = getConfiguration();
    conf.setInt(FileInputFormat.LIST_STATUS_NUM_THREADS, numThreads);
    Job job = Job.getInstance(conf);
    FileInputFormat<?, ?> fileInputFormat = new TextInputFormat();
    List<InputSplit> splits = fileInputFormat.getSplits(job);
    Assert.assertEquals("Input splits are not correct", 2, splits.size());
    verifySplits(Lists.newArrayList("test:/a1/a2", "test:/a1/file1"), splits);
}
@Test
public void testListStatusErrorOnNonExistantDir() throws IOException {
    Configuration conf = new Configuration();
    conf.setInt(FileInputFormat.LIST_STATUS_NUM_THREADS, numThreads);
    configureTestErrorOnNonExistantDir(conf, localFs);
    Job job = Job.getInstance(conf);
    FileInputFormat<?, ?> fif = new TextInputFormat();
    try {
        fif.listStatus(job);
        Assert.fail("Expecting an IOException for a missing Input path");
    } catch (IOException e) {
        Path expectedExceptionPath = new Path(TEST_ROOT_DIR, "input2");
        expectedExceptionPath = localFs.makeQualified(expectedExceptionPath);
        Assert.assertTrue(e instanceof InvalidInputException);
        Assert.assertEquals("Input path does not exist: " + expectedExceptionPath.toString(),
                e.getMessage());
    }
}
@Test
public void testMaxBlockLocationsNewSplitsWithErasureCoding() throws Exception {
    Job job = Job.getInstance(conf);
    final FileInputFormat<?, ?> fileInputFormat = new TextInputFormat();
    final List<InputSplit> splits = fileInputFormat.getSplits(job);
    JobSplitWriter.createSplitFiles(submitDir, conf, fs, splits);
    validateSplitMetaInfo();
}
@Test
public void testListLocatedStatus() throws Exception {
    Configuration conf = getConfiguration();
    conf.setInt(FileInputFormat.LIST_STATUS_NUM_THREADS, numThreads);
    conf.setBoolean("fs.test.impl.disable.cache", false);
    conf.set(FileInputFormat.INPUT_DIR, "test:///a1/a2");
    MockFileSystem mockFs = (MockFileSystem) new Path("test:///").getFileSystem(conf);
    Assert.assertEquals("listLocatedStatus already called", 0, mockFs.numListLocatedStatusCalls);
    Job job = Job.getInstance(conf);
    FileInputFormat<?, ?> fileInputFormat = new TextInputFormat();
    List<InputSplit> splits = fileInputFormat.getSplits(job);
    Assert.assertEquals("Input splits are not correct", 2, splits.size());
    Assert.assertEquals("listLocatedStatus calls", 1, mockFs.numListLocatedStatusCalls);
    FileSystem.closeAll();
}
@Test
public void testNumInputFilesRecursively() throws Exception {
    Configuration conf = getConfiguration();
    conf.set(FileInputFormat.INPUT_DIR_RECURSIVE, "true");
    conf.setInt(FileInputFormat.LIST_STATUS_NUM_THREADS, numThreads);
    Job job = Job.getInstance(conf);
    FileInputFormat<?, ?> fileInputFormat = new TextInputFormat();
    List<InputSplit> splits = fileInputFormat.getSplits(job);
    Assert.assertEquals("Input splits are not correct", 3, splits.size());
    verifySplits(Lists.newArrayList("test:/a1/a2/file2", "test:/a1/a2/file3", "test:/a1/file1"), splits);

    // Repeat using the deprecated configuration key
    conf = getConfiguration();
    conf.set("mapred.input.dir.recursive", "true");
    job = Job.getInstance(conf);
    splits = fileInputFormat.getSplits(job);
    verifySplits(Lists.newArrayList("test:/a1/a2/file2", "test:/a1/a2/file3", "test:/a1/file1"), splits);
}
public TextRecordReaderWrapper(CombineFileSplit split, TaskAttemptContext context, Integer idx)
        throws IOException, InterruptedException {
    super(new TextInputFormat(), split, context, idx);
}
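// A wrapper like the one above is only used indirectly: CombineFileRecordReader
// instantiates it once per chunk of a combined split. A minimal sketch of the
// wiring, assuming a hypothetical CombineFileInputFormat subclass named
// CombinedTextInputFormat (not part of the original snippet):
public class CombinedTextInputFormat extends CombineFileInputFormat<LongWritable, Text> {
    @Override
    public RecordReader<LongWritable, Text> createRecordReader(InputSplit split, TaskAttemptContext context)
            throws IOException {
        // CombineFileRecordReader calls the (CombineFileSplit, TaskAttemptContext, Integer)
        // constructor of TextRecordReaderWrapper for each file in the combined split
        return new CombineFileRecordReader<LongWritable, Text>(
                (CombineFileSplit) split, context, TextRecordReaderWrapper.class);
    }
}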
@Override @SuppressWarnings("unchecked") public InputFormat getInputFormat() throws IOException { // We will use TextInputFormat, the default Hadoop input format for // text. It has a LongWritable key that we will ignore, and the value // is a Text (a string writable) that the JSON data is in. return new TextInputFormat(); }
@Override @SuppressWarnings("rawtypes") public InputFormat getInputFormat() throws IOException { return new TextInputFormat(); }
@Test
public void testSplitLocationInfo() throws Exception {
    Configuration conf = getConfiguration();
    conf.set(org.apache.hadoop.mapreduce.lib.input.FileInputFormat.INPUT_DIR, "test:///a1/a2");
    Job job = Job.getInstance(conf);
    TextInputFormat fileInputFormat = new TextInputFormat();
    List<InputSplit> splits = fileInputFormat.getSplits(job);
    String[] locations = splits.get(0).getLocations();
    Assert.assertEquals(2, locations.length);
    SplitLocationInfo[] locationInfo = splits.get(0).getLocationInfo();
    Assert.assertEquals(2, locationInfo.length);
    SplitLocationInfo localhostInfo = locations[0].equals("localhost")
            ? locationInfo[0] : locationInfo[1];
    SplitLocationInfo otherhostInfo = locations[0].equals("otherhost")
            ? locationInfo[0] : locationInfo[1];
    Assert.assertTrue(localhostInfo.isOnDisk());
    Assert.assertTrue(localhostInfo.isInMemory());
    Assert.assertTrue(otherhostInfo.isOnDisk());
    Assert.assertFalse(otherhostInfo.isInMemory());
}
@Override
public List<InputSplit> getSplits(JobContext context) throws IOException {
    // Create the delegate input format lazily on first use
    if (input == null) {
        input = new TextInputFormat();
    }
    return input.getSplits(context);
}
@Test
public void testNumInputFiles() throws Exception {
    Configuration conf = spy(new Configuration());
    Job job = make(stub(Job.class).returning(conf).from.getConfiguration());
    FileStatus stat = make(stub(FileStatus.class).returning(0L).from.getLen());
    TextInputFormat ispy = spy(new TextInputFormat());
    doReturn(Arrays.asList(stat)).when(ispy).listStatus(job);
    ispy.getSplits(job);
    verify(conf).setLong(FileInputFormat.NUM_INPUT_FILES, 1);
}