@Override public List<InputSplit> getSplits(JobContext ctx) throws IOException {
    List<InputSplit> res = super.getSplits(ctx);

    splitsCount.set(res.size());

    X.println("___ split of input: " + splitsCount.get());

    return res;
}
@Test public void testSplitLocationInfo() throws Exception {
    Configuration conf = getConfiguration();
    conf.set(org.apache.hadoop.mapreduce.lib.input.FileInputFormat.INPUT_DIR, "test:///a1/a2");
    Job job = Job.getInstance(conf);
    TextInputFormat fileInputFormat = new TextInputFormat();
    List<InputSplit> splits = fileInputFormat.getSplits(job);
    String[] locations = splits.get(0).getLocations();
    Assert.assertEquals(2, locations.length);
    SplitLocationInfo[] locationInfo = splits.get(0).getLocationInfo();
    Assert.assertEquals(2, locationInfo.length);
    SplitLocationInfo localhostInfo = locations[0].equals("localhost") ? locationInfo[0] : locationInfo[1];
    SplitLocationInfo otherhostInfo = locations[0].equals("otherhost") ? locationInfo[0] : locationInfo[1];
    Assert.assertTrue(localhostInfo.isOnDisk());
    Assert.assertTrue(localhostInfo.isInMemory());
    Assert.assertTrue(otherhostInfo.isOnDisk());
    Assert.assertFalse(otherhostInfo.isInMemory());
}
@Override public List<InputSplit> getSplits( JobContext context ) throws IOException {
    if( input == null ) {
        input = new TextInputFormat();
    }
    return input.getSplits( context );
}
@Override public List<InputSplit> getSplits(JobContext context) throws IOException, InterruptedException {
    return delegate.getSplits(context);
}
@Override public List<InputSplit> getSplits(JobContext job) throws IOException {
    List<InputSplit> superSplits = super.getSplits(job);
    List<InputSplit> splits = new ArrayList<InputSplit>();

    int numGroups = WikipediaConfiguration.getNumGroups(job.getConfiguration());

    // Replicate each underlying file split once per group, so every (split, group)
    // pair is handled by its own mapper.
    for (int group = 0; group < numGroups; group++) {
        for (InputSplit split : superSplits) {
            FileSplit fileSplit = (FileSplit) split;
            splits.add(new WikipediaInputSplit(fileSplit, group));
        }
    }
    return splits;
}
@Override public List<InputSplit> getSplits(JobContext job) throws IOException {
    List<InputSplit> splits = super.getSplits(job);
    List<InputSplit> newSplits = new ArrayList<InputSplit>();

    // Duplicate the worker splits NN_TEST_SCALE times, skipping Pig/Hadoop metadata files.
    for (int i = 0; i < job.getConfiguration().getInt(NNConstants.NN_TEST_SCALE, 1); i++) {
        for (InputSplit inputSplit : splits) {
            if (isNotPigOrHadoopMetaFile(((FileSplit) inputSplit).getPath())) {
                newSplits.add(new GuaguaInputSplit(false, new FileSplit[] { (FileSplit) inputSplit }));
            }
        }
    }
    // Add one extra split for the master task.
    newSplits.add(new GuaguaInputSplit(true, (FileSplit) null));

    int mapperSize = newSplits.size();
    LOG.info("inputs size including master: {}", mapperSize);
    LOG.debug("input splits including: {}", newSplits);
    job.getConfiguration().set(GuaguaConstants.GUAGUA_WORKER_NUMBER, (mapperSize - 1) + "");
    return newSplits;
}
@Override public List<InputSplit> getSplits(JobContext job) throws IOException {
    List<InputSplit> newSplits = null;
    boolean combinable = job.getConfiguration().getBoolean(SHIFU_VS_SPLIT_COMBINABLE, false);
    if (combinable) {
        @SuppressWarnings("deprecation")
        // use this deprecated method so that it still works on 0.20.2
        long blockSize = FileSystem.get(job.getConfiguration()).getDefaultBlockSize();
        long combineSize = job.getConfiguration().getLong(SHIFU_VS_SPLIT_MAX_COMBINED_SPLIT_SIZE, blockSize);
        if (combineSize == 0) {
            combineSize = blockSize;
        }
        job.getConfiguration().setLong(GuaguaMapReduceConstants.MAPRED_MIN_SPLIT_SIZE, 1L);
        job.getConfiguration().setLong(GuaguaMapReduceConstants.MAPRED_MAX_SPLIT_SIZE, combineSize);
        // in Hadoop 2.0 these keys were renamed
        job.getConfiguration().setLong("mapreduce.input.fileinputformat.split.minsize", 1);
        job.getConfiguration().setLong("mapreduce.input.fileinputformat.split.maxsize", combineSize);
        List<InputSplit> splits = super.getSplits(job);
        LOG.debug("combine size:{}, splits:{}", combineSize, splits);
        newSplits = getFinalCombineSplits(splits, combineSize);
    } else {
        newSplits = getCommonSplits(job);
    }
    LOG.info("Input size: {}", newSplits.size());
    return newSplits;
}
@Override protected void configureJob(Job job) throws IOException {
    Configuration conf = job.getConfiguration();
    job.setJarByClass(PartialBuilder.class);

    FileInputFormat.setInputPaths(job, getDataPath());
    FileOutputFormat.setOutputPath(job, getOutputPath(conf));

    job.setOutputKeyClass(TreeID.class);
    job.setOutputValueClass(MapredOutput.class);

    job.setMapperClass(Step1Mapper.class);
    job.setNumReduceTasks(0); // no reducers

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    // For this implementation to work, mapred.map.tasks needs to be set to the actual
    // number of mappers Hadoop will use:
    TextInputFormat inputFormat = new TextInputFormat();
    List<?> splits = inputFormat.getSplits(job);
    if (splits == null || splits.isEmpty()) {
        log.warn("Unable to compute number of splits?");
    } else {
        int numSplits = splits.size();
        log.info("Setting mapred.map.tasks = {}", numSplits);
        conf.setInt("mapred.map.tasks", numSplits);
    }
}
@Test public void testNumInputFiles() throws Exception {
    Configuration conf = spy(new Configuration());
    Job job = make(stub(Job.class).returning(conf).from.getConfiguration());
    FileStatus stat = make(stub(FileStatus.class).returning(0L).from.getLen());
    TextInputFormat ispy = spy(new TextInputFormat());
    doReturn(Arrays.asList(stat)).when(ispy).listStatus(job);
    ispy.getSplits(job);
    // getSplits() must record the number of input files in the job configuration.
    verify(conf).setLong(FileInputFormat.NUM_INPUT_FILES, 1);
}
List<InputSplit> splits = super.getSplits(job);
LOG.debug("combine size:{}, splits:{}", combineSize, splits);
newSplits = getFinalCombineGuaguaSplits(splits, combineSize);