@Override public List<InputSplit> getSplits(JobContext ctx) throws IOException {
    List<InputSplit> res = super.getSplits(ctx);

    splitsCount.set(res.size());

    X.println("___ split of input: " + splitsCount.get());

    return res;
}
@Test public void testSplitLocationInfo() throws Exception {
    Configuration conf = getConfiguration();
    conf.set(org.apache.hadoop.mapreduce.lib.input.FileInputFormat.INPUT_DIR, "test:///a1/a2");
    Job job = Job.getInstance(conf);
    TextInputFormat fileInputFormat = new TextInputFormat();
    List<InputSplit> splits = fileInputFormat.getSplits(job);
    String[] locations = splits.get(0).getLocations();
    Assert.assertEquals(2, locations.length);
    SplitLocationInfo[] locationInfo = splits.get(0).getLocationInfo();
    Assert.assertEquals(2, locationInfo.length);
    SplitLocationInfo localhostInfo = locations[0].equals("localhost") ? locationInfo[0] : locationInfo[1];
    SplitLocationInfo otherhostInfo = locations[0].equals("otherhost") ? locationInfo[0] : locationInfo[1];
    Assert.assertTrue(localhostInfo.isOnDisk());
    Assert.assertTrue(localhostInfo.isInMemory());
    Assert.assertTrue(otherhostInfo.isOnDisk());
    Assert.assertFalse(otherhostInfo.isInMemory());
}
@Override public List<InputSplit> getSplits( JobContext context ) throws IOException {
    if( input == null ) {
        input = new TextInputFormat();
    }
    return input.getSplits( context );
}
@Override public List<InputSplit> getSplits(JobContext context) throws IOException, InterruptedException {
    return delegate.getSplits(context);
}
@Override public List<InputSplit> getSplits(JobContext job) throws IOException {
    List<InputSplit> superSplits = super.getSplits(job);
    List<InputSplit> splits = new ArrayList<InputSplit>();

    int numGroups = WikipediaConfiguration.getNumGroups(job.getConfiguration());

    // Replicate each underlying file split once per group, so every (split, group)
    // pair is handled by its own mapper.
    for (int group = 0; group < numGroups; group++) {
        for (InputSplit split : superSplits) {
            FileSplit fileSplit = (FileSplit) split;
            splits.add(new WikipediaInputSplit(fileSplit, group));
        }
    }
    return splits;
}
@Override public List<InputSplit> getSplits(JobContext job) throws IOException {
    List<InputSplit> splits = super.getSplits(job);
    List<InputSplit> newSplits = new ArrayList<InputSplit>();

    // Duplicate the worker splits NN_TEST_SCALE times, skipping Pig/Hadoop metadata files.
    for (int i = 0; i < job.getConfiguration().getInt(NNConstants.NN_TEST_SCALE, 1); i++) {
        for (InputSplit inputSplit : splits) {
            if (isNotPigOrHadoopMetaFile(((FileSplit) inputSplit).getPath())) {
                newSplits.add(new GuaguaInputSplit(false, new FileSplit[] { (FileSplit) inputSplit }));
            }
        }
    }
    // Add one extra split for the master task.
    newSplits.add(new GuaguaInputSplit(true, (FileSplit) null));

    int mapperSize = newSplits.size();
    LOG.info("inputs size including master: {}", mapperSize);
    LOG.debug("input splits including: {}", newSplits);
    job.getConfiguration().set(GuaguaConstants.GUAGUA_WORKER_NUMBER, (mapperSize - 1) + "");
    return newSplits;
}
@Override public List<InputSplit> getSplits(JobContext job) throws IOException {
    List<InputSplit> newSplits = null;
    boolean combinable = job.getConfiguration().getBoolean(SHIFU_VS_SPLIT_COMBINABLE, false);
    if (combinable) {
        @SuppressWarnings("deprecation")
        // use this deprecated method so that it still works on 0.20.2
        long blockSize = FileSystem.get(job.getConfiguration()).getDefaultBlockSize();
        long combineSize = job.getConfiguration().getLong(SHIFU_VS_SPLIT_MAX_COMBINED_SPLIT_SIZE, blockSize);
        if (combineSize == 0) {
            combineSize = blockSize;
        }
        job.getConfiguration().setLong(GuaguaMapReduceConstants.MAPRED_MIN_SPLIT_SIZE, 1L);
        job.getConfiguration().setLong(GuaguaMapReduceConstants.MAPRED_MAX_SPLIT_SIZE, combineSize);
        // in Hadoop 2.0 these keys were renamed
        job.getConfiguration().setLong("mapreduce.input.fileinputformat.split.minsize", 1);
        job.getConfiguration().setLong("mapreduce.input.fileinputformat.split.maxsize", combineSize);
        List<InputSplit> splits = super.getSplits(job);
        LOG.debug("combine size:{}, splits:{}", combineSize, splits);
        newSplits = getFinalCombineSplits(splits, combineSize);
    } else {
        newSplits = getCommonSplits(job);
    }
    LOG.info("Input size: {}", newSplits.size());
    return newSplits;
}
@Override protected void configureJob(Job job) throws IOException {
    Configuration conf = job.getConfiguration();
    job.setJarByClass(PartialBuilder.class);

    FileInputFormat.setInputPaths(job, getDataPath());
    FileOutputFormat.setOutputPath(job, getOutputPath(conf));

    job.setOutputKeyClass(TreeID.class);
    job.setOutputValueClass(MapredOutput.class);

    job.setMapperClass(Step1Mapper.class);
    job.setNumReduceTasks(0); // no reducers

    job.setInputFormatClass(TextInputFormat.class);
    job.setOutputFormatClass(SequenceFileOutputFormat.class);

    // For this implementation to work, mapred.map.tasks needs to be set to the actual
    // number of mappers Hadoop will use:
    TextInputFormat inputFormat = new TextInputFormat();
    List<?> splits = inputFormat.getSplits(job);
    if (splits == null || splits.isEmpty()) {
        log.warn("Unable to compute number of splits?");
    } else {
        int numSplits = splits.size();
        log.info("Setting mapred.map.tasks = {}", numSplits);
        conf.setInt("mapred.map.tasks", numSplits);
    }
}
@Test public void testNumInputFiles() throws Exception {
    Configuration conf = spy(new Configuration());
    Job job = make(stub(Job.class).returning(conf).from.getConfiguration());
    FileStatus stat = make(stub(FileStatus.class).returning(0L).from.getLen());
    TextInputFormat ispy = spy(new TextInputFormat());
    doReturn(Arrays.asList(stat)).when(ispy).listStatus(job);
    ispy.getSplits(job);
    // getSplits() must record the number of input files in the job configuration.
    verify(conf).setLong(FileInputFormat.NUM_INPUT_FILES, 1);
}
List<InputSplit> splits = super.getSplits(job);
LOG.debug("combine size:{}, splits:{}", combineSize, splits);
newSplits = getFinalCombineGuaguaSplits(splits, combineSize);