@Override
public RecordReader<BytesWritable, BytesWritable> getRecordReader(InputSplit split, JobConf conf, Reporter reporter)
        throws IOException {
    String inputPathString = ((FileSplit) split).getPath().toUri().getPath();
    log.info("Input file path: " + inputPathString);
    Path inputPath = new Path(inputPathString);

    SequenceFile.Reader reader = new SequenceFile.Reader(inputPath.getFileSystem(conf), inputPath, conf);
    SequenceFile.Metadata meta = reader.getMetadata();

    try {
        Text keySchema = meta.get(new Text("key.schema"));
        Text valueSchema = meta.get(new Text("value.schema"));

        if (0 == keySchema.getLength() || 0 == valueSchema.getLength()) {
            throw new Exception();
        }

        // Update JobConf with the schemas read from the file metadata.
        conf.set("mapper.input.key.schema", keySchema.toString());
        conf.set("mapper.input.value.schema", valueSchema.toString());
    } catch (Exception e) {
        throw new IOException("Failed to load schema from file: " + inputPathString + "\n");
    }

    return super.getRecordReader(split, conf, reporter);
}
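For context, a minimal sketch of the writer side that would embed the key.schema and value.schema metadata entries read back by getRecordReader() above. The helper class name and schema handling are assumptions; only the metadata key names come from the snippet.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;

import java.io.IOException;

// Hypothetical helper: stores the key/value schemas in the SequenceFile metadata
// so the record reader above can recover them via SequenceFile.Reader#getMetadata().
public final class SchemaAwareSequenceFileWriter {
    public static SequenceFile.Writer open(Configuration conf, Path path,
            String keySchema, String valueSchema) throws IOException {
        SequenceFile.Metadata meta = new SequenceFile.Metadata();
        meta.set(new Text("key.schema"), new Text(keySchema));
        meta.set(new Text("value.schema"), new Text(valueSchema));
        return SequenceFile.createWriter(path.getFileSystem(conf), conf, path,
                BytesWritable.class, BytesWritable.class,
                SequenceFile.CompressionType.NONE, null, null, meta);
    }
}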
public Boolean readFileWithCache(long startTime) throws IOException, InterruptedException {
    if (fileKey == null) return false;
    BooleanRef gotAllData = new BooleanRef();
    long endOfSplit = split.getStart() + split.getLength();
    this.cachedData = cache.getFileData(fileKey, split.getStart(), endOfSplit,
        writerIncludes, CC_FACTORY, counters, gotAllData);
    if (cachedData == null) {
        // ... (lines elided in this search excerpt)
    }

    uncachedSuffixStart = slices.get(slices.size() - 1).getLastEnd();
    Ref<Integer> stripeIx = Ref.from(0);
    if (uncachedPrefixEnd > split.getStart()) {
        FileSplit sliceSplit = new FileSplit(split.getPath(), split.getStart(),
            uncachedPrefixEnd - split.getStart(), hosts, inMemoryHosts);
        if (!processOneFileSplit(sliceSplit, startTime, stripeIx, null)) return null;
    }

    // ... (lines elided in this search excerpt; "slice" below is the current cached slice)
    long start = slice.getKnownTornStart();
    long len = slice.getLastStart() - start; // Will also read the last row.
    FileSplit sliceSplit = new FileSplit(split.getPath(), start, len, hosts, inMemoryHosts);
    if (!processOneFileSplit(sliceSplit, startTime, stripeIx, slice)) return null;

    // ... (lines elided in this search excerpt)
    long size = split.getPath().getFileSystem(daemonConf).getFileStatus(split.getPath()).getLen();
    isUnfortunate = size > endOfSplit;
    if (isUnfortunate) {
        LlapIoImpl.LOG.warn("One-row mismatch at the end of split " + split.getPath()
            + " at " + endOfSplit + "; file size is " + size);
        FileSplit splitPart = new FileSplit(split.getPath(), uncachedSuffixStart,
            // ... (excerpt truncated)
/**
 * @param clsName Input split class name.
 * @param in Input stream.
 * @param hosts Optional hosts.
 * @return File block or {@code null} if it is not a {@link FileSplit} instance.
 * @throws IgniteCheckedException If failed.
 */
@Nullable public static HadoopFileBlock readFileBlock(String clsName, FSDataInputStream in,
    @Nullable String[] hosts) throws IgniteCheckedException {
    if (!FileSplit.class.getName().equals(clsName))
        return null;

    FileSplit split = U.newInstance(FileSplit.class);

    try {
        split.readFields(in);
    }
    catch (IOException e) {
        throw new IgniteCheckedException(e);
    }

    if (hosts == null)
        hosts = EMPTY_HOSTS;

    return new HadoopFileBlock(hosts, split.getPath().toUri(), split.getStart(), split.getLength());
}
public Optional<InternalHiveSplit> createInternalHiveSplit(FileSplit split)
        throws IOException {
    FileStatus file = fileSystem.getFileStatus(split.getPath());
    return createInternalHiveSplit(
            split.getPath(),
            fileSystem.getFileBlockLocations(file, split.getStart(), split.getLength()),
            split.getStart(),
            split.getLength(),
            file.getLen(),
            OptionalInt.empty(),
            false);
}
    fileSizes = true;
} else if (fileName == null) {
    fileName = new Path(arg);
} else {
    printUsage(null);
    // ... (lines elided in this search excerpt)
}

FileSystem fs = FileSystem.get(fileName.toUri(), conf);
long fileLen = fs.getFileStatus(fileName).getLen();
if (start < 0) {
    start = 0;
}
// ... (lines elided in this search excerpt)
FileSplit split = new FileSplit(fileName, start, length, new JobConf(conf));
RCFileRecordReader recordReader = new RCFileRecordReader(conf, split);
public ArrayList<String> readRecords(URL testFileUrl, int splitSize) throws IOException {
    // Set up context
    File testFile = new File(testFileUrl.getFile());
    long testFileSize = testFile.length();
    Path testFilePath = new Path(testFile.getAbsolutePath());
    Configuration conf = new Configuration();
    conf.setInt("io.file.buffer.size", 1);

    // Gather the records returned by the record reader
    ArrayList<String> records = new ArrayList<String>();

    long offset = 0;
    LongWritable key = new LongWritable();
    Text value = new Text();
    while (offset < testFileSize) {
        FileSplit split = new FileSplit(testFilePath, offset, splitSize, (String[]) null);
        LineRecordReader reader = new LineRecordReader(conf, split);

        while (reader.next(key, value)) {
            records.add(value.toString());
        }
        offset += splitSize;
    }
    return records;
}
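A hedged example of how this helper might be exercised in a test; the fixture name records.txt is an assumption, and JUnit's @Test/assertEquals are assumed to be available.

@Test
public void splitsShouldNotDropOrDuplicateRecords() throws IOException {
    // Hypothetical fixture: a small newline-delimited text file on the test classpath.
    URL testFileUrl = getClass().getClassLoader().getResource("records.txt");

    // Reading with different artificial split sizes must yield the same records,
    // since LineRecordReader assigns each line to exactly one split.
    ArrayList<String> coarse = readRecords(testFileUrl, 4096);
    ArrayList<String> fine = readRecords(testFileUrl, 16);
    assertEquals(coarse, fine);
}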
try {
    boolean sendSerializedEvents = conf.getBoolean(
        "mapreduce.tez.input.initializer.serialize.event.payload", true);
    boolean generateConsistentSplits = HiveConf.getBoolVar(conf,
        HiveConf.ConfVars.HIVE_TEZ_GENERATE_CONSISTENT_SPLITS);
    LOG.info("GenerateConsistentSplitsInHive=" + generateConsistentSplits);
    String realInputFormatName = conf.get("mapred.input.format.class");
    boolean groupingEnabled = userPayloadProto.getGroupingEnabled();
    if (groupingEnabled) {
        final long blockSize = conf.getLong(DFSConfigKeys.DFS_BLOCK_SIZE_KEY,
            // ... (lines elided in this search excerpt)

        FileSplit fileSplit = new FileSplit(fileStatus.getPath(), 0, fileStatus.getLen(), hosts);
        String alias = mapWork.getAliases().get(0);
        PartitionDesc partDesc = mapWork.getAliasToPartnInfo().get(alias);

        // ... (lines elided in this search excerpt; the next lines come from a different part of the file)
        if (inputSplit instanceof FileSplit) {
            final FileSplit fileSplit = (FileSplit) inputSplit;
            final Path path = fileSplit.getPath();
            final String fileStr = path.toString();
            if (!files.contains(fileStr)) {
                files.add(fileStr);
            }
        }
Path dir = new Path(tPart.getSd().getLocation());
long numRows = 0;
long rawDataSize = 0;
long fileSize = 0;
long numFiles = 0;
FileSystem fs = dir.getFileSystem(conf);
FileStatus[] fileList = HiveStatsUtils.getFileStatusRecurse(dir, -1, fs);
// ... (lines elided in this search excerpt)
if (!file.isDir()) {
    InputFormat<?, ?> inputFormat = ReflectionUtil.newInstance(
        partn.getInputFormatClass(), jc);
    InputSplit dummySplit = new FileSplit(file.getPath(), 0, 0,
        new String[] { partn.getLocation() });
    org.apache.hadoop.mapred.RecordReader<?, ?> recordReader =
        // ... (lines elided in this search excerpt)
    rawDataSize += statsRR.getStats().getRawDataSize();
    numRows += statsRR.getStats().getRowCount();
    fileSize += file.getLen();
    numFiles += 1;
    statsAvailable = true;
@SuppressWarnings("unchecked") // InputFormat instantiation
static long readBench(JobConf conf) throws IOException {
    InputFormat inf = conf.getInputFormat();
    final String fn = conf.get("test.filebench.name", "");
    Path pin = new Path(FileInputFormat.getInputPaths(conf)[0], fn);
    FileStatus in = pin.getFileSystem(conf).getFileStatus(pin);
    RecordReader rr = inf.getRecordReader(new FileSplit(pin, 0, in.getLen(), (String[]) null),
        conf, Reporter.NULL);
    try {
        Object key = rr.createKey();
        Object val = rr.createValue();
        Date start = new Date();
        while (rr.next(key, val));
        Date end = new Date();
        return end.getTime() - start.getTime();
    } finally {
        rr.close();
    }
}
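A hedged sketch of how readBench might be driven; the input path, file name, and input format below are assumptions for illustration.

JobConf conf = new JobConf();
conf.setInputFormat(org.apache.hadoop.mapred.TextInputFormat.class);
// Hypothetical benchmark directory and file name under it.
FileInputFormat.setInputPaths(conf, new Path("/benchmarks/filebench"));
conf.set("test.filebench.name", "part-00000");
long elapsedMs = readBench(conf);
System.out.println("Sequential read took " + elapsedMs + " ms");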
public OmnitureDataFileRecordReader(Configuration job, FileSplit split) throws IOException {
    this.maxLineLength = job.getInt("mapred.escapedlinereader.maxlength", Integer.MAX_VALUE);
    this.start = split.getStart();
    this.end = start + split.getLength();
    final Path file = split.getPath();
    CompressionCodecFactory compressionCodecs = new CompressionCodecFactory(job);
    final CompressionCodec codec = compressionCodecs.getCodec(file);

    // Open the file and seek to the start of the split
    FileSystem fs = file.getFileSystem(job);
    FSDataInputStream fileIn = fs.open(split.getPath());
    boolean skipFirstLine = false;
    if (codec != null) {
        lineReader = new EscapedLineReader(codec.createInputStream(fileIn), job);
        end = Long.MAX_VALUE;
    } else {
        if (start != 0) {
            skipFirstLine = true;
            --start;
            fileIn.seek(start);
        }
        lineReader = new EscapedLineReader(fileIn, job);
    }
    if (skipFirstLine) {
        start += lineReader.readLine(new Text(), 0,
            (int) Math.min((long) Integer.MAX_VALUE, end - start));
    }
    this.pos = start;
}
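For context, a hedged sketch of the next() method such a reader typically pairs with, modeled on Hadoop's old-API LineRecordReader. It assumes EscapedLineReader.readLine(Text, int, int) returns the number of bytes consumed, which is not shown in the snippet above.

public synchronized boolean next(LongWritable key, Text value) throws IOException {
    // Keep reading until we find a complete line that starts before the end of the split.
    while (pos < end) {
        key.set(pos);
        int newSize = lineReader.readLine(value, maxLineLength,
            Math.max((int) Math.min(Integer.MAX_VALUE, end - pos), maxLineLength));
        if (newSize == 0) {
            return false; // end of stream
        }
        pos += newSize;
        if (newSize < maxLineLength) {
            return true;
        }
        // Line exceeded maxLineLength; drop it and try the next one.
    }
    return false;
}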
public ConnectorPageSource newPageSource(FileFormatDataSourceStats stats, ConnectorSession session) {
    OrcPageSourceFactory orcPageSourceFactory = new OrcPageSourceFactory(TYPE_MANAGER, false, HDFS_ENVIRONMENT, stats);
    return HivePageSourceProvider.createHivePageSource(
            ImmutableSet.of(),
            ImmutableSet.of(orcPageSourceFactory),
            new Configuration(),
            session,
            fileSplit.getPath(),
            OptionalInt.empty(),
            fileSplit.getStart(),
            fileSplit.getLength(),
            fileSplit.getLength(),
            schema,
            TupleDomain.all(),
            columns,
            partitionKeys,
            DateTimeZone.UTC,
            TYPE_MANAGER,
            ImmutableMap.of(),
            Optional.empty(),
            false)
            .get();
}
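A hedged usage sketch of the page source returned above; it assumes a no-arg FileFormatDataSourceStats constructor, a ConnectorSession already in scope, and that the caller handles the IOException declared by close().

ConnectorPageSource pageSource = newPageSource(new FileFormatDataSourceStats(), session);
long rows = 0;
while (!pageSource.isFinished()) {
    Page page = pageSource.getNextPage();
    if (page != null) {
        rows += page.getPositionCount(); // one batch of rows for the requested columns
    }
}
pageSource.close();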
@Override
public List<WorkUnit> getWorkunits(SourceState state) {
    JobConf jobConf = new JobConf(new Configuration());
    for (String key : state.getPropertyNames()) {
        jobConf.set(key, state.getProp(key));
    }
    // ... (lines elided in this search excerpt)
    FileInputFormat.addInputPath(jobConf, new Path(inputPath));
    // ... (lines elided in this search excerpt)
    WorkUnit workUnit = WorkUnit.create(extract);
    workUnit.setProp(HadoopFileInputSource.FILE_SPLIT_BYTES_STRING_KEY,
        HadoopUtils.serializeToString(fileSplit));
    workUnit.setProp(HadoopFileInputSource.FILE_SPLIT_PATH_KEY, fileSplit.getPath().toString());
    workUnits.add(workUnit);
public AbstractFeatureReader(final InputSplit inputSplit, final JobConf jobConf) throws IOException {
    super(inputSplit, jobConf);

    // No need to check if instance of FileSplit as it is done in super class.
    final FileSplit fileSplit = (FileSplit) inputSplit;

    // Get .shp file
    final Path shpPath = fileSplit.getPath();
    final String dbfName = shpPath.getName().replace(".shp", ".dbf");
    final Path dbfPath = new Path(shpPath.getParent(), dbfName);

    m_dbfStream = dbfPath.getFileSystem(jobConf).open(dbfPath);
    m_dbfReader = new DBFReader(m_dbfStream);

    // Create a list of field names as Hadoop Text instances
    final List<DBFField> fields = m_dbfReader.getFields();
    m_keys = new ArrayList<Text>(fields.size());
    for (final DBFField field : fields) {
        m_keys.add(new Text(field.fieldName));
    }
}
tableProperties.setProperty("columns", Joiner.on(',').join(transform(testColumns, TestColumn::getName)));
tableProperties.setProperty("columns.types", Joiner.on(',').join(transform(testColumns, TestColumn::getType)));
serDe.initialize(new Configuration(), tableProperties);

JobConf jobConf = new JobConf();
configureCompression(jobConf, compressionCodec);
// ... (lines elided in this search excerpt; the following are arguments to a writer-creation call)
        new Path(filePath),
        Text.class,
        compressionCodec != HiveCompressionCodec.NONE,
// ... (lines elided in this search excerpt)
serDe.initialize(new Configuration(), tableProperties);
// ... (lines elided in this search excerpt)
Path path = new Path(filePath);
path.getFileSystem(new Configuration()).setVerifyChecksum(true);
File file = new File(filePath);
return new FileSplit(path, 0, file.length(), new String[0]);
private void setIncrementalConfigParams(InputSplit inputSplit) {
    if (inputSplit instanceof FileSplit) {
        FileSplit fileSplit = (FileSplit) inputSplit;
        this.incrementalConf = new Configuration(false);
        this.incrementalConf.set(JobContext.MAP_INPUT_FILE, fileSplit.getPath().toString());
        this.incrementalConf.setLong(JobContext.MAP_INPUT_START, fileSplit.getStart());
        this.incrementalConf.setLong(JobContext.MAP_INPUT_PATH, fileSplit.getLength());
    }
    LOG.info("Processing split: " + inputSplit);
}
        throws Exception
{
    JobConf configuration = new JobConf(new Configuration(false));
    configuration.set(READ_COLUMN_IDS_CONF_STR, "0");
    configuration.setBoolean(READ_ALL_COLUMNS, false);
    // ... (lines elided in this search excerpt; the following are arguments to a record-reader call)
            new FileSplit(new Path(tempFile.getFile().getAbsolutePath()), 0,
                tempFile.getFile().length(), (String[]) null),
            configuration,
            NULL);
@Test
public void testMaxBlockLocationsOldSplits() throws Exception {
    TEST_DIR.mkdirs();
    try {
        Configuration conf = new Configuration();
        conf.setInt(MRConfig.MAX_BLOCK_LOCATIONS_KEY, 4);
        Path submitDir = new Path(TEST_DIR.getAbsolutePath());
        FileSystem fs = FileSystem.getLocal(conf);
        org.apache.hadoop.mapred.FileSplit split =
            new org.apache.hadoop.mapred.FileSplit(new Path("/some/path"), 0, 1,
                new String[] { "loc1", "loc2", "loc3", "loc4", "loc5" });
        JobSplitWriter.createSplitFiles(submitDir, conf, fs,
            new org.apache.hadoop.mapred.InputSplit[] { split });
        JobSplit.TaskSplitMetaInfo[] infos =
            SplitMetaInfoReader.readSplitMetaInfo(new JobID(), fs, conf, submitDir);
        assertEquals("unexpected number of splits", 1, infos.length);
        assertEquals("unexpected number of split locations", 4, infos[0].getLocations().length);
    } finally {
        FileUtil.fullyDelete(TEST_DIR);
    }
}
serDe.initialize(CONFIGURATION, tableProperties);

JobConf jobConf = new JobConf();
if (compressionCodec != null) {
    CompressionCodec codec = new CompressionCodecFactory(CONFIGURATION).getCodecByName(compressionCodec);
    jobConf.set(COMPRESS_CODEC, codec.getClass().getName());
    jobConf.set(COMPRESS_TYPE, SequenceFile.CompressionType.BLOCK.toString());
}
RecordWriter recordWriter = createRecordWriter(new Path(filePath), CONFIGURATION);
// ... (lines elided in this search excerpt)
Path path = new Path(filePath);
path.getFileSystem(CONFIGURATION).setVerifyChecksum(true);
File file = new File(filePath);
return new FileSplit(path, 0, file.length(), new String[0]);