/**
 * {@inheritDoc}
 *
 * <p>This method throws a {@link ClassCastException} if type {@code D} is not compatible with
 * type {@code K} when keys are to be read, or if it is not compatible with type {@code V} when
 * values are to be read.
 */
@Override
@SuppressWarnings("unchecked")
public D readRecord(@Deprecated D reuse) throws DataRecordException, IOException {
  K key = this.recordReader.createKey();
  V value = this.recordReader.createValue();
  if (this.recordReader.next(key, value)) {
    return this.readKeys ? (D) key : (D) value;
  }
  return null;
}
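// Usage sketch (hedged): a caller typically drains such an extractor by calling readRecord
// until it returns null. The standalone example below demonstrates the same key-or-value
// switch against a plain mapred TextInputFormat; the class name, input path, and the value
// of the readKeys flag are assumptions, not taken from the snippet above.
import java.io.IOException;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileSplit;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.TextInputFormat;

public class KeyOrValueReadSketch {
  public static void main(String[] args) throws IOException {
    boolean readKeys = false; // mirrors the readKeys flag used by readRecord above
    JobConf conf = new JobConf();
    Path input = new Path("/tmp/sample.txt"); // assumed input file
    TextInputFormat format = new TextInputFormat();
    format.configure(conf);
    long len = input.getFileSystem(conf).getFileStatus(input).getLen();
    RecordReader<LongWritable, Text> reader = format.getRecordReader(
        new FileSplit(input, 0, len, (String[]) null), conf, Reporter.NULL);
    LongWritable key = reader.createKey();
    Text value = reader.createValue();
    while (reader.next(key, value)) {
      Object record = readKeys ? key : value; // same switch as readRecord above
      System.out.println(record);
    }
    reader.close();
  }
}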
protected org.apache.hadoop.mapred.RecordReader setReaderAtSplit(int splitNum) throws IOException {
  JobConf localJc = getLocalFSJobConfClone(jc);
  currentSplitPointer = splitNum;
  if (rr != null) {
    rr.close();
  }
  // open a record reader on the requested split, using the local-FS clone of the job conf
  rr = inputFormat.getRecordReader(inputSplits[currentSplitPointer], localJc, reporter);
  currentSplitPointer++;
  return rr;
}
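// Usage sketch (hedged): setReaderAtSplit is typically driven by a loop over all splits.
// `inputSplits` below is the field referenced above; the key/value types are left raw,
// so this is an illustration rather than the source's own driver code.
for (int i = 0; i < inputSplits.length; i++) {
  org.apache.hadoop.mapred.RecordReader reader = setReaderAtSplit(i);
  Object key = reader.createKey();
  Object value = reader.createValue();
  while (reader.next(key, value)) {
    // consume (key, value) here
  }
}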
@Override
public float getProgress() throws IOException {
  return delegate.getProgress();
}
PassThruOffsetReader(RecordReader sourceReader) {
  this.sourceReader = sourceReader;
  key = sourceReader.createKey();
  value = (Writable) sourceReader.createValue();
}
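// Sketch (hedged): the constructor above only caches the delegate reader and its reusable
// key/value pair; the pass-through behaviour normally lives in a method that forwards
// straight to the wrapped reader. The signature below is illustrative and not taken from
// the source class.
public boolean next() throws IOException {
  // advance by delegating to the wrapped reader, reusing the cached key/value objects
  return sourceReader.next(key, value);
}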
@Override
public void open(HadoopInputSplit split) throws IOException {
  // enforce sequential open() calls
  synchronized (OPEN_MUTEX) {
    this.recordReader = this.mapredInputFormat.getRecordReader(
        split.getHadoopInputSplit(), jobConf, new HadoopDummyReporter());
    if (this.recordReader instanceof Configurable) {
      ((Configurable) this.recordReader).setConf(jobConf);
    }
    key = this.recordReader.createKey();
    value = this.recordReader.createValue();
    this.fetched = false;
  }
}
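// Sketch (hedged): the `fetched` flag reset at the end of open() is normally consumed by a
// lazy-fetch step on the read path; the pair of methods below shows what that companion
// logic might look like. The `hasNext` field is an assumption and does not appear above.
public boolean reachedEnd() throws IOException {
  if (!fetched) {
    fetchNext();
  }
  return !hasNext;
}

protected void fetchNext() throws IOException {
  // pull the next key/value pair from the wrapped mapred RecordReader
  hasNext = this.recordReader.next(key, value);
  fetched = true;
}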
Configuration conf = new Configuration();
OrcOutputFormat of = new OrcOutputFormat();
FileSystem fs = FileSystem.getLocal(conf);
Path root = new Path(tmpDir, "testRecordReaderDelta").makeQualified(fs);
fs.delete(root, true);
ObjectInspector inspector;
synchronized (TestOrcFile.class) {
  // (inspector initialization, delta-file writing, and JobConf/InputFormat setup
  //  are elided in this excerpt)
}
job.setBoolean(hive_metastoreConstants.TABLE_IS_TRANSACTIONAL, true);
InputSplit[] splits = inf.getSplits(job, 5);
assertEquals(2, splits.length);
org.apache.hadoop.mapred.RecordReader<NullWritable, OrcStruct> rr;
for (int j = 0; j < splits.length; j++) {
  InputSplit split = splits[j];
  rr = inf.getRecordReader(split, job, Reporter.NULL);
  OrcStruct row = rr.createValue();
  for (int i = 0; i < values[j].length; ++i) {
    System.out.println("Checking " + i);
    String msg = "split[" + j + "] at i=" + i;
    assertEquals(msg, true, rr.next(NullWritable.get(), row));
    assertEquals(msg, values[j][i], row.getFieldValue(0).toString());
  }
  // each split should be exhausted once its expected values have been read
  assertEquals(false, rr.next(NullWritable.get(), row));
}
@Test
public void readExcelInputFormatExcel2003Empty() throws IOException {
  JobConf job = new JobConf(defaultConf);
  ClassLoader classLoader = getClass().getClassLoader();
  String fileName = "excel2003empty.xls";
  String fileNameSpreadSheet = classLoader.getResource(fileName).getFile();
  Path file = new Path(fileNameSpreadSheet);
  FileInputFormat.setInputPaths(job, file);
  // set locale to the one of the test data
  job.set("hadoopoffice.locale.bcp47", "de");
  ExcelFileInputFormat format = new ExcelFileInputFormat();
  format.configure(job);
  InputSplit[] inputSplits = format.getSplits(job, 1);
  assertEquals(1, inputSplits.length, "Only one split generated for Excel file");
  RecordReader<Text, ArrayWritable> reader = format.getRecordReader(inputSplits[0], job, reporter);
  assertNotNull(reader, "Format returned null RecordReader");
  Text spreadSheetKey = new Text();
  ArrayWritable spreadSheetValue = new ArrayWritable(SpreadSheetCellDAO.class);
  assertTrue(reader.next(spreadSheetKey, spreadSheetValue), "Input Split for Excel file contains row 1");
  assertEquals(0, spreadSheetValue.get().length, "Input Split for Excel file contains row 1 and it is empty");
  assertFalse(reader.next(spreadSheetKey, spreadSheetValue), "Input Split for Excel file contains no further row");
}
@SuppressWarnings("unchecked") // InputFormat instantiation static long readBench(JobConf conf) throws IOException { InputFormat inf = conf.getInputFormat(); final String fn = conf.get("test.filebench.name", ""); Path pin = new Path(FileInputFormat.getInputPaths(conf)[0], fn); FileStatus in = pin.getFileSystem(conf).getFileStatus(pin); RecordReader rr = inf.getRecordReader(new FileSplit(pin, 0, in.getLen(), (String[])null), conf, Reporter.NULL); try { Object key = rr.createKey(); Object val = rr.createValue(); Date start = new Date(); while (rr.next(key, val)); Date end = new Date(); return end.getTime() - start.getTime(); } finally { rr.close(); } }
@Test
public void readEthereumBlockInputFormatBlock3346406()
    throws IOException, EthereumBlockReadException, ParseException, InterruptedException {
  JobConf job = new JobConf(defaultConf);
  ClassLoader classLoader = getClass().getClassLoader();
  String fileName = "eth3346406.bin";
  String fileNameBlock = classLoader.getResource("testdata/" + fileName).getFile();
  Path file = new Path(fileNameBlock);
  FileInputFormat.setInputPaths(job, file);
  EthereumBlockFileInputFormat format = new EthereumBlockFileInputFormat();
  format.configure(job);
  InputSplit[] inputSplits = format.getSplits(job, 1);
  assertEquals(1, inputSplits.length, "Only one split generated for block 3346406");
  RecordReader<BytesWritable, EthereumBlock> reader = format.getRecordReader(inputSplits[0], job, reporter);
  assertNotNull(reader, "Format returned null RecordReader");
  BytesWritable key = new BytesWritable();
  EthereumBlock block = new EthereumBlock();
  assertTrue(reader.next(key, block), "Input Split for block 3346406 contains at least one block");
  assertEquals(7, block.getEthereumTransactions().size(), "Block 3346406 must have 7 transactions");
  assertFalse(reader.next(key, block), "No further blocks in block 3346406");
  reader.close();
}
private void writeThenReadByRecordReader(int intervalRecordCount, int writeCount,
    int splitNumber, long minSplitSize, CompressionCodec codec) throws IOException {
  Path testDir = new Path(System.getProperty("test.tmp.dir", ".") + "/mapred/testsmallfirstsplit");
  Path testFile = new Path(testDir, "test_rcfile");
  fs.delete(testFile, true);
  Configuration cloneConf = new Configuration(conf);
  RCFileOutputFormat.setColumnNumber(cloneConf, bytesArray.length);
  // (RCFile writer setup and the write of `writeCount` records with `codec` elided in this excerpt)

  JobConf jobConf = new JobConf(cloneConf);
  jobConf.set("mapred.input.dir", testDir.toString());
  HiveConf.setLongVar(jobConf, HiveConf.ConfVars.MAPREDMINSPLITSIZE, minSplitSize);
  InputSplit[] splits = inputFormat.getSplits(jobConf, splitNumber);
  int readCount = 0;
  for (int i = 0; i < splits.length; i++) {
    int previousReadCount = readCount;
    RecordReader rr = inputFormat.getRecordReader(splits[i], jobConf, Reporter.NULL);
    Object key = rr.createKey();
    Object value = rr.createValue();
    while (rr.next(key, value)) {
      readCount++;
    }
    rr.close();
    System.out.println("The " + i + "th split read " + (readCount - previousReadCount));
  }
}
OutputFormat<?, ?> outFormat = new OrcOutputFormat();
RecordWriter writer = outFormat.getRecordWriter(fs, conf, testFilePath.toString(), Reporter.NULL);
// (writes of the test rows, writer.close(), and the OrcSerde setup elided in this excerpt)
inspector = (StructObjectInspector) serde.getObjectInspector();
InputFormat<?, ?> in = new OrcInputFormat();
FileInputFormat.setInputPaths(conf, testFilePath.toString());
InputSplit[] splits = in.getSplits(conf, 1);
assertEquals(1, splits.length);
ColumnProjectionUtils.appendReadColumns(conf, Collections.singletonList(1));
conf.set("columns", "z,r");
conf.set("columns.types", "int:struct<x:int,y:int>");
org.apache.hadoop.mapred.RecordReader reader = in.getRecordReader(splits[0], conf, Reporter.NULL);
Object key = reader.createKey();
Object value = reader.createValue();
int rowNum = 0;
List<? extends StructField> fields = inspector.getAllStructFieldRefs();
IntObjectInspector intInspector = (IntObjectInspector) fields.get(0).getFieldObjectInspector();
while (reader.next(key, value)) {
  // column 0 (z) is not in the read-column list, so it comes back null
  assertEquals(null, inspector.getStructFieldData(value, fields.get(0)));
  Object sub = inspector.getStructFieldData(value, fields.get(1));
  // (assertions on the nested struct fields elided in this excerpt)
  rowNum++;
}
reader.close();
FileSystem fs = dataDir1.getFileSystem(job);
int symbolLinkedFileSize = 0;
symbolLinkedFileSize += fs.getFileStatus(dir1_file1).getLen();

Path dir1_file2 = new Path(dataDir1, "file2");
writeTextFile(dir1_file2, "dir1_file2_line1\n" + "dir2_file2_line2\n");
// (creation of dataDir2 and dir2_file2 elided in this excerpt)
symbolLinkedFileSize += fs.getFileStatus(dir2_file2).getLen();

assertEquals(0, cs.getDirectoryCount());
FileInputFormat.setInputPaths(job, symlinkDir);
InputSplit[] splits = inputFormat.getSplits(job, 2);
for (InputSplit split : splits) {
  RecordReader<LongWritable, Text> reader = inputFormat.getRecordReader(split, job, reporter);
  LongWritable key = reader.createKey();
  Text value = reader.createValue();
  while (reader.next(key, value)) {
    received.add(value.toString());
  }
  reader.close();
}
String fileName = ((FileSplit) inputSplits[currentSplitIndex]).getPath().toUri().getPath();
FileStatus fileStatus = hdfs.getFileStatus(new Path(fileName));
reader.close();
reader = getRecordReader(currentSplitIndex);
return true;
JobConf conf = new JobConf();
FileInputFormat.addInputPath(conf, new Path(path));
InputSplit[] splits = informat.getSplits(conf, 10000);
// verify that splitting actually happens, because the test files are deliberately large
assertTrue(splits.length > 3);
for (InputSplit split : splits) {
  RecordReader<Text, BytesWritable> rr = informat.getRecordReader(split, conf, Reporter.NULL);
  Text t = new Text();
  BytesWritable b = new BytesWritable();
  while (rr.next(t, b)) {
    results.put(t.toString(), new String(Utils.getBytes(b)));
  }
  rr.close();
}
@Test
public void testVectorizationWithAcid() throws Exception {
  StructObjectInspector inspector = new BigRowInspector();
  JobConf conf = createMockExecutionEnvironment(workDir, new Path("mock:///"),
      "vectorizationAcid", inspector, true, 1);
  conf.set(ValidTxnList.VALID_TXNS_KEY,
      new ValidReadTxnList(new long[0], new BitSet(), 1000, Long.MAX_VALUE).writeToString());

  // write an ACID base file into the mock partition directory
  Path partDir = new Path(conf.get("mapred.input.dir"));
  OrcRecordUpdater writer = new OrcRecordUpdater(partDir,
      new AcidOutputFormat.Options(conf).maximumWriteId(10)
      /* remaining writer options and the row inserts are elided in this excerpt */);
  Path path = new Path("mock:/vectorizationAcid/p=0/base_0000010/bucket_00000");
  setBlocks(path, conf, new MockBlock("host0", "host1"));

  // (getSplits call elided in this excerpt)
  assertEquals(1, splits.length);

  conf.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS, BigRow.getColumnNamesProperty());

  // (record reader creation elided in this excerpt)
  NullWritable key = reader.createKey();
  VectorizedRowBatch value = reader.createValue();
  assertEquals(true, reader.next(key, value));
  assertEquals(100, value.count());
  LongColumnVector booleanColumn = (LongColumnVector) value.cols[0];
  // (per-column assertions, including the timestampColumn checks, elided in this excerpt)
  assertEquals(false, reader.next(key, value));
}
private void verifyInputFormatForSequenceFile() {
  try {
    JobConf conf = new JobConf();
    String TMP_DIR = System.getProperty("test.build.data", "/tmp");
    Path filename = new Path("file:///" + TMP_DIR + "/tmpSeqFile");
    SequenceFile.Writer sfw = SequenceFile.createWriter(FileSystem.getLocal(conf), conf,
        filename, ChukwaArchiveKey.class, ChunkImpl.class,
        SequenceFile.CompressionType.NONE, Reporter.NULL);
    // (writing of the test chunks to sfw and sfw.close() elided in this excerpt)
    long len = FileSystem.getLocal(conf).getFileStatus(filename).getLen();
    InputSplit split = new FileSplit(filename, 0, len, (String[]) null);
    ChukwaInputFormat in = new ChukwaInputFormat();
    RecordReader<LongWritable, Text> r = in.getRecordReader(split, conf, Reporter.NULL);
    LongWritable l = r.createKey();
    Text line = r.createValue();
    for (int i = 0; i < lines.length * 2; ++i) {
      boolean succeeded = r.next(l, line);
      assertTrue(succeeded);
      assertEquals(i, l.get());
      assertEquals(lines[i % lines.length], line.toString());
      System.out.println("read line: " + l.get() + " " + line);
    }
    boolean succeeded = r.next(l, line);
    assertFalse(succeeded);
  } catch (IOException e) {
    fail("IOException while reading the test sequence file: " + e);
  }
}
// (reflection ObjectInspector creation elided in this excerpt; it uses
//  ObjectInspectorFactory.ObjectInspectorOptions.JAVA)
JobConf conf = createMockExecutionEnvironment(workDir, new Path("mock:///"),
    "vectorBuckets", inspector, true, 1);

Path path = new Path(conf.get("mapred.input.dir") + "/0_0");
Writer writer = OrcFile.createWriter(path,
    OrcFile.writerOptions(conf).inspector(inspector)
    /* remaining writer options, row writes, and writer.close() elided in this excerpt */);

conf.setInt(hive_metastoreConstants.BUCKET_COUNT, 3);
HiveInputFormat<?, ?> inputFormat = new HiveInputFormat<WritableComparable, Writable>();
// (getSplits and getRecordReader calls elided in this excerpt)
NullWritable key = reader.createKey();
VectorizedRowBatch value = reader.createValue();
assertEquals(true, reader.next(key, value));
assertEquals(10, value.count());
LongColumnVector col0 = (LongColumnVector) value.cols[0];
for (int i = 0; i < 10; i++) {
  assertEquals("checking " + i, i, col0.vector[i]);
}
assertEquals(false, reader.next(key, value));
TeraInputFormat inFormat = new TeraInputFormat();
TextSampler sampler = new TextSampler();
Text key = new Text();
Text value = new Text();
int partitions = conf.getNumReduceTasks();
long sampleSize = conf.getLong(SAMPLE_SIZE, 100000);
InputSplit[] splits = inFormat.getSplits(conf, conf.getNumMapTasks());
int samples = Math.min(10, splits.length);
long recordsPerSample = sampleSize / samples;
int sampleStep = splits.length / samples;
long records = 0;
// take samples from evenly spaced splits across the input
for (int i = 0; i < samples; ++i) {
  RecordReader<Text, Text> reader =
      inFormat.getRecordReader(splits[sampleStep * i], conf, null);
  while (reader.next(key, value)) {
    sampler.addKey(key);
    records += 1;
    if ((i + 1) * recordsPerSample <= records) {
      break;
    }
  }
}
FileSystem outFs = partFile.getFileSystem(conf);
if (outFs.exists(partFile)) {
  outFs.delete(partFile, false);
}
System.out.println("Files found: "); for (AcidUtils.ParsedDelta pd : current) { System.out.println(pd.getPath().toString()); JobConf job = new JobConf(); job.set("mapred.input.dir", partitionPath.toString()); job.set(BUCKET_COUNT, Integer.toString(buckets)); job.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS, "id,msg"); job.set(IOConstants.SCHEMA_EVOLUTION_COLUMNS_TYPES, "bigint:string"); job.set(ValidWriteIdList.VALID_WRITEIDS_KEY, writeIds.toString()); job.set(ValidTxnList.VALID_TXNS_KEY, conf.get(ValidTxnList.VALID_TXNS_KEY)); InputSplit[] splits = inf.getSplits(job, buckets); Assert.assertEquals(numExpectedFiles, splits.length); org.apache.hadoop.mapred.RecordReader<NullWritable, OrcStruct> rr = inf.getRecordReader(splits[0], job, Reporter.NULL); NullWritable key = rr.createKey(); OrcStruct value = rr.createValue(); for (String record : records) { Assert.assertEquals(true, rr.next(key, value)); Assert.assertEquals(record, value.toString()); Assert.assertEquals(false, rr.next(key, value));
// (enclosing method signature truncated in this excerpt; it declares `throws Exception`)
JobConf configuration = new JobConf(new Configuration(false));
configuration.set(READ_COLUMN_IDS_CONF_STR, "0");
configuration.setBoolean(READ_ALL_COLUMNS, false);
RecordReader<K, V> recordReader = inputFormat.getRecordReader(
    new FileSplit(new Path(tempFile.getFile().getAbsolutePath()), 0,
        tempFile.getFile().length(), (String[]) null),
    configuration, NULL);
K key = recordReader.createKey();
V value = recordReader.createValue();
while (recordReader.next(key, value)) {
  Object expectedValue = iterator.next();
  // (per-record assertions against expectedValue elided in this excerpt)
}