/**
 * Builds a mapreduce-API split that mirrors the given mapred-API {@link OrcSplit}.
 *
 * @param inner the split whose location, range, ACID flags, cached footer and
 *              delta list are copied into this instance
 * @throws IOException if the superclass constructor fails while handling split metadata
 */
public OrcNewSplit(OrcSplit inner) throws IOException {
  super(inner.getPath(), inner.getStart(), inner.getLength(), inner.getLocations());
  // Copy the ACID bookkeeping flags first, then the cached footer and deltas,
  // so readers built from this split behave exactly like ones built from inner.
  this.hasFooter = inner.hasFooter();
  this.isOriginal = inner.isOriginal();
  this.hasBase = inner.hasBase();
  this.orcTail = inner.getOrcTail();
  this.deltas.addAll(inner.getDeltas());
}
@Override public List<OrcSplit> getSplits() throws IOException { List<OrcSplit> splits = Lists.newArrayList(); // When split-update is enabled, we do not need to account for buckets that aren't covered. // This is a huge performance benefit of split-update. And the reason why we are able to // do so is because the 'deltas' here are actually only the delete_deltas. All the insert_deltas // with valid user payload data has already been considered as base for the covered buckets. // Hence, the uncovered buckets do not have any relevant data and we can just ignore them. if (acidOperationalProperties != null && acidOperationalProperties.isSplitUpdate()) { return splits; // return an empty list. } // Generate a split for any buckets that weren't covered. // This happens in the case where a bucket just has deltas and no // base. if (!deltas.isEmpty()) { for (int b = 0; b < numBuckets; ++b) { if (!covered[b]) { splits.add(new OrcSplit(dir, null, b, 0, new String[0], null, false, false, deltas, -1, -1)); } } } return splits; }
/**
 * Opens an ORC {@link Reader} for the given split's base file.
 *
 * @param conf configuration used to build the reader options
 * @param orcSplit the split to open
 * @return a reader over the split's base file, or {@code null} when the split
 *         has no base (delta-only split)
 * @throws IOException if the ORC file cannot be opened
 */
static Reader createOrcReaderForSplit(Configuration conf, OrcSplit orcSplit) throws IOException {
  if (!orcSplit.hasBase()) {
    // Delta-only split: there is no base ORC file to open.
    return null;
  }
  OrcFile.ReaderOptions opts = OrcFile.readerOptions(conf);
  // Bound reads by the file length recorded when the split was generated.
  opts.maxLength(orcSplit.getFileLength());
  if (orcSplit.hasFooter()) {
    // Reuse the footer serialized into the split to avoid re-reading the tail.
    opts.orcTail(orcSplit.getOrcTail());
  }
  return OrcFile.createReader(orcSplit.getPath(), opts);
}
static Path[] getDeleteDeltaDirsFromSplit(OrcSplit orcSplit) throws IOException { Path path = orcSplit.getPath(); Path root; if (orcSplit.hasBase()) { if (orcSplit.isOriginal()) { root = orcSplit.getRootDir(); } else { root = path.getParent().getParent();//todo: why not just use getRootDir()? assert root.equals(orcSplit.getRootDir()) : "root mismatch: baseDir=" + orcSplit.getRootDir() + " path.p.p=" + root; } } else { throw new IllegalStateException("Split w/o base w/Acid 2.0??: " + path); } return AcidUtils.deserializeDeleteDeltas(root, orcSplit.getDeltas()); }
final Path path = split.getPath(); if (split.hasBase()) { if (split.isOriginal()) { root = path.getParent(); } else { AcidUtils.deserializeDeleteDeltas(root, split.getDeltas()) : AcidUtils.deserializeDeltas(root, split.getDeltas()); final Configuration conf = options.getConfiguration(); final int bucket = OrcInputFormat.getBucketForSplit(conf, split); final Reader.Options readOptions = OrcInputFormat.createOptionsForReader(conf); readOptions.range(split.getStart(), split.getLength()); new ValidReadTxnList(txnString); final OrcRawRecordMerger records = new OrcRawRecordMerger(conf, true, reader, split.isOriginal(), bucket, validTxnList, readOptions, deltas); return new RowReader<OrcStruct>() {
= AcidUtils.getAcidOperationalProperties(options.getConfiguration()); if(!acidOperationalProperties.isSplitUpdate()) { throw new IllegalStateException("Expected SpliUpdate table: " + split.getPath()); mergerOptions.rootPath(split.getRootDir()); mergerOptions.bucketPath(split.getPath()); final int bucket; if (split.hasBase()) { AcidOutputFormat.Options acidIOOptions = AcidUtils.parseBaseOrDeltaBucketFilename(split.getPath(), conf); if(acidIOOptions.getBucketId() < 0) { LOG.warn("Can't determine bucket ID for " + split.getPath() + "; ignoring"); if(split.isOriginal()) { mergerOptions.copyIndex(acidIOOptions.getCopyNumber()).bucketPath(split.getPath()); bucket = (int) split.getStart(); assert false : "We should never have a split w/o base in acid 2.0 for full acid: " + split.getPath(); readOptions.range(split.getStart(), split.getLength()); LOG.debug("getReader:: Read ValidWriteIdList: " + validWriteIdList.toString() + " isTransactionalTable: " + HiveConf.getBoolVar(conf, ConfVars.HIVE_TRANSACTIONAL_TABLE_SCAN)); LOG.debug("Creating merger for {} and {}", split.getPath(), Arrays.toString(deltas)); new OrcRawRecordMerger(conf, true, reader, split.isOriginal(), bucket, validWriteIdList, readOptions, deltas, mergerOptions); return new RowReader<OrcStruct>() {
Reader reader = OrcFile.createReader(orcSplit.getPath(), OrcFile.readerOptions(conf)); if(orcSplit.isOriginal()) { final long splitStart = orcSplit.getStart(); final long splitEnd = splitStart + orcSplit.getLength(); int firstStripeIndex = -1; int lastStripeIndex = -1;
/**
 * Decides whether the LLAP IO elevator can serve reads for this ACID split.
 *
 * <p>{@link VectorizedOrcAcidRowBatchReader} is always used for vectorized reads
 * of acid tables, but LLAP IO cannot (currently) supply
 * {@link RecordReader#getRowNumber()}, which is required to synthesize ROW__IDs
 * for "original" files.
 *
 * @param split the split under consideration
 * @param hasDeletes whether any deletes apply to this split
 * @param conf configuration used to build the row-batch context
 * @return true when LLAP IO can be used for this split
 * todo: HIVE-17944
 */
static boolean canUseLlapForAcid(OrcSplit split, boolean hasDeletes, Configuration conf) {
  boolean original = split.isOriginal();
  if (!original) {
    // Non-original files do not need synthetic ROW__IDs, so LLAP is always fine.
    return true;
  }
  VectorizedRowBatchCtx rbCtx = Utilities.getVectorizedRowBatchCtx(conf);
  if (rbCtx == null) {
    throw new IllegalStateException("Could not create VectorizedRowBatchCtx for " + split.getPath());
  }
  // LLAP only works when we will NOT have to manufacture ROW__IDs for this split.
  boolean needsSyntheticIds = needSyntheticRowIds(original, hasDeletes, areRowIdsProjected(rbCtx));
  return !needsSyntheticIds;
}
reporter.setStatus(orcSplit.toString()); readerOptions = OrcInputFormat.createOptionsForReader(conf); this.offset = orcSplit.getStart(); this.length = orcSplit.getLength(); + ":" + orcSplit); this.syntheticProps = orcSplit.getSyntheticAcidProps(); isOriginal = orcSplit.isOriginal(); if (isOriginal) { recordIdColumnVector = new StructColumnVector(VectorizedRowBatch.DEFAULT_SIZE, rootPath = orcSplit.getRootDir(); Path parent = orcSplit.getPath().getParent(); while (parent != null && !rootPath.equals(parent)) { if (parent.getName().startsWith(AcidUtils.BASE_PREFIX)) {
new ArrayList<AcidInputFormat.DeltaMetaData>(), true, null, null), null, true, true); OrcSplit result = splitter.createSplit(0, 200, null); assertEquals(0, result.getStart()); assertEquals(200, result.getLength()); assertEquals("mock:/a/file", result.getPath().toString()); String[] locs = result.getLocations(); assertEquals(3, locs.length); assertEquals("host1-1", locs[0]); assertEquals("host1-3", locs[2]); result = splitter.createSplit(500, 600, null); locs = result.getLocations(); assertEquals(3, locs.length); assertEquals("host2-1", locs[0]); assertEquals("host2-3", locs[2]); result = splitter.createSplit(0, 2500, null); locs = result.getLocations(); assertEquals(1, locs.length); assertEquals("host0", locs[0]);
OrcSplit split = (OrcSplit) splits[i]; Reader.Options orcReaderOptions = new Reader.Options(); orcReaderOptions.range(split.getStart(), split.getLength()); OrcFile.ReaderOptions qlReaderOptions = OrcFile.readerOptions(conf).maxLength(split.getFileLength()); Reader reader = OrcFile.createReader(split.getPath(), qlReaderOptions); RecordReader recordReader = reader.rowsOptions(orcReaderOptions); for(int j = 0; recordReader.hasNext(); j++) {
/**
 * Creates a vectorized ORC record reader for the given split.
 *
 * <p>When the split is an {@link OrcSplit}, its cached footer (if present) and
 * recorded file length are fed into the reader options to avoid re-reading the
 * file tail and to bound reads.
 *
 * @param inputSplit the split to read; expected to be a {@link FileSplit}
 * @param conf job configuration
 * @param reporter progress reporter; receives the split description as status
 * @return a {@link VectorizedOrcRecordReader} over the split
 * @throws IOException if the ORC file cannot be opened
 */
@Override
public RecordReader<NullWritable, VectorizedRowBatch> getRecordReader(
    InputSplit inputSplit, JobConf conf, Reporter reporter) throws IOException {
  FileSplit fileSplit = (FileSplit) inputSplit;
  reporter.setStatus(fileSplit.toString());
  OrcFile.ReaderOptions readerOpts = OrcFile.readerOptions(conf);
  if (fileSplit instanceof OrcSplit) {
    OrcSplit orcSplit = (OrcSplit) fileSplit;
    if (orcSplit.hasFooter()) {
      // Footer was serialized into the split; reuse it instead of re-reading.
      readerOpts.orcTail(orcSplit.getOrcTail());
    }
    readerOpts.maxLength(orcSplit.getFileLength());
  }
  Reader reader = OrcFile.createReader(fileSplit.getPath(), readerOpts);
  return new VectorizedOrcRecordReader(reader, conf, fileSplit);
}
reporter.setStatus(orcSplit.toString()); Reader reader = OrcInputFormat.createOrcReaderForSplit(conf, orcSplit); Reader.Options readerOptions = OrcInputFormat.createOptionsForReader(conf); readerOptions = OrcRawRecordMerger.createEventOptions(readerOptions); this.offset = orcSplit.getStart(); this.length = orcSplit.getLength();
OrcSplit result = results.get(0); assertEquals(3, results.size()); assertEquals(3, result.getStart()); assertEquals(400, result.getLength()); assertEquals(167468, result.getProjectedColumnsUncompressedSize()); result = results.get(1); assertEquals(403, result.getStart()); assertEquals(400, result.getLength()); assertEquals(167468, result.getProjectedColumnsUncompressedSize()); result = results.get(2); assertEquals(803, result.getStart()); assertEquals(100, result.getLength()); assertEquals(41867, result.getProjectedColumnsUncompressedSize()); for (int i = 0; i < stripeSizes.length; ++i) { assertEquals("checking stripe " + i + " size", stripeSizes[i], results.get(i).getLength()); if (i == stripeSizes.length - 1) { assertEquals(41867, results.get(i).getProjectedColumnsUncompressedSize()); } else { assertEquals(83734, results.get(i).getProjectedColumnsUncompressedSize()); assertEquals(1, results.size()); result = results.get(0); assertEquals(3, result.getStart()); assertEquals(900, result.getLength()); assertEquals(376804, result.getProjectedColumnsUncompressedSize());
if (split instanceof OrcSplit) { assertTrue("Footer serialize test for ACID reader, hasFooter is expected in" + " orc splits.", ((OrcSplit) split).hasFooter());
/**
 * Returns the file path for the given split, rejecting delta-only ORC splits.
 *
 * @param inputSplit the split whose path is wanted
 * @param conf job configuration (unused here, kept for interface parity)
 * @return the split's path
 * @throws IOException if the split has no base and two or more deltas, since a
 *         valid StructTypeInfo cannot be read from a delta-only file
 */
private Path getSplitPath(FileSplit inputSplit, JobConf conf) throws IOException {
  Path path = inputSplit.getPath();
  if (inputSplit instanceof OrcSplit) {
    OrcSplit orcSplit = (OrcSplit) inputSplit;
    // No base plus multiple deltas means no usable schema in this file.
    boolean deltaOnly = !orcSplit.hasBase() && orcSplit.getDeltas().size() >= 2;
    if (deltaOnly) {
      throw new IOException("Cannot read valid StructTypeInfo from delta only file: " + path);
    }
  }
  LOG.debug("Input split path: {}", path);
  return path;
}
if (split.isOriginal() && split.getDeltas().isEmpty()) { if (vectorMode) { return createVectorizedReader(inputSplit, conf, reporter);