/**
 * Builds a mapreduce-API split that mirrors the given mapred-API {@link OrcSplit}.
 *
 * @param inner the split whose location, range, ACID flags, cached footer and
 *              delta list are copied into this instance
 * @throws IOException if the superclass constructor fails while handling split metadata
 */
public OrcNewSplit(OrcSplit inner) throws IOException {
  super(inner.getPath(), inner.getStart(), inner.getLength(), inner.getLocations());
  // Copy the ACID bookkeeping flags first, then the cached footer and deltas,
  // so readers built from this split behave exactly like ones built from inner.
  this.hasFooter = inner.hasFooter();
  this.isOriginal = inner.isOriginal();
  this.hasBase = inner.hasBase();
  this.orcTail = inner.getOrcTail();
  this.deltas.addAll(inner.getDeltas());
}
@Override public List<OrcSplit> getSplits() throws IOException { List<OrcSplit> splits = Lists.newArrayList(); // When split-update is enabled, we do not need to account for buckets that aren't covered. // This is a huge performance benefit of split-update. And the reason why we are able to // do so is because the 'deltas' here are actually only the delete_deltas. All the insert_deltas // with valid user payload data has already been considered as base for the covered buckets. // Hence, the uncovered buckets do not have any relevant data and we can just ignore them. if (acidOperationalProperties != null && acidOperationalProperties.isSplitUpdate()) { return splits; // return an empty list. } // Generate a split for any buckets that weren't covered. // This happens in the case where a bucket just has deltas and no // base. if (!deltas.isEmpty()) { for (int b = 0; b < numBuckets; ++b) { if (!covered[b]) { splits.add(new OrcSplit(dir, null, b, 0, new String[0], null, false, false, deltas, -1, -1)); } } } return splits; }
/**
 * Opens an ORC {@link Reader} for the given split's base file.
 *
 * @param conf configuration used to build the reader options
 * @param orcSplit the split to open
 * @return a reader over the split's base file, or {@code null} when the split
 *         has no base (delta-only split)
 * @throws IOException if the ORC file cannot be opened
 */
static Reader createOrcReaderForSplit(Configuration conf, OrcSplit orcSplit) throws IOException {
  if (!orcSplit.hasBase()) {
    // Delta-only split: there is no base ORC file to open.
    return null;
  }
  OrcFile.ReaderOptions opts = OrcFile.readerOptions(conf);
  // Bound reads by the file length recorded when the split was generated.
  opts.maxLength(orcSplit.getFileLength());
  if (orcSplit.hasFooter()) {
    // Reuse the footer serialized into the split to avoid re-reading the tail.
    opts.orcTail(orcSplit.getOrcTail());
  }
  return OrcFile.createReader(orcSplit.getPath(), opts);
}
static Path[] getDeleteDeltaDirsFromSplit(OrcSplit orcSplit) throws IOException { Path path = orcSplit.getPath(); Path root; if (orcSplit.hasBase()) { if (orcSplit.isOriginal()) { root = orcSplit.getRootDir(); } else { root = path.getParent().getParent();//todo: why not just use getRootDir()? assert root.equals(orcSplit.getRootDir()) : "root mismatch: baseDir=" + orcSplit.getRootDir() + " path.p.p=" + root; } } else { throw new IllegalStateException("Split w/o base w/Acid 2.0??: " + path); } return AcidUtils.deserializeDeleteDeltas(root, orcSplit.getDeltas()); }
final Path path = split.getPath(); if (split.hasBase()) { if (split.isOriginal()) { root = path.getParent(); } else { AcidUtils.deserializeDeleteDeltas(root, split.getDeltas()) : AcidUtils.deserializeDeltas(root, split.getDeltas()); final Configuration conf = options.getConfiguration(); final int bucket = OrcInputFormat.getBucketForSplit(conf, split); final Reader.Options readOptions = OrcInputFormat.createOptionsForReader(conf); readOptions.range(split.getStart(), split.getLength()); new ValidReadTxnList(txnString); final OrcRawRecordMerger records = new OrcRawRecordMerger(conf, true, reader, split.isOriginal(), bucket, validTxnList, readOptions, deltas); return new RowReader<OrcStruct>() {
= AcidUtils.getAcidOperationalProperties(options.getConfiguration()); if(!acidOperationalProperties.isSplitUpdate()) { throw new IllegalStateException("Expected SpliUpdate table: " + split.getPath()); mergerOptions.rootPath(split.getRootDir()); mergerOptions.bucketPath(split.getPath()); final int bucket; if (split.hasBase()) { AcidOutputFormat.Options acidIOOptions = AcidUtils.parseBaseOrDeltaBucketFilename(split.getPath(), conf); if(acidIOOptions.getBucketId() < 0) { LOG.warn("Can't determine bucket ID for " + split.getPath() + "; ignoring"); if(split.isOriginal()) { mergerOptions.copyIndex(acidIOOptions.getCopyNumber()).bucketPath(split.getPath()); bucket = (int) split.getStart(); assert false : "We should never have a split w/o base in acid 2.0 for full acid: " + split.getPath(); readOptions.range(split.getStart(), split.getLength()); LOG.debug("getReader:: Read ValidWriteIdList: " + validWriteIdList.toString() + " isTransactionalTable: " + HiveConf.getBoolVar(conf, ConfVars.HIVE_TRANSACTIONAL_TABLE_SCAN)); LOG.debug("Creating merger for {} and {}", split.getPath(), Arrays.toString(deltas)); new OrcRawRecordMerger(conf, true, reader, split.isOriginal(), bucket, validWriteIdList, readOptions, deltas, mergerOptions); return new RowReader<OrcStruct>() {
Reader reader = OrcFile.createReader(orcSplit.getPath(), OrcFile.readerOptions(conf)); if(orcSplit.isOriginal()) { final long splitStart = orcSplit.getStart(); final long splitEnd = splitStart + orcSplit.getLength(); int firstStripeIndex = -1; int lastStripeIndex = -1;
/**
 * Decides whether the LLAP IO elevator can serve reads for this ACID split.
 *
 * <p>{@link VectorizedOrcAcidRowBatchReader} is always used for vectorized reads
 * of acid tables, but LLAP IO cannot (currently) supply
 * {@link RecordReader#getRowNumber()}, which is required to synthesize ROW__IDs
 * for "original" files.
 *
 * @param split the split under consideration
 * @param hasDeletes whether any deletes apply to this split
 * @param conf configuration used to build the row-batch context
 * @return true when LLAP IO can be used for this split
 * todo: HIVE-17944
 */
static boolean canUseLlapForAcid(OrcSplit split, boolean hasDeletes, Configuration conf) {
  boolean original = split.isOriginal();
  if (!original) {
    // Non-original files do not need synthetic ROW__IDs, so LLAP is always fine.
    return true;
  }
  VectorizedRowBatchCtx rbCtx = Utilities.getVectorizedRowBatchCtx(conf);
  if (rbCtx == null) {
    throw new IllegalStateException("Could not create VectorizedRowBatchCtx for " + split.getPath());
  }
  // LLAP only works when we will NOT have to manufacture ROW__IDs for this split.
  boolean needsSyntheticIds = needSyntheticRowIds(original, hasDeletes, areRowIdsProjected(rbCtx));
  return !needsSyntheticIds;
}
reporter.setStatus(orcSplit.toString()); readerOptions = OrcInputFormat.createOptionsForReader(conf); this.offset = orcSplit.getStart(); this.length = orcSplit.getLength(); + ":" + orcSplit); this.syntheticProps = orcSplit.getSyntheticAcidProps(); isOriginal = orcSplit.isOriginal(); if (isOriginal) { recordIdColumnVector = new StructColumnVector(VectorizedRowBatch.DEFAULT_SIZE, rootPath = orcSplit.getRootDir(); Path parent = orcSplit.getPath().getParent(); while (parent != null && !rootPath.equals(parent)) { if (parent.getName().startsWith(AcidUtils.BASE_PREFIX)) {
new ArrayList<AcidInputFormat.DeltaMetaData>(), true, null, null), null, true, true); OrcSplit result = splitter.createSplit(0, 200, null); assertEquals(0, result.getStart()); assertEquals(200, result.getLength()); assertEquals("mock:/a/file", result.getPath().toString()); String[] locs = result.getLocations(); assertEquals(3, locs.length); assertEquals("host1-1", locs[0]); assertEquals("host1-3", locs[2]); result = splitter.createSplit(500, 600, null); locs = result.getLocations(); assertEquals(3, locs.length); assertEquals("host2-1", locs[0]); assertEquals("host2-3", locs[2]); result = splitter.createSplit(0, 2500, null); locs = result.getLocations(); assertEquals(1, locs.length); assertEquals("host0", locs[0]);
OrcSplit split = (OrcSplit) splits[i]; Reader.Options orcReaderOptions = new Reader.Options(); orcReaderOptions.range(split.getStart(), split.getLength()); OrcFile.ReaderOptions qlReaderOptions = OrcFile.readerOptions(conf).maxLength(split.getFileLength()); Reader reader = OrcFile.createReader(split.getPath(), qlReaderOptions); RecordReader recordReader = reader.rowsOptions(orcReaderOptions); for(int j = 0; recordReader.hasNext(); j++) {
/**
 * Creates a vectorized ORC record reader for the given split.
 *
 * <p>When the split is an {@link OrcSplit}, its cached footer (if present) and
 * recorded file length are fed into the reader options to avoid re-reading the
 * file tail and to bound reads.
 *
 * @param inputSplit the split to read; expected to be a {@link FileSplit}
 * @param conf job configuration
 * @param reporter progress reporter; receives the split description as status
 * @return a {@link VectorizedOrcRecordReader} over the split
 * @throws IOException if the ORC file cannot be opened
 */
@Override
public RecordReader<NullWritable, VectorizedRowBatch> getRecordReader(
    InputSplit inputSplit, JobConf conf, Reporter reporter) throws IOException {
  FileSplit fileSplit = (FileSplit) inputSplit;
  reporter.setStatus(fileSplit.toString());
  OrcFile.ReaderOptions readerOpts = OrcFile.readerOptions(conf);
  if (fileSplit instanceof OrcSplit) {
    OrcSplit orcSplit = (OrcSplit) fileSplit;
    if (orcSplit.hasFooter()) {
      // Footer was serialized into the split; reuse it instead of re-reading.
      readerOpts.orcTail(orcSplit.getOrcTail());
    }
    readerOpts.maxLength(orcSplit.getFileLength());
  }
  Reader reader = OrcFile.createReader(fileSplit.getPath(), readerOpts);
  return new VectorizedOrcRecordReader(reader, conf, fileSplit);
}
reporter.setStatus(orcSplit.toString()); Reader reader = OrcInputFormat.createOrcReaderForSplit(conf, orcSplit); Reader.Options readerOptions = OrcInputFormat.createOptionsForReader(conf); readerOptions = OrcRawRecordMerger.createEventOptions(readerOptions); this.offset = orcSplit.getStart(); this.length = orcSplit.getLength();
OrcSplit result = results.get(0); assertEquals(3, results.size()); assertEquals(3, result.getStart()); assertEquals(400, result.getLength()); assertEquals(167468, result.getProjectedColumnsUncompressedSize()); result = results.get(1); assertEquals(403, result.getStart()); assertEquals(400, result.getLength()); assertEquals(167468, result.getProjectedColumnsUncompressedSize()); result = results.get(2); assertEquals(803, result.getStart()); assertEquals(100, result.getLength()); assertEquals(41867, result.getProjectedColumnsUncompressedSize()); for (int i = 0; i < stripeSizes.length; ++i) { assertEquals("checking stripe " + i + " size", stripeSizes[i], results.get(i).getLength()); if (i == stripeSizes.length - 1) { assertEquals(41867, results.get(i).getProjectedColumnsUncompressedSize()); } else { assertEquals(83734, results.get(i).getProjectedColumnsUncompressedSize()); assertEquals(1, results.size()); result = results.get(0); assertEquals(3, result.getStart()); assertEquals(900, result.getLength()); assertEquals(376804, result.getProjectedColumnsUncompressedSize());
if (split instanceof OrcSplit) { assertTrue("Footer serialize test for ACID reader, hasFooter is expected in" + " orc splits.", ((OrcSplit) split).hasFooter());
/**
 * Returns the file path for the given split, rejecting delta-only ORC splits.
 *
 * @param inputSplit the split whose path is wanted
 * @param conf job configuration (unused here, kept for interface parity)
 * @return the split's path
 * @throws IOException if the split has no base and two or more deltas, since a
 *         valid StructTypeInfo cannot be read from a delta-only file
 */
private Path getSplitPath(FileSplit inputSplit, JobConf conf) throws IOException {
  Path path = inputSplit.getPath();
  if (inputSplit instanceof OrcSplit) {
    OrcSplit orcSplit = (OrcSplit) inputSplit;
    // No base plus multiple deltas means no usable schema in this file.
    boolean deltaOnly = !orcSplit.hasBase() && orcSplit.getDeltas().size() >= 2;
    if (deltaOnly) {
      throw new IOException("Cannot read valid StructTypeInfo from delta only file: " + path);
    }
  }
  LOG.debug("Input split path: {}", path);
  return path;
}
if (split.isOriginal() && split.getDeltas().isEmpty()) { if (vectorMode) { return createVectorizedReader(inputSplit, conf, reporter);