public ByteBuffer getMetadataToCache(FileSystem fs, Path path, ByteBuffer[] addedVals)
    throws IOException {
  // For now, there's nothing special to return in addedVals. Just return the footer.
  return OrcFile.createReader(fs, path).getSerializedFileFooter();
}
static void checkFile(Configuration conf, Path inputPath) throws IOException {
  FileSystem fs = inputPath.getFileSystem(conf);
  Reader reader = OrcFile.createReader(fs, inputPath);
  if (OrcInputFormat.isOriginal(reader)) {
    System.out.println(inputPath + " is not an acid file");
    return;
  }
  boolean validIndex = isAcidKeyIndexValid(reader);
  System.out.println("Checking " + inputPath + " - acid key index is "
      + (validIndex ? "valid" : "invalid"));
}
@Override
public boolean validateInput(FileSystem fs, HiveConf conf, List<FileStatus> files)
    throws IOException {
  if (files.size() <= 0) {
    return false;
  }
  for (FileStatus file : files) {
    try {
      OrcFile.createReader(file.getPath(), OrcFile.readerOptions(conf).filesystem(fs));
    } catch (IOException e) {
      return false;
    }
  }
  return true;
}
private static boolean needsCompaction(FileStatus bucket, FileSystem fs) throws IOException {
  //create reader, look at footer
  //no need to check side file since it can only be in a streaming ingest delta
  Reader orcReader = OrcFile.createReader(bucket.getPath(),
      OrcFile.readerOptions(fs.getConf()).filesystem(fs));
  if (orcReader.hasMetadataValue(ACID_STATS)) {
    try {
      ByteBuffer val = orcReader.getMetadataValue(ACID_STATS).duplicate();
      String acidStats = utf8Decoder.decode(val).toString();
      String[] parts = acidStats.split(",");
      long updates = Long.parseLong(parts[1]);
      long deletes = Long.parseLong(parts[2]);
      return deletes > 0 || updates > 0;
    } catch (CharacterCodingException e) {
      throw new IllegalArgumentException("Bad string encoding for " + ACID_STATS, e);
    }
  } else {
    throw new IllegalStateException("AcidStats missing in " + bucket.getPath());
  }
}
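// Illustrative sketch (not part of the snippets above): the ACID_STATS metadata value read in
// needsCompaction() is assumed to decode to a comma-separated "inserts,updates,deletes" string,
// so the compaction check reduces to parsing fields 1 and 2. The class name and sample value
// below are hypothetical.
public class AcidStatsParseSketch {
  public static void main(String[] args) {
    String acidStats = "100,2,0"; // hypothetical decoded ACID_STATS value
    String[] parts = acidStats.split(",");
    long updates = Long.parseLong(parts[1]);
    long deletes = Long.parseLong(parts[2]);
    System.out.println("needs compaction: " + (deletes > 0 || updates > 0));
  }
}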
/**
 * Finds the next file of the logical bucket.
 * @return {@code null} if there are no more files
 */
private Reader advanceToNextFile() throws IOException {
  while (nextFileIndex < originalFiles.size()) {
    int bucketIdFromPath = AcidUtils.parseBucketId(
        originalFiles.get(nextFileIndex).getFileStatus().getPath());
    if (bucketIdFromPath == bucketId) {
      break;
    }
    //not the bucket we care about here
    nextFileIndex++;
  }
  if (originalFiles.size() <= nextFileIndex) {
    return null; //no more files for current bucket
  }
  return OrcFile.createReader(originalFiles.get(nextFileIndex++).getFileStatus().getPath(),
      OrcFile.readerOptions(conf));
}
public static boolean isRawFormatFile(Path dataFile, FileSystem fs) throws IOException {
  try {
    Reader reader = OrcFile.createReader(dataFile, OrcFile.readerOptions(fs.getConf()));
    /* acid file would have schema like <op, owid, writerId, rowid, cwid, <f1, ... fn>>
       so could check it this way once/if OrcRecordUpdater.ACID_KEY_INDEX_NAME is removed
       TypeDescription schema = reader.getSchema();
       List<String> columns = schema.getFieldNames(); */
    return OrcInputFormat.isOriginal(reader);
  } catch (FileFormatException ex) {
    //We may be parsing a delta for Insert-only table which may not even be an ORC file so
    //cannot have ROW_IDs in it.
    LOG.debug("isRawFormat() called on " + dataFile + " which is not an ORC file: "
        + ex.getMessage());
    return true;
  }
}
/**
 * This is smart enough to handle streaming ingest where there could be a
 * {@link OrcAcidUtils#DELTA_SIDE_FILE_SUFFIX} side file.
 * @param dataFile - ORC acid data file
 * @return version property from file if there,
 *         {@link #ORC_ACID_VERSION_DEFAULT} otherwise
 */
@VisibleForTesting
public static int getAcidVersionFromDataFile(Path dataFile, FileSystem fs) throws IOException {
  FileStatus fileStatus = fs.getFileStatus(dataFile);
  Reader orcReader = OrcFile.createReader(dataFile,
      OrcFile.readerOptions(fs.getConf())
          .filesystem(fs)
          //make sure to check for side file in case streaming ingest died
          .maxLength(getLogicalLength(fs, fileStatus)));
  if (orcReader.hasMetadataValue(ACID_VERSION_KEY)) {
    char[] versionChar = UTF8.decode(orcReader.getMetadataValue(ACID_VERSION_KEY)).array();
    String version = new String(versionChar);
    return Integer.valueOf(version);
  }
  return ORC_ACID_VERSION_DEFAULT;
}
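// Illustrative caller sketch for getAcidVersionFromDataFile() above; the bucket path is a
// made-up example and the helper is assumed to be in scope of the same class.
Configuration conf = new Configuration();
Path dataFile = new Path("/warehouse/t/delta_0000001_0000001/bucket_00000"); // hypothetical path
FileSystem fs = dataFile.getFileSystem(conf);
int acidVersion = getAcidVersionFromDataFile(dataFile, fs);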
public OrcFileStripeMergeRecordReader(Configuration conf, FileSplit split) throws IOException {
  path = split.getPath();
  start = split.getStart();
  end = start + split.getLength();
  FileSystem fs = path.getFileSystem(conf);
  this.reader = OrcFile.createReader(path, OrcFile.readerOptions(conf).filesystem(fs));
  this.iter = reader.getStripes().iterator();
  this.stripeIdx = 0;
  this.stripeStatistics = ((ReaderImpl) reader).getOrcProtoStripeStatistics();
}
private static void assertFileContentsOrcHive(Type type, TempFile tempFile,
    Iterable<?> expectedValues) throws Exception {
  JobConf configuration = new JobConf(new Configuration(false));
  configuration.set(READ_COLUMN_IDS_CONF_STR, "0");
  configuration.setBoolean(READ_ALL_COLUMNS, false);
  Reader reader = OrcFile.createReader(
      new Path(tempFile.getFile().getAbsolutePath()),
      new ReaderOptions(configuration));
  org.apache.hadoop.hive.ql.io.orc.RecordReader recordReader = reader.rows();
  StructObjectInspector rowInspector = (StructObjectInspector) reader.getObjectInspector();
  StructField field = rowInspector.getStructFieldRef("test");
  Iterator<?> iterator = expectedValues.iterator();
  Object rowData = null;
  while (recordReader.hasNext()) {
    rowData = recordReader.next(rowData);
    Object expectedValue = iterator.next();
    Object actualValue = rowInspector.getStructFieldData(rowData, field);
    actualValue = decodeRecordReaderValue(type, actualValue);
    assertColumnValueEquals(type, actualValue, expectedValue);
  }
  assertFalse(iterator.hasNext());
}
private ArrayList<SampleRec> dumpBucket(Path orcFile) throws IOException {
  org.apache.hadoop.fs.FileSystem fs =
      org.apache.hadoop.fs.FileSystem.getLocal(new Configuration());
  Reader reader = OrcFile.createReader(orcFile,
      OrcFile.readerOptions(conf).filesystem(fs));
  RecordReader rows = reader.rows();
  StructObjectInspector inspector = (StructObjectInspector) reader.getObjectInspector();
  System.out.format("Found Bucket File : %s \n", orcFile.getName());
  ArrayList<SampleRec> result = new ArrayList<SampleRec>();
  while (rows.hasNext()) {
    Object row = rows.next(null);
    SampleRec rec = (SampleRec) deserializeDeltaFileRow(row, inspector)[5];
    result.add(rec);
  }
  return result;
}
static Reader createOrcReaderForSplit(Configuration conf, OrcSplit orcSplit) throws IOException {
  Path path = orcSplit.getPath();
  Reader reader;
  if (orcSplit.hasBase()) {
    OrcFile.ReaderOptions readerOptions = OrcFile.readerOptions(conf);
    readerOptions.maxLength(orcSplit.getFileLength());
    if (orcSplit.hasFooter()) {
      readerOptions.orcTail(orcSplit.getOrcTail());
    }
    reader = OrcFile.createReader(path, readerOptions);
  } else {
    reader = null;
  }
  return reader;
}
@Override
public RecordReader<NullWritable, OrcStruct> createRecordReader(InputSplit inputSplit,
    TaskAttemptContext context) throws IOException, InterruptedException {
  FileSplit fileSplit = (FileSplit) inputSplit;
  Path path = fileSplit.getPath();
  Configuration conf = ShimLoader.getHadoopShims().getConfiguration(context);
  return new OrcRecordReader(OrcFile.createReader(path, OrcFile.readerOptions(conf)),
      ShimLoader.getHadoopShims().getConfiguration(context),
      fileSplit.getStart(), fileSplit.getLength());
}
@Override
public RecordReader<NullWritable, VectorizedRowBatch> getRecordReader(InputSplit inputSplit,
    JobConf conf, Reporter reporter) throws IOException {
  FileSplit fSplit = (FileSplit) inputSplit;
  reporter.setStatus(fSplit.toString());
  Path path = fSplit.getPath();
  OrcFile.ReaderOptions opts = OrcFile.readerOptions(conf);
  if (fSplit instanceof OrcSplit) {
    OrcSplit orcSplit = (OrcSplit) fSplit;
    if (orcSplit.hasFooter()) {
      opts.orcTail(orcSplit.getOrcTail());
    }
    opts.maxLength(orcSplit.getFileLength());
  }
  Reader reader = OrcFile.createReader(path, opts);
  return new VectorizedOrcRecordReader(reader, conf, fSplit);
}
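// Standalone sketch of the reader-opening pattern shared by the snippets above: build
// OrcFile.ReaderOptions from the Configuration, bind the FileSystem, then call
// OrcFile.createReader(). The class name and the command-line argument are illustrative only.
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.io.orc.OrcFile;
import org.apache.hadoop.hive.ql.io.orc.Reader;

public class OrcReaderOpenSketch {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Path path = new Path(args[0]); // path to an ORC file
    FileSystem fs = path.getFileSystem(conf);
    Reader reader = OrcFile.createReader(path,
        OrcFile.readerOptions(conf).filesystem(fs));
    System.out.println("rows: " + reader.getNumberOfRows());
  }
}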