.compress(reader.getCompression()) .version(reader.getFileVersion()) .rowIndexStride(reader.getRowIndexStride()) .inspector(reader.getObjectInspector()); if (reader.getCompression() != org.apache.hadoop.hive.ql.io.orc.CompressionKind.NONE) { writerOptions.bufferSize(reader.getCompressionSize()).enforceBufferSize(); List<StripeInformation> stripes = reader.getStripes(); List<StripeStatistics> stripeStats = reader.getOrcProtoStripeStatistics(); long lastRow = reader.getNumberOfRows() - 1; try (RecordReader rr = reader.rows()) { rr.seekToRow(lastRow); OrcStruct row = (OrcStruct) rr.next(null); StructObjectInspector soi = (StructObjectInspector) reader.getObjectInspector(); for (String metadataKey : reader.getMetadataKeys()) { if (!metadataKey.equals(OrcRecordUpdater.ACID_KEY_INDEX_NAME)) { writer.addUserMetadata(metadataKey, reader.getMetadataValue(metadataKey));
private static boolean needsCompaction(FileStatus bucket, FileSystem fs) throws IOException { //create reader, look at footer //no need to check side file since it can only be in a streaming ingest delta Reader orcReader = OrcFile.createReader(bucket.getPath(), OrcFile.readerOptions(fs.getConf()).filesystem(fs)); if (orcReader.hasMetadataValue(ACID_STATS)) { try { ByteBuffer val = orcReader.getMetadataValue(ACID_STATS).duplicate(); String acidStats = utf8Decoder.decode(val).toString(); String[] parts = acidStats.split(","); long updates = Long.parseLong(parts[1]); long deletes = Long.parseLong(parts[2]); return deletes > 0 || updates > 0; } catch (CharacterCodingException e) { throw new IllegalArgumentException("Bad string encoding for " + ACID_STATS, e); } } else { throw new IllegalStateException("AcidStats missing in " + bucket.getPath()); } }
if ((stripeStatistics == null || stripeStatistics.isEmpty()) && reader.getNumberOfRows() > 0) { keyWrapper.setInputPath(path); keyWrapper.setIsIncompatFile(true); keyWrapper.setCompression(reader.getCompressionKind()); keyWrapper.setCompressBufferSize(reader.getCompressionSize()); keyWrapper.setVersion(reader.getFileVersion()); keyWrapper.setRowIndexStride(reader.getRowIndexStride()); keyWrapper.setTypes(reader.getTypes()); } else { stripeIdx++;
static boolean isAcidKeyIndexValid(Reader reader) { if (reader.getNumberOfRows() == 0) { return true; } // The number of stripes should match the key index count List<StripeInformation> stripes = reader.getStripes(); RecordIdentifier[] keyIndex = OrcRecordUpdater.parseKeyIndex(reader); if (keyIndex == null) { return false; } for (int idx = 0; idx < keyIndex.length; ++idx) { if (keyIndex[idx] == null) { LOG.info("*** keyIndex[" + idx + "] is null"); return false; } } return stripes.size() == keyIndex.length; }
/**
 * Returns serde statistics refreshed from the underlying ORC file.
 * The same {@code stats} instance is updated and returned on every call.
 *
 * @return the shared stats object with current row count and raw data size
 */
@Override
public SerDeStats getStats() {
  // Re-read both counters from the file so the caller always sees fresh values.
  stats.setRowCount(file.getNumberOfRows());
  stats.setRawDataSize(file.getRawDataSize());
  return stats;
}
}
Reader reader = createReader(fileSystem, path(input)); if (reader.getNumberOfRows() < rowsToDelete.length()) { throw new IOException("File has fewer rows than deletion vector"); if (reader.getNumberOfRows() == deleteRowCount) { return new OrcFileInfo(0, 0); if (reader.getNumberOfRows() >= Integer.MAX_VALUE) { throw new IOException("File has too many rows"); int inputRowCount = toIntExact(reader.getNumberOfRows()); .compress(reader.getCompression()) .inspector(reader.getObjectInspector()); try (Closer<RecordReader, IOException> recordReader = closer(reader.rows(), RecordReader::close); Closer<Writer, IOException> writer = closer(createWriter(path(output), writerOptions), Writer::close)) { if (reader.hasMetadataValue(OrcFileMetadata.KEY)) { ByteBuffer orcFileMetadata = reader.getMetadataValue(OrcFileMetadata.KEY); writer.get().addUserMetadata(OrcFileMetadata.KEY, orcFileMetadata);
/**
 * Reads every row of the given bucket file on the local filesystem and
 * collects the payload records into a list.
 *
 * @param orcFile path to the bucket file to read
 * @return all records deserialized from the file, in row order
 * @throws IOException if the file cannot be opened or read
 */
private ArrayList<SampleRec> dumpBucket(Path orcFile) throws IOException {
  org.apache.hadoop.fs.FileSystem fs =
      org.apache.hadoop.fs.FileSystem.getLocal(new Configuration());
  Reader reader = OrcFile.createReader(orcFile,
      OrcFile.readerOptions(conf).filesystem(fs));
  StructObjectInspector inspector = (StructObjectInspector) reader
      .getObjectInspector();
  System.out.format("Found Bucket File : %s \n", orcFile.getName());
  ArrayList<SampleRec> result = new ArrayList<SampleRec>();
  RecordReader rows = reader.rows();
  try {
    while (rows.hasNext()) {
      Object row = rows.next(null);
      // Index 5 of the array produced by deserializeDeltaFileRow is cast to
      // SampleRec — presumably the row payload; confirm against that helper.
      SampleRec rec = (SampleRec) deserializeDeltaFileRow(row, inspector)[5];
      result.add(rec);
    }
  } finally {
    // Close the record reader; the original leaked it.
    rows.close();
  }
  return result;
}
Path path = new Path(filename); Reader reader = OrcFile.createReader(path, OrcFile.readerOptions(conf)); System.out.println("File Version: " + reader.getFileVersion().getName() + " with " + reader.getWriterVersion()); RecordReaderImpl rows = (RecordReaderImpl) reader.rows(); System.out.println("Rows: " + reader.getNumberOfRows()); System.out.println("Compression: " + reader.getCompression()); if (reader.getCompression() != CompressionKind.NONE) { System.out.println("Compression size: " + reader.getCompressionSize()); System.out.println("Type: " + reader.getObjectInspector().getTypeName()); System.out.println("\nStripe Statistics:"); Metadata metadata = reader.getMetadata(); for (int n = 0; n < metadata.getStripeStatistics().size(); n++) { System.out.println(" Stripe " + (n + 1) + ":"); ColumnStatistics[] stats = reader.getStatistics(); int colCount = stats.length; System.out.println("\nFile Statistics:"); for (StripeInformation stripe : reader.getStripes()) { ++stripeIx; long stripeStart = stripe.getOffset();
OrcFile.readerOptions(conf).filesystem(fs)); StructObjectInspector readerInspector = (StructObjectInspector) reader.getObjectInspector(); List<? extends StructField> fields = readerInspector.getAllStructFieldRefs(); HiveDecimalObjectInspector doi = (HiveDecimalObjectInspector) readerInspector. getStructFieldRef("dec").getFieldObjectInspector(); RecordReader rows = reader.rows(); while (rows.hasNext()) { Object row = rows.next(null); ColumnStatistics[] stats = reader.getStatistics(); assertEquals(2, stats[0].getNumberOfValues()); assertEquals(0, stats[1].getNumberOfValues());
assertEquals(false, reader.getMetadataKeys().iterator().hasNext()); assertEquals(5077, reader.getNumberOfRows()); DecimalColumnStatistics stats = (DecimalColumnStatistics) reader.getStatistics()[5]; assertEquals(71, stats.getNumberOfValues()); assertEquals(HiveDecimal.create("-5643.234"), stats.getMinimum()); int rowCount = 0; long currentOffset = -1; for(StripeInformation stripe: reader.getStripes()) { stripeCount += 1; rowCount += stripe.getNumberOfRows(); assertEquals(reader.getNumberOfRows(), rowCount); assertEquals(2, stripeCount); assertEquals(reader.getContentLength(), currentOffset); RecordReader rows = reader.rows(); assertEquals(0, rows.getRowNumber()); assertEquals(0.0, rows.getProgress(), 0.000001); row = (OrcStruct) rows.next(null); assertEquals(1, rows.getRowNumber()); inspector = reader.getObjectInspector(); assertEquals("struct<time:timestamp,union:uniontype<int,string>,decimal:decimal(38,18)>", inspector.getTypeName()); assertEquals(false, rows.hasNext()); assertEquals(1.0, rows.getProgress(), 0.00001); assertEquals(reader.getNumberOfRows(), rows.getRowNumber());
Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs)); assertEquals(4, reader.getNumberOfRows()); assertEquals(273, reader.getRawDataSize()); assertEquals(15, reader.getRawDataSizeOfColumns(Lists.newArrayList("bytes1"))); assertEquals(258, reader.getRawDataSizeOfColumns(Lists.newArrayList("string1"))); assertEquals(273, reader.getRawDataSizeOfColumns(Lists.newArrayList("bytes1", "string1"))); ColumnStatistics[] stats = reader.getStatistics(); assertEquals(4, stats[0].getNumberOfValues()); assertEquals("count: 4 hasNull: false", stats[0].toString()); (StructObjectInspector) reader.getObjectInspector(); assertEquals(ObjectInspector.Category.STRUCT, readerInspector.getCategory()); StringObjectInspector st = (StringObjectInspector) readerInspector. getStructFieldRef("string1").getFieldObjectInspector(); RecordReader rows = reader.rows(); Object row = rows.next(null); assertNotNull(row);
Reader reader = OrcFile.createReader(outputFilePath, OrcFile.readerOptions(conf).filesystem(localFs)); assertTrue(reader.getNumberOfRows() == rownum); assertEquals(reader.getCompression(), CompressionKind.ZLIB); StructObjectInspector soi = (StructObjectInspector)reader.getObjectInspector(); StructTypeInfo ti = (StructTypeInfo)TypeInfoUtils.getTypeInfoFromObjectInspector(soi); PrimitiveObjectInspector.PrimitiveCategory.STRING); RecordReader rows = reader.rows(); Object row = rows.next(null);
int rowCount = 0; long currentOffset = -1; for(StripeInformation stripe : reader.getStripes()) { stripeCount += 1; rowCount += stripe.getNumberOfRows(); assertEquals(reader.getNumberOfRows(), rowCount); assertEquals(2, stripeCount); ColumnStatistics[] stats = reader.getStatistics(); assertEquals(7500, stats[1].getNumberOfValues()); assertEquals(3750, ((BooleanColumnStatistics) stats[1]).getFalseCount()); .getObjectInspector(); assertEquals(ObjectInspector.Category.STRUCT, readerInspector.getCategory()); assertEquals("struct<boolean1:boolean,byte1:tinyint,short1:smallint," StringObjectInspector mk = (StringObjectInspector) ma .getMapKeyObjectInspector(); RecordReader rows = reader.rows(); Object row = rows.next(null); assertNotNull(row);
/**
 * Verifies that closing a writer without writing any rows produces a valid,
 * readable ORC file: zero rows, no stripes, no user metadata, NONE
 * compression, and the minimal (3-byte) content length.
 */
@Test
public void emptyFile() throws Exception {
  ObjectInspector inspector;
  // ObjectInspectorFactory caching is not thread-safe across tests.
  synchronized (TestOrcFile.class) {
    inspector = ObjectInspectorFactory.getReflectionObjectInspector
        (BigRow.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
  }
  Writer writer = OrcFile.createWriter(testFilePath,
      OrcFile.writerOptions(conf)
          .inspector(inspector)
          .stripeSize(1000)
          .compress(CompressionKind.NONE)
          .bufferSize(100));
  writer.close();
  Reader reader = OrcFile.createReader(testFilePath,
      OrcFile.readerOptions(conf).filesystem(fs));
  RecordReader rows = reader.rows();
  try {
    // An empty file must immediately report no rows available.
    assertEquals(false, rows.hasNext());
  } finally {
    // Close the record reader; the original leaked it.
    rows.close();
  }
  assertEquals(CompressionKind.NONE, reader.getCompression());
  assertEquals(0, reader.getNumberOfRows());
  assertEquals(0, reader.getCompressionSize());
  assertEquals(false, reader.getMetadataKeys().iterator().hasNext());
  // 3 bytes is the minimal ORC file: just the "ORC" postscript magic.
  assertEquals(3, reader.getContentLength());
  assertEquals(false, reader.getStripes().iterator().hasNext());
}
Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs)); assertEquals(COUNT, reader.getNumberOfRows()); RecordReader rows = reader.rows(); OrcStruct row = null; for(int i=COUNT-1; i >= 0; --i) { reader.getStripes().iterator(); long offsetOfStripe2 = 0; long offsetOfStripe4 = 0; boolean[] columns = new boolean[reader.getStatistics().length]; columns[5] = true; // long colulmn columns[9] = true; // text column rows = reader.rowsOptions(new Reader.Options() .range(offsetOfStripe2, offsetOfStripe4 - offsetOfStripe2) .include(columns));
.version(fileVersion) .rowIndexStride(rowIndexStride) .inspector(reader.getObjectInspector());
types.add(typeBuilder.build()); Mockito.when(reader.getTypes()).thenReturn(types); Mockito.when(reader.rowsOptions(Mockito.any(Reader.Options.class), Mockito.any(HiveConf.class))) .thenReturn(recordReader); Mockito.when(recordReader.next(row3)).thenReturn(row5); Mockito.when(reader.hasMetadataValue(OrcRecordUpdater.ACID_KEY_INDEX_NAME)) .thenReturn(true); Mockito.when(reader.getMetadataValue(OrcRecordUpdater.ACID_KEY_INDEX_NAME)) .thenReturn(ByteBuffer.wrap("10,20,30;40,50,60;40,50,61" .getBytes("UTF-8"))); Mockito.when(reader.getStripes()) .thenReturn(createStripes(2, 2, 1));
ColumnStatistics[] stats = reader.getStatistics(); assertEquals(2, stats[1].getNumberOfValues()); assertEquals(1, ((BooleanColumnStatistics) stats[1]).getFalseCount()); stats[3].toString()); StripeStatistics ss = reader.getStripeStatistics().get(0); assertEquals(2, ss.getColumnStatistics()[0].getNumberOfValues()); assertEquals(1, ((BooleanColumnStatistics) ss.getColumnStatistics()[1]).getTrueCount()); (StructObjectInspector) reader.getObjectInspector(); assertEquals(ObjectInspector.Category.STRUCT, readerInspector.getCategory()); StringObjectInspector mk = (StringObjectInspector) ma.getMapKeyObjectInspector(); RecordReader rows = reader.rows(); Object row = rows.next(null); assertNotNull(row);
/**
 * Builds a record reader over the given byte range of an ORC file.
 *
 * @param file already-opened ORC reader for the file
 * @param conf job configuration passed through to the underlying reader
 * @param offset byte offset of the split to read
 * @param length byte length of the split to read
 * @throws IOException if the underlying record reader cannot be created
 */
public SparkOrcNewRecordReader(Reader file, Configuration conf,
    long offset, long length) throws IOException {
  List<OrcProto.Type> types = file.getTypes();
  // Type 0 is the root struct; its subtype count is the number of top-level
  // columns. An empty type list means the file has no columns at all.
  numColumns = types.isEmpty() ? 0 : types.get(0).getSubtypesCount();
  value = new OrcStruct(numColumns);
  this.reader = OrcInputFormat.createReaderFromFile(file, conf, offset,
      length);
  this.objectInspector = file.getObjectInspector();
}
if ((stripeStatistics == null || stripeStatistics.isEmpty()) && reader.getNumberOfRows() > 0) { keyWrapper.setInputPath(path); keyWrapper.setIsIncompatFile(true); keyWrapper.setCompression(reader.getCompressionKind()); keyWrapper.setCompressBufferSize(reader.getCompressionSize()); keyWrapper.setFileVersion(reader.getFileVersion()); keyWrapper.setWriterVersion(reader.getWriterVersion()); keyWrapper.setRowIndexStride(reader.getRowIndexStride()); keyWrapper.setFileSchema(reader.getSchema()); } else { stripeIdx++;