/**
 * Validates that every candidate file can be opened as an ORC file.
 *
 * @param fs    filesystem the files live on
 * @param conf  Hive configuration used to build reader options
 * @param files candidate input files; must be non-empty to validate
 * @return true iff the list is non-empty and every file parses as ORC
 * @throws IOException declared for interface compatibility; per-file read
 *         failures are treated as "not ORC" rather than propagated
 */
@Override
public boolean validateInput(FileSystem fs, HiveConf conf,
    List<FileStatus> files) throws IOException {
  // An empty list cannot be positively validated as ORC input.
  if (files.isEmpty()) {
    return false;
  }
  for (FileStatus file : files) {
    try {
      // Opening a reader forces the ORC postscript/footer to be parsed;
      // if that fails, the file is not valid ORC.
      OrcFile.createReader(file.getPath(),
          OrcFile.readerOptions(conf).filesystem(fs));
    } catch (IOException ignored) {
      // Deliberate best-effort: any unreadable file marks the whole
      // input invalid instead of surfacing the error to the caller.
      return false;
    }
  }
  return true;
}
int bufferSize, int rowIndexStride) throws IOException { return createWriter(path, writerOptions(conf) .inspector(inspector) .fileSystem(fs)
@Test
public void emptyFile() throws Exception {
  // Reflection-based inspector creation is guarded by the class lock,
  // matching the locking convention used by the other tests in this file.
  ObjectInspector rowInspector;
  synchronized (TestOrcFile.class) {
    rowInspector = ObjectInspectorFactory.getReflectionObjectInspector(
        BigRow.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
  }

  // Open a writer and close it immediately so the file contains zero rows.
  Writer emptyWriter = OrcFile.createWriter(testFilePath,
      OrcFile.writerOptions(conf)
          .inspector(rowInspector)
          .stripeSize(1000)
          .compress(CompressionKind.NONE)
          .bufferSize(100));
  emptyWriter.close();

  Reader reader = OrcFile.createReader(testFilePath,
      OrcFile.readerOptions(conf).filesystem(fs));
  // The empty file should report: no rows, no stripes, no metadata keys,
  // no compression, and the expected minimal content length.
  assertEquals(false, reader.rows().hasNext());
  assertEquals(CompressionKind.NONE, reader.getCompression());
  assertEquals(0, reader.getNumberOfRows());
  assertEquals(0, reader.getCompressionSize());
  assertEquals(false, reader.getMetadataKeys().iterator().hasNext());
  assertEquals(3, reader.getContentLength());
  assertEquals(false, reader.getStripes().iterator().hasNext());
}
protected Writer createOrcWriter(CacheWriter cacheWriter, Configuration conf, Path path, StructObjectInspector oi) throws IOException { // TODO: this is currently broken. We need to set memory manager to a bogus implementation // to avoid problems with memory manager actually tracking the usage. return OrcFile.createWriter(path, createOrcWriterOptions(oi, conf, cacheWriter, allocSize)); } }
/**
 * Returns the serialized ORC file footer for {@code path} so callers can cache it.
 *
 * @param fs        filesystem to read the file from
 * @param path      path of the ORC file
 * @param addedVals extra values slot; intentionally left untouched (see comment below)
 * @return the file footer as a ByteBuffer
 * @throws IOException if the file cannot be opened or its footer read
 */
public ByteBuffer getMetadataToCache(
    FileSystem fs, Path path, ByteBuffer[] addedVals) throws IOException {
  // For now, there's nothing special to return in addedVals. Just return the footer.
  return OrcFile.createReader(fs, path).getSerializedFileFooter();
}
}
private OrcFile.WriterOptions getOptions(JobConf conf, Properties props) { OrcFile.WriterOptions result = OrcFile.writerOptions(props, conf); if (props != null) { final String columnNameProperty =
/**
 * Ensures orcReader is initialized for the split.
 * Lazily creates the encoded ORC reader exactly once; subsequent calls are no-ops.
 * Also resolves the read path (possibly to a file-id path) and records HDFS wall
 * clock time spent opening the reader.
 */
private void ensureOrcReader() throws IOException {
  if (orcReader != null) return;  // already initialized; method is idempotent
  path = split.getPath();
  // When the file key is a Long and file-id paths are enabled, rewrite the path
  // to an HDFS file-id based path so the read is stable across renames/moves.
  if (fileKey instanceof Long && HiveConf.getBoolVar(
      daemonConf, ConfVars.LLAP_IO_USE_FILEID_PATH)) {
    path = HdfsUtils.getFileIdPath(fs, path, (long)fileKey);
  }
  LlapIoImpl.ORC_LOGGER.trace("Creating reader for {} ({})", path, split.getPath());
  long startTime = counters.startTimeCounter();
  ReaderOptions opts =
      OrcFile.readerOptions(jobConf).filesystem(fs).fileMetadata(fileMetadata);
  // If the split already carries the ORC tail, reuse it to avoid re-reading
  // the file footer from storage.
  if (split instanceof OrcSplit) {
    OrcTail orcTail = ((OrcSplit) split).getOrcTail();
    if (orcTail != null) {
      LlapIoImpl.ORC_LOGGER.debug("Setting OrcTail. path={}", path);
      opts.orcTail(orcTail);
    }
  }
  orcReader = EncodedOrcFile.createReader(path, opts);
  // Attribute the time spent opening the reader to HDFS I/O.
  counters.incrWallClockCounter(LlapIOCounters.HDFS_TIME_NS, startTime);
}
@Test
public void testBitPack64Large() throws Exception {
  ObjectInspector longInspector;
  synchronized (TestOrcFile.class) {
    longInspector = ObjectInspectorFactory.getReflectionObjectInspector(
        Long.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA);
  }

  // Generate a large, reproducible set of random longs (fixed seed) to
  // exercise the 64-bit packing path over many values.
  final int size = 1080832;
  long[] raw = new long[size];
  Random rng = new Random(1234);
  for (int i = 0; i < size; i++) {
    raw[i] = rng.nextLong();
  }
  List<Long> input = Lists.newArrayList(Longs.asList(raw));

  Writer writer = OrcFile.createWriter(testFilePath,
      OrcFile.writerOptions(conf)
          .inspector(longInspector)
          .compress(CompressionKind.ZLIB));
  for (Long value : input) {
    writer.addRow(value);
  }
  writer.close();

  // Read everything back and verify an exact, in-order round trip.
  Reader reader = OrcFile.createReader(testFilePath,
      OrcFile.readerOptions(conf).filesystem(fs));
  RecordReader rows = reader.rows();
  int idx = 0;
  while (rows.hasNext()) {
    Object row = rows.next(null);
    Assert.assertEquals(input.get(idx++).longValue(), ((LongWritable) row).get());
  }
}
/**
 * Lazily creates the ORC writer on first use, stamping the acid version
 * both inside the data file and as a side file next to it.
 */
private void initWriter() throws IOException {
  if (writer != null) {
    return;  // already initialized
  }
  writer = OrcFile.createWriter(path, writerOptions);
  AcidUtils.OrcAcidVersion.setAcidVersionInDataFile(writer);
  AcidUtils.OrcAcidVersion.writeVersionFile(path.getParent(), fs);
}
/**
 * Returns the serialized ORC file footer for {@code path}, suitable for caching.
 *
 * @param fs        filesystem to read the file from
 * @param path      path of the ORC file
 * @param addedVals extra values slot; intentionally left untouched (see comment below)
 * @return the file footer as a ByteBuffer
 * @throws IOException if the file cannot be opened or its footer read
 */
public ByteBuffer getMetadataToCache(
    FileSystem fs, Path path, ByteBuffer[] addedVals) throws IOException {
  // For now, there's nothing special to return in addedVals. Just return the footer.
  return OrcFile.createReader(fs, path).getSerializedFileFooter();
}
}
private OrcFile.WriterOptions getOptions(JobConf conf, Properties props) { OrcFile.WriterOptions result = OrcFile.writerOptions(props, conf); if (props != null) { final String columnNameProperty =
@Test public void testOrcSerDeStatsMap() throws Exception { ObjectInspector inspector; synchronized (TestOrcSerDeStats.class) { inspector = ObjectInspectorFactory.getReflectionObjectInspector (MapStruct.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA); } Writer writer = OrcFile.createWriter(testFilePath, OrcFile.writerOptions(conf) .inspector(inspector) .stripeSize(10000) .bufferSize(10000)); for (int row = 0; row < 1000; row++) { Map<String, Double> test = new HashMap<String, Double>(); for (int i = 0; i < 10; i++) { test.put("hi" + i, 2.0); } writer.addRow(new MapStruct(test)); } writer.close(); // stats from writer assertEquals(1000, writer.getNumberOfRows()); assertEquals(950000, writer.getRawDataSize()); Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs)); // stats from reader assertEquals(1000, reader.getNumberOfRows()); assertEquals(950000, reader.getRawDataSize()); assertEquals(950000, reader.getRawDataSizeOfColumns(Lists.newArrayList("map1"))); }
/**
 * Validates that every candidate file is a readable ORC file.
 *
 * @param fs    filesystem holding the files
 * @param conf  Hive configuration for reader options
 * @param files candidate input files; an empty list is rejected
 * @return true iff the list is non-empty and each file parses as ORC
 * @throws IOException declared for interface compatibility; individual
 *         read failures are converted to a {@code false} result instead
 */
@Override
public boolean validateInput(FileSystem fs, HiveConf conf,
    List<FileStatus> files) throws IOException {
  // Idiomatic emptiness check; an empty list cannot be validated as ORC.
  if (files.isEmpty()) {
    return false;
  }
  for (FileStatus file : files) {
    try {
      // Creating a reader parses the ORC footer; failure means "not ORC".
      OrcFile.createReader(file.getPath(),
          OrcFile.readerOptions(conf).filesystem(fs));
    } catch (IOException ignored) {
      // Deliberate best-effort: one bad file invalidates the whole input.
      return false;
    }
  }
  return true;
}
}
int bufferSize, int rowIndexStride) throws IOException { return createWriter(path, writerOptions(conf) .inspector(inspector) .fileSystem(fs)
@Override public void close(TaskAttemptContext context) throws IOException, InterruptedException { if (writer == null) { // a row with no columns ObjectInspector inspector = ObjectInspectorFactory. getStandardStructObjectInspector(new ArrayList<String>(), new ArrayList<ObjectInspector>()); options.inspector(inspector); writer = OrcFile.createWriter(path, options); } writer.close(); } }
/**
 * Opens the given file and reports whether its acid key index is valid.
 * Non-acid (original format) files are reported and skipped.
 */
static void checkFile(Configuration conf, Path inputPath) throws IOException {
  FileSystem fileSystem = inputPath.getFileSystem(conf);
  Reader orcReader = OrcFile.createReader(fileSystem, inputPath);
  // Original-format files carry no acid key index, so there is nothing to check.
  if (OrcInputFormat.isOriginal(orcReader)) {
    System.out.println(inputPath + " is not an acid file");
    return;
  }
  boolean keyIndexValid = isAcidKeyIndexValid(orcReader);
  System.out.println("Checking " + inputPath + " - acid key index is "
      + (keyIndexValid ? "valid" : "invalid"));
}
/**
 * Creates an ORC record writer targeting the task's default work file;
 * writer options come from the task's Hadoop configuration (via shims).
 */
@Override
public RecordWriter getRecordWriter(TaskAttemptContext context)
    throws IOException, InterruptedException {
  Path workFile = getDefaultWorkFile(context, "");
  OrcFile.WriterOptions writerOptions = OrcFile.writerOptions(
      ShimLoader.getHadoopShims().getConfiguration(context));
  return new OrcRecordWriter(workFile, writerOptions);
}
}
@Test public void testOrcSerDeStatsList() throws Exception { ObjectInspector inspector; synchronized (TestOrcSerDeStats.class) { inspector = ObjectInspectorFactory.getReflectionObjectInspector (ListStruct.class, ObjectInspectorFactory.ObjectInspectorOptions.JAVA); } Writer writer = OrcFile.createWriter(testFilePath, OrcFile.writerOptions(conf) .inspector(inspector) .stripeSize(10000) .bufferSize(10000)); for (int row = 0; row < 5000; row++) { List<String> test = new ArrayList<String>(); for (int i = 0; i < 1000; i++) { test.add("hi"); } writer.addRow(new ListStruct(test)); } writer.close(); assertEquals(5000, writer.getNumberOfRows()); assertEquals(430000000, writer.getRawDataSize()); Reader reader = OrcFile.createReader(testFilePath, OrcFile.readerOptions(conf).filesystem(fs)); // stats from reader assertEquals(5000, reader.getNumberOfRows()); assertEquals(430000000, reader.getRawDataSize()); assertEquals(430000000, reader.getRawDataSizeOfColumns(Lists.newArrayList("list1"))); }
/**
 * Determines whether the given data file is in "raw" (non-acid) format.
 *
 * @param dataFile file to inspect
 * @param fs       filesystem holding the file
 * @return true if the file is original/raw format (or not an ORC file at all);
 *         false if it carries the acid row structure
 * @throws IOException on read errors other than a format mismatch
 */
public static boolean isRawFormatFile(Path dataFile, FileSystem fs)
    throws IOException {
  try {
    Reader reader = OrcFile.createReader(dataFile, OrcFile.readerOptions(fs.getConf()));
    /*
      acid file would have schema like <op, owid, writerId, rowid, cwid, <f1, ... fn>>
      so could check it this way once/if OrcRecordUpdater.ACID_KEY_INDEX_NAME is removed
      TypeDescription schema = reader.getSchema();
      List<String> columns = schema.getFieldNames();
     */
    return OrcInputFormat.isOriginal(reader);
  } catch (FileFormatException ex) {
    // We may be parsing a delta for Insert-only table which may not even be an ORC file so
    // cannot have ROW_IDs in it.
    // NOTE: a non-ORC file is therefore deliberately classified as raw format.
    LOG.debug("isRawFormat() called on " + dataFile
        + " which is not an ORC file: " + ex.getMessage());
    return true;
  }
}
}
OrcFile.WriterOptions opts = OrcFile.writerOptions(conf).inspector(inspector).compress(CompressionKind.ZLIB); Writer writer = OrcFile.createWriter(new Path(testFilePath, "-0"), opts); writer.close(); assertEquals(opts.getMemoryManager().getClass(), MemoryManagerImpl.class); LlapDaemonInfo.initialize("test", new Configuration()); LlapProxy.setDaemon(true); opts = OrcFile.writerOptions(conf).inspector(inspector).compress(CompressionKind.ZLIB); writer = OrcFile.createWriter(new Path(testFilePath, "-1"), opts); writer.close(); assertEquals(opts.getMemoryManager().getClass(), OrcFile.LlapAwareMemoryManager.class); opts = OrcFile.writerOptions(conf).inspector(inspector).compress(CompressionKind.ZLIB); writer = OrcFile.createWriter(new Path(testFilePath, "-2"), opts); writer.close(); assertEquals(opts.getMemoryManager().getClass(), MemoryManagerImpl.class);