/**
 * Constructs an exception indicating the given column could not be resolved.
 *
 * @param descriptor descriptor of the column that was not found; retained so callers
 *                   can inspect which column triggered the failure
 */
public UnknownColumnException(ColumnDescriptor descriptor) {
  // Plain concatenation is null-safe ("null" in the message); the explicit
  // descriptor.toString() would have thrown NPE on a null descriptor instead
  // of producing a readable error.
  super("Column not found: " + descriptor);
  this.descriptor = descriptor;
}
/**
 * Constructs an exception indicating the given column could not be resolved.
 *
 * @param descriptor descriptor of the column that was not found; retained so callers
 *                   can inspect which column triggered the failure
 */
public UnknownColumnException(ColumnDescriptor descriptor) { super("Column not found: " + descriptor.toString()); this.descriptor = descriptor; }
/**
 * Records timing and byte-count statistics for one page operation and emits a
 * CSV-style trace line.
 *
 * @param pageHeader header of the page being processed; its {@code type} field selects
 *                   dictionary-page vs. data-page counters
 * @param op         short label for the operation being traced (e.g. a read or decompress step)
 * @param start      stream position at which the operation started (trace output only)
 * @param time       elapsed time of the operation (units set by the caller — TODO confirm nanos)
 * @param bytesin    bytes consumed by the operation
 * @param bytesout   bytes produced by the operation
 */
private void updateStats(PageHeader pageHeader, String op, long start, long time, long bytesin, long bytesout) {
  String pageType = "Data Page";
  if (pageHeader.type == PageType.DICTIONARY_PAGE) {
    pageType = "Dictionary Page";
  }
  // pageType is already a String; the redundant toString() call was dropped to
  // match the sibling updateStats overload in this file.
  logger.trace("ParquetTrace,{},{},{},{},{},{},{},{}", op, pageType,
      this.parentColumnReader.parentReader.hadoopPath,
      this.parentColumnReader.columnDescriptor.toString(), start, bytesin, bytesout, time);
  if (pageHeader.type != PageType.DICTIONARY_PAGE) {
    // bytesin == bytesout is used as the signal that no decompression took
    // place, i.e. this was a plain page load.
    if (bytesin == bytesout) {
      this.stats.timePageLoads += time;
      this.stats.numPageLoads++;
      this.stats.totalPageReadBytes += bytesin;
    } else {
      this.stats.timePagesDecompressed += time;
      this.stats.numPagesDecompressed++;
      // NOTE(review): accumulates the compressed size (bytesin), mirroring the
      // atomic-counter overload — confirm that is the intended metric.
      this.stats.totalDecompressedBytes += bytesin;
    }
  } else {
    if (bytesin == bytesout) {
      this.stats.timeDictPageLoads += time;
      this.stats.numDictPageLoads++;
      this.stats.totalDictPageReadBytes += bytesin;
    } else {
      this.stats.timeDictPagesDecompressed += time;
      this.stats.numDictPagesDecompressed++;
      this.stats.totalDictDecompressedBytes += bytesin;
    }
  }
}
/**
 * Records timing and byte-count statistics for one page operation (thread-safe
 * atomic-counter variant) and emits a CSV-style trace line.
 *
 * @param pageHeader header of the page being processed; its {@code type} field selects
 *                   dictionary-page vs. data-page counters
 * @param op         short label for the operation being traced
 * @param start      stream position at which the operation started (trace output only)
 * @param time       elapsed time of the operation
 * @param bytesin    bytes consumed by the operation
 * @param bytesout   bytes produced by the operation
 */
protected void updateStats(PageHeader pageHeader, String op, long start, long time, long bytesin, long bytesout) {
  final boolean dictionaryPage = pageHeader.type == PageType.DICTIONARY_PAGE;
  final String pageType = dictionaryPage ? "Dictionary Page" : "Data Page";
  logger.trace("ParquetTrace,{},{},{},{},{},{},{},{}", op, pageType,
      this.parentColumnReader.parentReader.hadoopPath,
      this.parentColumnReader.columnDescriptor.toString(), start, bytesin, bytesout, time);
  // Equal in/out byte counts mean the page was read as-is; differing counts
  // mean a decompression step ran.
  final boolean decompressed = bytesin != bytesout;
  if (dictionaryPage) {
    if (decompressed) {
      this.stats.timeDictPagesDecompressed.addAndGet(time);
      this.stats.numDictPagesDecompressed.incrementAndGet();
      this.stats.totalDictDecompressedBytes.addAndGet(bytesin);
    } else {
      this.stats.timeDictPageLoads.addAndGet(time);
      this.stats.numDictPageLoads.incrementAndGet();
      this.stats.totalDictPageReadBytes.addAndGet(bytesin);
    }
  } else {
    if (decompressed) {
      this.stats.timeDataPagesDecompressed.addAndGet(time);
      this.stats.numDataPagesDecompressed.incrementAndGet();
      this.stats.totalDataDecompressedBytes.addAndGet(bytesin);
    } else {
      this.stats.timeDataPageLoads.addAndGet(time);
      this.stats.numDataPageLoads.incrementAndGet();
      this.stats.totalDataPageReadBytes.addAndGet(bytesin);
    }
  }
}
/**
 * Debug helper: prints every entry of a column's local (page-level) dictionary
 * to stdout, one {@code id: value} line per entry.
 *
 * @param columnDescriptor column whose primitive type selects the decode method
 * @param localDictionary  the decoded dictionary page for that column
 */
public static void printDictionary(ColumnDescriptor columnDescriptor, Dictionary localDictionary) {
  System.out.println("Dictionary for column " + columnDescriptor.toString());
  // Parquet's Dictionary.getMaxId() returns the largest valid id (inclusive);
  // the previous '<' comparison silently skipped the last dictionary entry.
  for (int i = 0; i <= localDictionary.getMaxId(); ++i) {
    switch (columnDescriptor.getType()) {
      case INT32:
        System.out.println(format("%d: %d", i, localDictionary.decodeToInt(i)));
        break;
      case INT64:
        System.out.println(format("%d: %d", i, localDictionary.decodeToLong(i)));
        break;
      case INT96:
      case BINARY:
      case FIXED_LEN_BYTE_ARRAY:
        // NOTE(review): new String(byte[]) uses the platform default charset;
        // for portable output this should specify UTF-8 explicitly.
        System.out.println(format("%d: %s", i, new String(localDictionary.decodeToBinary(i).getBytesUnsafe())));
        break;
      case FLOAT:
        System.out.println(format("%d: %f", i, localDictionary.decodeToFloat(i)));
        break;
      case DOUBLE:
        System.out.println(format("%d: %f", i, localDictionary.decodeToDouble(i)));
        break;
      case BOOLEAN:
        System.out.println(format("%d: %b", i, localDictionary.decodeToBoolean(i)));
        break;
      default:
        break;
    }
  }
}
}
/**
 * Builds a global dictionary for a parquet table for BINARY or FIXED_LEN_BYTE_ARRAY column types.
 * It will remove existing dictionaries if present and create new ones.
 *
 * @param fs filesystem
 * @param tableDir root directory for given table that has parquet files
 * @param bufferAllocator memory allocator
 * @return GlobalDictionariesInfo that has dictionary version, root path and columns along with path to dictionary files.
 * @throws IOException if listing the table directory, reading local dictionaries, or
 *         writing the merged dictionary files fails
 */
public static GlobalDictionariesInfo createGlobalDictionaries(FileSystem fs, Path tableDir, BufferAllocator bufferAllocator) throws IOException {
  // Collect the per-file (local) dictionaries from every parquet file under tableDir.
  final FileStatus[] statuses = fs.listStatus(tableDir, PARQUET_FILES_FILTER);
  final Map<ColumnDescriptor, Path> globalDictionaries = Maps.newHashMap();
  final Map<ColumnDescriptor, List<Dictionary>> allDictionaries = readLocalDictionaries(fs, statuses, bufferAllocator);
  // Bump the version so readers keep using the previous dictionaries until the
  // new ones are published below.
  final long dictionaryVersion = getDictionaryVersion(fs, tableDir) + 1;
  // Write into a temp directory first, then promote atomically at the end.
  final Path tmpDictionaryRootDir = createTempRootDir(fs, tableDir, dictionaryVersion);
  logger.debug("Building global dictionaries for columns {} with version {}", allDictionaries.keySet(), dictionaryVersion);
  // Sort all local dictionaries and write it to file with an index if needed
  for (Map.Entry<ColumnDescriptor, List<Dictionary>> entry : allDictionaries.entrySet()) {
    final ColumnDescriptor columnDescriptor = entry.getKey();
    final Path dictionaryFile = dictionaryFilePath(tmpDictionaryRootDir, columnDescriptor);
    logger.debug("Creating a new global dictionary for {} with version {}", columnDescriptor.toString(), dictionaryVersion);
    // null existing-dictionary argument: building from scratch, no prior
    // global dictionary is merged in.
    createDictionaryFile(fs, dictionaryFile, columnDescriptor, entry.getValue(), null, bufferAllocator);
    globalDictionaries.put(columnDescriptor, dictionaryFile);
  }
  // Promote the temp directory to its final versioned location.
  final Path finalDictionaryRootDir = createDictionaryVersionedRootPath(fs, tableDir, dictionaryVersion, tmpDictionaryRootDir);
  return new GlobalDictionariesInfo(globalDictionaries, finalDictionaryRootDir, dictionaryVersion);
}
/**
 * Command-line entry point: builds global dictionaries for the parquet table rooted
 * at {@code args[0]} and prints each resulting dictionary batch to stdout.
 *
 * @param args single argument — path to the parquet table directory
 */
public static void main(String[] args) {
  // Fail with a usage message instead of an ArrayIndexOutOfBoundsException
  // when the table path is missing.
  if (args.length != 1) {
    System.err.println("Usage: GlobalDictionaryBuilder <parquet table directory>");
    return;
  }
  try (final BufferAllocator bufferAllocator = new RootAllocator(VM.getMaxDirectMemory())) {
    final Path tableDir = new Path(args[0]);
    final FileSystem fs = tableDir.getFileSystem(new Configuration());
    if (fs.exists(tableDir) && fs.isDirectory(tableDir)) {
      final Map<ColumnDescriptor, Path> dictionaryEncodedColumns =
          createGlobalDictionaries(fs, tableDir, bufferAllocator).getColumnsToDictionaryFiles();
      final long version = getDictionaryVersion(fs, tableDir);
      final Path dictionaryRootDir = getDictionaryVersionedRootPath(fs, tableDir, version);
      for (ColumnDescriptor columnDescriptor : dictionaryEncodedColumns.keySet()) {
        final VectorContainer data = readDictionary(fs, dictionaryRootDir, columnDescriptor, bufferAllocator);
        // Fixed unbalanced bracket in the message: the opening "[" was never closed.
        System.out.println("Dictionary for column [" + columnDescriptor.toString() + "] size " + data.getRecordCount());
        BatchPrinter.printBatch(data);
        data.clear();
      }
    }
  } catch (IOException ioe) {
    logger.error("Failed ", ioe);
  }
}
/**
 * Reads values from successive pages until the requested record count is reached
 * or the column has no more pages, then publishes the count to the value vector.
 *
 * @param recordsToReadInThisPass target number of records for this pass; non-positive
 *                                values skip reading entirely
 * @throws IOException if reading a page fails
 */
public void processPages(long recordsToReadInThisPass) throws IOException {
  reset();
  if (recordsToReadInThisPass > 0) {
    // Always attempt at least one read, then stop as soon as either the target
    // is met or no further page is available (checked in that order, matching
    // the original short-circuit evaluation).
    while (true) {
      determineSize(recordsToReadInThisPass);
      if (valuesReadInCurrentPass >= recordsToReadInThisPass) {
        break;
      }
      if (!pageReader.hasPage()) {
        break;
      }
    }
  }
  logger.trace("Column Reader: {} - Values read in this pass: {} - ",
      this.getColumnDescriptor().toString(), valuesReadInCurrentPass);
  valueVec.getMutator().setValueCount(valuesReadInCurrentPass);
}
// NOTE(review): this span is an interior fragment of a larger method (its
// definition is outside this view); it appears to contain the bodies of two
// alternating branches — create-from-scratch vs. merge-with-existing. Confirm
// against the enclosing method before relying on these notes.
// Branch 1: no prior global dictionary — build a fresh one (null existing-dictionary arg).
logger.debug("Creating a new global dictionary for {} with version {}", columnDescriptor.toString(), nextDictionaryVersion);
createDictionaryFile(fs, newDictionaryFile, columnDescriptor, entry.getValue(), null, bufferAllocator);
globalDictionaries.put(columnDescriptor, newDictionaryFile);
// Branch 2: a prior dictionary exists — merge its contents (vectorContainer) with
// the new local dictionaries.
logger.debug("Updating global dictionary for {} with version {}", columnDescriptor.toString(), nextDictionaryVersion);
createDictionaryFile(fs, newDictionaryFile, columnDescriptor, entry.getValue(), vectorContainer, bufferAllocator);
globalDictionaries.put(columnDescriptor, newDictionaryFile);
/**
 * Get the page header and the pageData (uncompressed) for the next page.
 *
 * Reads page headers from the current stream position, consuming (and loading) any
 * dictionary pages encountered, until a non-dictionary page header is found; then
 * reads and decompresses that page's data into {@code pageData}.
 *
 * @throws IOException if reading the header or page bytes fails
 */
protected void nextInternal() throws IOException{
  Stopwatch timer = Stopwatch.createUnstarted();
  // next, we need to decompress the bytes
  // TODO - figure out if we need multiple dictionary pages, I believe it may be limited to one
  // I think we are clobbering parts of the dictionary if there can be multiple pages of dictionary
  do {
    long start=dataReader.getPos();
    timer.start();
    pageHeader = Util.readPageHeader(dataReader);
    long timeToRead = timer.elapsed(TimeUnit.NANOSECONDS);
    // Header size is derived from the stream-position delta around the read.
    long pageHeaderBytes=dataReader.getPos()-start;
    // bytesin == bytesout here deliberately records this as a plain load (no decompression).
    this.updateStats(pageHeader, "Page Header", start, timeToRead, pageHeaderBytes, pageHeaderBytes);
    logger.trace("ParquetTrace,{},{},{},{},{},{},{},{}","Page Header Read","",
      this.parentColumnReader.parentReader.hadoopPath,
      this.parentColumnReader.columnDescriptor.toString(), start, 0, 0, timeToRead);
    timer.reset();
    if (pageHeader.getType() == PageType.DICTIONARY_PAGE) {
      // Consume the dictionary page in place; presumably readDictionaryPage advances
      // the reader past the page data so the loop makes progress — TODO confirm.
      readDictionaryPage(pageHeader, parentColumnReader);
    }
  } while (pageHeader.getType() == PageType.DICTIONARY_PAGE);
  // First non-dictionary header reached: read (and decompress if needed) its data.
  int compressedSize = pageHeader.getCompressed_page_size();
  int uncompressedSize = pageHeader.getUncompressed_page_size();
  pageData = readPage(pageHeader, compressedSize, uncompressedSize);
}
// NOTE(review): interior fragment of a page-reading loop (the enclosing method's
// definition is outside this view); it mirrors part of nextInternal above.
// Emit the trace line for the header read, then reset the stopwatch for the next timing.
logger.trace("ParquetTrace,{},{},{},{},{},{},{},{}","Page Header Read","",
  this.parentColumnReader.parentReader.hadoopPath,
  this.parentColumnReader.columnDescriptor.toString(), start, 0, 0, timeToRead);
timer.reset();
// Dictionary pages are handled separately from data pages.
if (pageHeader.getType() == PageType.DICTIONARY_PAGE) {