private static Map<ColumnDescriptor, DictionaryDescriptor> getDictionaries(
        BlockMetaData blockMetadata,
        ParquetDataSource dataSource,
        Map<List<String>, RichColumnDescriptor> descriptorsByPath,
        TupleDomain<ColumnDescriptor> parquetTupleDomain)
{
    ImmutableMap.Builder<ColumnDescriptor, DictionaryDescriptor> dictionaries = ImmutableMap.builder();
    for (ColumnChunkMetaData columnMetaData : blockMetadata.getColumns()) {
        RichColumnDescriptor descriptor = descriptorsByPath.get(Arrays.asList(columnMetaData.getPath().toArray()));
        // Only read the dictionary when every data page in the chunk is dictionary-encoded
        // and the column actually participates in the predicate.
        if (descriptor != null && isOnlyDictionaryEncodingPages(columnMetaData.getEncodings()) && isColumnPredicate(descriptor, parquetTupleDomain)) {
            int totalSize = toIntExact(columnMetaData.getTotalSize());
            byte[] buffer = new byte[totalSize];
            dataSource.readFully(columnMetaData.getStartingPos(), buffer);
            Optional<DictionaryPage> dictionaryPage = readDictionaryPage(buffer, columnMetaData.getCodec());
            dictionaries.put(descriptor, new DictionaryDescriptor(descriptor, dictionaryPage));
            break; // note: only the first eligible column's dictionary is collected
        }
    }
    return dictionaries.build();
}
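isOnlyDictionaryEncodingPages is referenced above but not shown. A minimal sketch of such a check, using parquet.column.Encoding and assuming Parquet v1 semantics where RLE and BIT_PACKED encode only repetition/definition levels (the real implementation may differ):

private static boolean isOnlyDictionaryEncodingPages(Set<Encoding> encodings)
{
    // A chunk is fully dictionary-encoded when PLAIN_DICTIONARY is present and no
    // other value encoding appears; any extra encoding means some pages fell back
    // to a non-dictionary encoding (e.g. the dictionary grew too large).
    if (!encodings.contains(Encoding.PLAIN_DICTIONARY)) {
        return false;
    }
    for (Encoding encoding : encodings) {
        if (encoding != Encoding.PLAIN_DICTIONARY && encoding != Encoding.RLE && encoding != Encoding.BIT_PACKED) {
            return false;
        }
    }
    return true;
}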
public PageReader readAllPages()
        throws IOException
{
    List<DataPage> pages = new ArrayList<>();
    DictionaryPage dictionaryPage = null;
    long valueCount = 0;
    // Keep reading pages until the value count recorded in the chunk metadata is reached.
    while (valueCount < descriptor.getColumnChunkMetaData().getValueCount()) {
        PageHeader pageHeader = readPageHeader();
        int uncompressedPageSize = pageHeader.getUncompressed_page_size();
        int compressedPageSize = pageHeader.getCompressed_page_size();
        switch (pageHeader.type) {
            case DICTIONARY_PAGE:
                if (dictionaryPage != null) {
                    throw new ParquetCorruptionException("%s has more than one dictionary page in column chunk", descriptor.getColumnDescriptor());
                }
                dictionaryPage = readDictionaryPage(pageHeader, uncompressedPageSize, compressedPageSize);
                break;
            case DATA_PAGE:
                valueCount += readDataPageV1(pageHeader, uncompressedPageSize, compressedPageSize, pages);
                break;
            case DATA_PAGE_V2:
                valueCount += readDataPageV2(pageHeader, uncompressedPageSize, compressedPageSize, pages);
                break;
            default:
                // Unknown page type: skip its compressed payload and continue.
                skip(compressedPageSize);
                break;
        }
    }
    return new PageReader(descriptor.getColumnChunkMetaData().getCodec(), pages, dictionaryPage);
}
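readPageHeader() is not shown above. A minimal sketch, assuming the chunk's bytes are exposed as an InputStream field named input (a hypothetical name), can delegate to the Thrift utility in parquet-format:

private PageHeader readPageHeader()
        throws IOException
{
    // Util.readPageHeader parses the Thrift-encoded page header from the stream,
    // leaving it positioned at the start of the page payload.
    return Util.readPageHeader(input);
}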
private static void showDetails(PrettyPrintWriter out, ColumnChunkMetaData meta, boolean name)
{
    long doff = meta.getDictionaryPageOffset();
    long foff = meta.getFirstDataPageOffset();
    long tsize = meta.getTotalSize();
    long usize = meta.getTotalUncompressedSize();
    long count = meta.getValueCount();
    double ratio = usize / (double) tsize; // uncompressed-to-compressed size ratio
    String encodings = Joiner.on(',').skipNulls().join(meta.getEncodings());

    if (name) {
        String path = Joiner.on('.').skipNulls().join(meta.getPath());
        out.format("%s: ", path);
    }

    // DO = dictionary page offset, FPO = first data page offset,
    // SZ = compressed size/uncompressed size/ratio, VC = value count, ENC = encodings
    out.format(" %s", meta.getType());
    out.format(" %s", meta.getCodec());
    out.format(" DO:%d", doff);
    out.format(" FPO:%d", foff);
    out.format(" SZ:%d/%d/%.2f", tsize, usize, ratio);
    out.format(" VC:%d", count);
    if (!encodings.isEmpty()) {
        out.format(" ENC:%s", encodings);
    }
    out.println();
}
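For illustration, a line emitted by showDetails for one column might look like this (hypothetical values):

id: INT64 SNAPPY DO:4 FPO:3462 SZ:12345/67890/5.50 VC:10000 ENC:PLAIN_DICTIONARY,RLE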
// Fragment: read the whole column chunk into memory and extract its dictionary page.
byte[] buffer = new byte[totalSize];
dataSource.readFully(columnChunkMetaData.getStartingPos(), buffer);
DictionaryPage dictionaryPage = readDictionaryPage(buffer, codecFactory, columnChunkMetaData.getCodec());
dictionaries.put(ordinal, new ParquetDictionaryDescriptor(columnDescriptor, dictionaryPage));
private void addRowGroup(ParquetMetadata parquetMetadata, List<RowGroup> rowGroups, BlockMetaData block)
{
    //rowGroup.total_byte_size = ;
    List<ColumnChunkMetaData> columns = block.getColumns();
    List<ColumnChunk> parquetColumns = new ArrayList<>();
    for (ColumnChunkMetaData columnMetaData : columns) {
        ColumnChunk columnChunk = new ColumnChunk(columnMetaData.getFirstDataPageOffset()); // verify this is the right offset
        columnChunk.file_path = block.getPath(); // they are in the same file for now
        columnChunk.meta_data = new parquet.format.ColumnMetaData(
                getType(columnMetaData.getType()),
                toFormatEncodings(columnMetaData.getEncodings()),
                Arrays.asList(columnMetaData.getPath().toArray()),
                columnMetaData.getCodec().getParquetCompressionCodec(),
                columnMetaData.getValueCount(),
                columnMetaData.getTotalUncompressedSize(),
                columnMetaData.getTotalSize(),
                columnMetaData.getFirstDataPageOffset());
        columnChunk.meta_data.dictionary_page_offset = columnMetaData.getDictionaryPageOffset();
        if (!columnMetaData.getStatistics().isEmpty()) {
            columnChunk.meta_data.setStatistics(toParquetStatistics(columnMetaData.getStatistics()));
        }
        // columnChunk.meta_data.index_page_offset = ;
        // columnChunk.meta_data.key_value_metadata = ; // nothing yet
        parquetColumns.add(columnChunk);
    }
    RowGroup rowGroup = new RowGroup(parquetColumns, block.getTotalByteSize(), block.getRowCount());
    rowGroups.add(rowGroup);
}
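toFormatEncodings() is used above but not shown. Since parquet.column.Encoding and parquet.format.Encoding share constant names, a hypothetical sketch of the conversion can map by name:

private static List<parquet.format.Encoding> toFormatEncodings(Set<parquet.column.Encoding> encodings)
{
    List<parquet.format.Encoding> converted = new ArrayList<>(encodings.size());
    for (parquet.column.Encoding encoding : encodings) {
        // Map each in-memory encoding constant to its Thrift counterpart by name.
        converted.add(parquet.format.Encoding.valueOf(encoding.name()));
    }
    return converted;
}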
// Fragment: build a page reader whose decompressor matches the chunk's codec.
BytesDecompressor decompressor = codecFactory.getDecompressor(descriptor.getColumnChunkMetaData().getCodec());
return new ParquetColumnChunkPageReader(decompressor, pages, dictionaryPage);
+ " pages ending at file offset " + (descriptor.fileOffset + pos())); BytesDecompressor decompressor = codecFactory.getDecompressor(descriptor.metadata.getCodec()); return new ColumnChunkPageReader(decompressor, pagesInChunk, dictionaryPage);