static GlobalMetaData mergeInto(
    FileMetaData toMerge,
    GlobalMetaData mergedMetadata,
    boolean strict) {
  MessageType schema = null;
  Map<String, Set<String>> newKeyValues = new HashMap<String, Set<String>>();
  Set<String> createdBy = new HashSet<String>();
  // start from the already-merged metadata, if any
  if (mergedMetadata != null) {
    schema = mergedMetadata.getSchema();
    newKeyValues.putAll(mergedMetadata.getKeyValueMetaData());
    createdBy.addAll(mergedMetadata.getCreatedBy());
  }
  // merge the schemas only when nothing has been merged yet or when they differ
  if ((schema == null && toMerge.getSchema() != null)
      || (schema != null && !schema.equals(toMerge.getSchema()))) {
    schema = mergeInto(toMerge.getSchema(), schema, strict);
  }
  // collect every distinct value seen for each key/value metadata key
  for (Entry<String, String> entry : toMerge.getKeyValueMetaData().entrySet()) {
    Set<String> values = newKeyValues.get(entry.getKey());
    if (values == null) {
      values = new HashSet<String>();
      newKeyValues.put(entry.getKey(), values);
    }
    values.add(entry.getValue());
  }
  createdBy.add(toMerge.getCreatedBy());
  return new GlobalMetaData(
      schema,
      newKeyValues,
      createdBy);
}
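// Hedged usage sketch: this mirrors how ParquetFileWriter.getGlobalMetaData(footers, strict)
// folds each footer's FileMetaData into one GlobalMetaData; "footers" is an assumed local
// List<Footer>, not defined in this snippet.
GlobalMetaData merged = null;
for (Footer footer : footers) {
  merged = mergeInto(footer.getParquetMetadata().getFileMetaData(), merged, true); // strict type checking
}
// "merged" now holds the union schema, all distinct key/value entries, and every createdBy string.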
/**
 * @param conf the configuration
 * @param file the file to read
 * @param readSupport to materialize records
 * @param filter the filter to use to filter records
 * @throws IOException
 */
public ParquetReader(Configuration conf, Path file, ReadSupport<T> readSupport, UnboundRecordFilter filter) throws IOException {
  this.readSupport = readSupport;
  this.filter = filter;
  this.conf = conf;

  FileSystem fs = file.getFileSystem(conf);
  List<FileStatus> statuses = Arrays.asList(fs.listStatus(file));
  List<Footer> footers = ParquetFileReader.readAllFootersInParallelUsingSummaryFiles(conf, statuses);
  this.footersIterator = footers.iterator();
  globalMetaData = ParquetFileWriter.getGlobalMetaData(footers);

  List<BlockMetaData> blocks = new ArrayList<BlockMetaData>();
  for (Footer footer : footers) {
    blocks.addAll(footer.getParquetMetadata().getBlocks());
  }

  MessageType schema = globalMetaData.getSchema();
  Map<String, Set<String>> extraMetadata = globalMetaData.getKeyValueMetaData();
  readContext = readSupport.init(new InitContext(conf, extraMetadata, schema));
}
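// Minimal usage sketch for this constructor, assuming the example Group/GroupReadSupport
// classes that ship with parquet-mr; the path is a placeholder, and the null filter means
// no record filtering is applied.
ParquetReader<Group> reader = new ParquetReader<Group>(
    new Configuration(), new Path("/tmp/data.parquet"), new GroupReadSupport(), null);
Group record;
while ((record = reader.read()) != null) {
  System.out.println(record); // each read() materializes one record via the ReadSupport
}
reader.close();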
/**
 * Create the Hadoop job according to the arguments from main.
 */
@Override
public synchronized Job createJob(String[] args) throws IOException {
  Job job = super.createJob(args);
  // For a Parquet-format job, we have to append the Parquet schema field. We can only set
  // parquet.pig.schema here because of the 'Job' dependency, while the other two required
  // list parameters are set in TrainModelProcessor.
  @SuppressWarnings("rawtypes")
  final GlobalMetaData globalMetaData = new ParquetInputFormat().getGlobalMetaData(job);
  Schema schema = getPigSchemaFromMultipleFiles(globalMetaData.getSchema(),
      globalMetaData.getKeyValueMetaData());
  String schemaStr = pigSchemaToString(schema);
  job.getConfiguration().set("parquet.pig.schema", schemaStr);
  return job;
}
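// Hedged sketch of what a caller sees after createJob: the merged Pig schema travels in the
// job configuration under "parquet.pig.schema" (the "processor" instance here is hypothetical).
Job job = processor.createJob(args);
String pigSchema = job.getConfiguration().get("parquet.pig.schema");
// pigSchema is the string form of the schema merged across all input Parquet files.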
/**
 * @param configuration the configuration to connect to the file system
 * @param footers the footers of the files to read
 * @return the splits for the footers
 * @throws IOException
 * @deprecated split planning using file footers will be removed
 */
@Deprecated
public List<ParquetInputSplit> getSplits(Configuration configuration, List<Footer> footers) throws IOException {
  boolean strictTypeChecking = configuration.getBoolean(STRICT_TYPE_CHECKING, true);
  final long maxSplitSize = configuration.getLong("mapred.max.split.size", Long.MAX_VALUE);
  final long minSplitSize = Math.max(getFormatMinSplitSize(), configuration.getLong("mapred.min.split.size", 0L));
  if (maxSplitSize < 0 || minSplitSize < 0) {
    throw new ParquetDecodingException("maxSplitSize or minSplitSize should not be negative: maxSplitSize = "
        + maxSplitSize + "; minSplitSize = " + minSplitSize);
  }
  GlobalMetaData globalMetaData = ParquetFileWriter.getGlobalMetaData(footers, strictTypeChecking);
  ReadContext readContext = getReadSupport(configuration).init(new InitContext(
      configuration,
      globalMetaData.getKeyValueMetaData(),
      globalMetaData.getSchema()));
  return new ClientSideMetadataSplitStrategy().getSplits(
      configuration, footers, maxSplitSize, minSplitSize, readContext);
}
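// Hedged example of driving this deprecated split planning; the footers would typically come
// from ParquetFileReader.readAllFootersInParallel(conf, statuses), and "inputFormat" is an
// assumed instance of this input format.
Configuration conf = new Configuration();
conf.setLong("mapred.max.split.size", 256L * 1024 * 1024); // cap each split at 256 MB
conf.setLong("mapred.min.split.size", 64L * 1024 * 1024);  // coalesce row groups below 64 MB
List<ParquetInputSplit> splits = inputFormat.getSplits(conf, footers);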
private void initSchema(Job job) throws IOException {
  if (schema != null) {
    return;
  }
  if (schema == null && requestedSchema != null) {
    // this is only true in the front-end
    schema = requestedSchema;
  }
  if (schema == null) {
    // no requested schema => use the schema from the file
    final GlobalMetaData globalMetaData = getParquetInputFormat().getGlobalMetaData(job);
    schema = getPigSchemaFromMultipleFiles(globalMetaData.getSchema(), globalMetaData.getKeyValueMetaData());
  }
  if (isElephantBirdCompatible(job)) {
    convertToElephantBirdCompatibleSchema(schema);
  }
}
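// Sketch of the precedence initSchema implements, seen from the caller's side (the
// constructor shapes follow parquet-pig's ParquetLoader, but treat them as an assumption):
ParquetLoader withSchema = new ParquetLoader("a:int, b:chararray"); // requestedSchema wins (front-end)
ParquetLoader fromFooters = new ParquetLoader(); // schema merged from the input files' footers instead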