/**
 * Computes the ORC projection mask of the fields to include from the selected fields.
 *
 * @return The ORC projection mask.
 */
private boolean[] computeProjectionMask() {
  // mask with all fields of the schema
  boolean[] projectionMask = new boolean[schema.getMaximumId() + 1];
  // for each selected field
  for (int inIdx : selectedFields) {
    // set all nested fields of a selected field to true
    TypeDescription fieldSchema = schema.getChildren().get(inIdx);
    for (int i = fieldSchema.getId(); i <= fieldSchema.getMaximumId(); i++) {
      projectionMask[i] = true;
    }
  }
  return projectionMask;
}
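/*
 * A minimal standalone sketch (not the Flink method above) showing how the mask lines
 * up with ORC's flattened type ids. The schema string, class name, and the printed
 * result are illustrative assumptions; the loop mirrors computeProjectionMask() for
 * one selected top-level field.
 */
import java.util.Arrays;
import org.apache.orc.TypeDescription;

public class ProjectionMaskSketch {
  public static void main(String[] args) {
    // Hypothetical schema: root struct = id 0, a = 1, b = 2, b.c = 3, b.d = 4.
    TypeDescription schema =
        TypeDescription.fromString("struct<a:int,b:struct<c:string,d:double>>");
    int[] selectedFields = {1}; // select the second top-level field ("b")

    // Same sizing and marking as computeProjectionMask() above.
    boolean[] projectionMask = new boolean[schema.getMaximumId() + 1];
    for (int inIdx : selectedFields) {
      TypeDescription fieldSchema = schema.getChildren().get(inIdx);
      for (int i = fieldSchema.getId(); i <= fieldSchema.getMaximumId(); i++) {
        projectionMask[i] = true;
      }
    }
    // Prints: [false, false, true, true, true]
    System.out.println(Arrays.toString(projectionMask));
  }
}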
private static void addColumnToIncludes(TypeDescription child, boolean[] result) {
  for (int col = child.getId(); col <= child.getMaximumId(); ++col) {
    result[col] = true;
  }
}
public static boolean[] genIncludedColumns(TypeDescription readerSchema,
                                           List<Integer> included) {
  boolean[] result = new boolean[readerSchema.getMaximumId() + 1];
  if (included == null) {
    Arrays.fill(result, true);
    return result;
  }
  result[0] = true;
  List<TypeDescription> children = readerSchema.getChildren();
  for (int columnNumber = 0; columnNumber < children.size(); ++columnNumber) {
    if (included.contains(columnNumber)) {
      TypeDescription child = children.get(columnNumber);
      for (int col = child.getId(); col <= child.getMaximumId(); ++col) {
        result[col] = true;
      }
    }
  }
  return result;
}
TypeDescription child = children.get(columnNumber);
int id = child.getId();
int maxId = child.getMaximumId();
if (id >= included.length || maxId >= included.length) {
  throw new AssertionError("Inconsistent includes: " + included.length
public static boolean[] genIncludedColumns(TypeDescription readerSchema,
                                           List<Integer> included,
                                           Integer recursiveStruct) {
  boolean[] result = new boolean[readerSchema.getMaximumId() + 1];
  if (included == null) {
    Arrays.fill(result, true);
    return result;
  }
  result[0] = true;
  List<TypeDescription> children = readerSchema.getChildren();
  for (int columnNumber = 0; columnNumber < children.size(); ++columnNumber) {
    if (included.contains(columnNumber)) {
      addColumnToIncludes(children.get(columnNumber), result);
    } else if (recursiveStruct != null && recursiveStruct == columnNumber) {
      // This assumes all struct cols immediately follow the struct.
      List<TypeDescription> nestedChildren = children.get(columnNumber).getChildren();
      for (int columnNumberDelta = 0; columnNumberDelta < nestedChildren.size(); ++columnNumberDelta) {
        int columnNumberNested = columnNumber + 1 + columnNumberDelta;
        if (included.contains(columnNumberNested)) {
          addColumnToIncludes(nestedChildren.get(columnNumberDelta), result);
        }
      }
    }
  }
  return result;
}
assertEquals(0, writer.getSchema().getMaximumId());
boolean[] expected = new boolean[] {false};
boolean[] included = OrcUtils.includeColumns("", writer.getSchema());
assertEquals(2, schema.getMaximumId());
boolean[] expected = new boolean[] {false, false, true};
boolean[] included = OrcUtils.includeColumns("string1", schema);
assertEquals(2, schema.getMaximumId());
boolean[] expected = new boolean[] {false, true, false};
boolean[] included = OrcUtils.includeColumns("int1", schema);
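/*
 * A hedged usage sketch tying the assertions above to OrcUtils.includeColumns: it
 * takes a comma separated list of top-level field names and returns the same kind of
 * flattened mask, with the root (id 0) left false. The schema here is an assumption
 * matching the two-column tests above (int1 at id 1, string1 at id 2).
 */
import java.util.Arrays;
import org.apache.orc.OrcUtils;
import org.apache.orc.TypeDescription;

public class IncludeColumnsSketch {
  public static void main(String[] args) {
    // Assumed schema mirroring the tests above: root = 0, int1 = 1, string1 = 2.
    TypeDescription schema =
        TypeDescription.fromString("struct<int1:int,string1:string>");
    // Selecting both fields marks every id except the root.
    boolean[] included = OrcUtils.includeColumns("int1,string1", schema);
    // Prints: [false, true, true]
    System.out.println(Arrays.toString(included));
  }
}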
assertEquals(23, schema.getMaximumId());
boolean[] expected = new boolean[] {false, false, false, false, false, false, false, false, false, false,
private void ensureRawDataReader(boolean isOpen) throws IOException {
  ensureOrcReader();
  if (rawDataReader != null) {
    if (!isRawDataReaderOpen && isOpen) {
      long startTime = counters.startTimeCounter();
      rawDataReader.open();
      counters.incrWallClockCounter(LlapIOCounters.HDFS_TIME_NS, startTime);
    }
    return;
  }
  long startTime = counters.startTimeCounter();
  boolean useZeroCopy = (daemonConf != null) && OrcConf.USE_ZEROCOPY.getBoolean(daemonConf);
  rawDataReader = RecordReaderUtils.createDefaultDataReader(
      DataReaderProperties.builder().withBufferSize(orcReader.getCompressionSize())
          .withCompression(orcReader.getCompressionKind())
          .withFileSystem(fs).withPath(path)
          .withTypeCount(orcReader.getSchema().getMaximumId() + 1)
          .withZeroCopy(useZeroCopy)
          .build());
  if (isOpen) {
    rawDataReader.open();
    isRawDataReaderOpen = true;
  }
  counters.incrWallClockCounter(LlapIOCounters.HDFS_TIME_NS, startTime);
}
sarg.getLeaves(), evolution);
sargColumns = new boolean[evolution.getFileSchema().getMaximumId() + 1];
for (int i : filterColumns) {
assertEquals(5, schema.getMaximumId());
boolean[] expected = new boolean[] {false, false, false, false, false, false};
boolean[] included = OrcUtils.includeColumns("", schema);
/**
 * Convert a string with a comma separated list of column ids into the
 * array of booleans that matches the schema.
 * @param schema the schema for the reader
 * @param columnsStr the comma separated list of column ids
 * @return a boolean array
 */
public static boolean[] parseInclude(TypeDescription schema, String columnsStr) {
  if (columnsStr == null ||
      schema.getCategory() != TypeDescription.Category.STRUCT) {
    return null;
  }
  boolean[] result = new boolean[schema.getMaximumId() + 1];
  result[0] = true;
  if (StringUtils.isBlank(columnsStr)) {
    return result;
  }
  List<TypeDescription> types = schema.getChildren();
  for (String idString : columnsStr.split(",")) {
    TypeDescription type = types.get(Integer.parseInt(idString));
    for (int c = type.getId(); c <= type.getMaximumId(); ++c) {
      result[c] = true;
    }
  }
  return result;
}
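/*
 * A hedged usage sketch of the id arithmetic in parseInclude(): the entries in
 * columnsStr index the top-level fields, while the returned mask is over flattened
 * type ids, so selecting one struct field marks its whole subtree. The helper below
 * re-implements the loop standalone (skipping the null/blank checks) rather than
 * calling the class above; the schema and field names are illustrative assumptions.
 */
import java.util.Arrays;
import java.util.List;
import org.apache.orc.TypeDescription;

public class ParseIncludeSketch {
  static boolean[] include(TypeDescription schema, String columnsStr) {
    boolean[] result = new boolean[schema.getMaximumId() + 1];
    result[0] = true; // the root struct is always included
    List<TypeDescription> types = schema.getChildren();
    for (String idString : columnsStr.split(",")) {
      TypeDescription type = types.get(Integer.parseInt(idString));
      for (int c = type.getId(); c <= type.getMaximumId(); ++c) {
        result[c] = true;
      }
    }
    return result;
  }

  public static void main(String[] args) {
    // Hypothetical schema: root = 0, name = 1, ptr = 2, ptr.x = 3, ptr.y = 4, z = 5.
    TypeDescription schema = TypeDescription.fromString(
        "struct<name:string,ptr:struct<x:int,y:int>,z:double>");
    // Selecting top-level field 1 ("ptr") marks ids 2 through 4.
    // Prints: [true, false, true, true, true, false]
    System.out.println(Arrays.toString(include(schema, "1")));
  }
}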
int numFlattenedCols = schema.getMaximumId();
boolean[] results = new boolean[numFlattenedCols + 1];
if ("*".equals(selectedColumns)) {
TypeDescription col = findColumn(column, fieldNames, fields);
if (col != null) {
  for (int i = col.getId(); i <= col.getMaximumId(); ++i) {
    results[i] = true;
private boolean[] populatePpdSafeConversion() {
  if (fileSchema == null || readerSchema == null || readerFileTypes == null) {
    return null;
  }
  boolean[] result = new boolean[readerSchema.getMaximumId() + 1];
  boolean safePpd = validatePPDConversion(fileSchema, readerSchema);
  result[readerSchema.getId()] = safePpd;
  return populatePpdSafeConversionForChildern(result, readerSchema.getChildren());
}
@Override
public Metrics metrics() {
  try {
    long rows = writer.getNumberOfRows();
    ColumnStatistics[] stats = writer.getStatistics();
    // we don't currently have columnSizes or distinct counts.
    Map<Integer, Long> valueCounts = new HashMap<>();
    Map<Integer, Long> nullCounts = new HashMap<>();
    Integer[] icebergIds = new Integer[orcSchema.getMaximumId() + 1];
    for (TypeDescription type : columnIds.keySet()) {
      icebergIds[type.getId()] = columnIds.get(type);
    }
    for (int c = 1; c < stats.length; ++c) {
      if (icebergIds[c] != null) {
        valueCounts.put(icebergIds[c], stats[c].getNumberOfValues());
      }
    }
    for (TypeDescription child : orcSchema.getChildren()) {
      int c = child.getId();
      if (icebergIds[c] != null) {
        nullCounts.put(icebergIds[c], rows - stats[c].getNumberOfValues());
      }
    }
    return new Metrics(rows, null, valueCounts, nullCounts);
  } catch (IOException e) {
    throw new RuntimeException("Can't get statistics " + path, e);
  }
}
public PhysicalFsWriter(FileSystem fs, Path path, OrcFile.WriterOptions opts) throws IOException {
  this.path = path;
  long defaultStripeSize = opts.getStripeSize();
  this.addBlockPadding = opts.getBlockPadding();
  if (opts.isEnforceBufferSize()) {
    this.bufferSize = opts.getBufferSize();
  } else {
    this.bufferSize = WriterImpl.getEstimatedBufferSize(defaultStripeSize,
        opts.getSchema().getMaximumId() + 1, opts.getBufferSize());
  }
  this.compress = opts.getCompress();
  this.maxPadding = (int) (opts.getPaddingTolerance() * defaultStripeSize);
  this.blockSize = opts.getBlockSize();
  LOG.info("ORC writer created for path: {} with stripeSize: {} blockSize: {}" +
      " compression: {} bufferSize: {}", path, defaultStripeSize, blockSize,
      compress, bufferSize);
  rawWriter = fs.create(path, opts.getOverwrite(), HDFS_BUFFER_SIZE,
      fs.getDefaultReplication(path), blockSize);
  blockOffset = 0;
  codec = OrcCodecPool.getCodec(compress);
  writer = new OutStream("metadata", bufferSize, codec, new DirectStream(rawWriter));
  protobufWriter = CodedOutputStream.newInstance(writer);
  writeVariableLengthBlocks = opts.getWriteVariableLengthBlocks();
  shims = opts.getHadoopShims();
}
buildIndex = rowIndexStride > 0;
codec = createCodec(compress);
int numColumns = schema.getMaximumId() + 1;
this.bufferSize = getEstimatedBufferSize(defaultStripeSize, numColumns, opts.getBufferSize());
if (version == OrcFile.Version.V_0_11) {
  this.bloomFilterColumns = new boolean[schema.getMaximumId() + 1];
} else {
  this.bloomFilterColumns =
private void ensureRawDataReader(boolean isOpen) throws IOException {
  ensureOrcReader();
  if (rawDataReader != null) {
    if (!isRawDataReaderOpen && isOpen) {
      long startTime = counters.startTimeCounter();
      rawDataReader.open();
      counters.incrTimeCounter(LlapIOCounters.HDFS_TIME_NS, startTime);
    }
    return;
  }
  long startTime = counters.startTimeCounter();
  boolean useZeroCopy = (daemonConf != null) && OrcConf.USE_ZEROCOPY.getBoolean(daemonConf);
  rawDataReader = RecordReaderUtils.createDefaultDataReader(
      DataReaderProperties.builder().withBufferSize(orcReader.getCompressionSize())
          .withCompression(orcReader.getCompressionKind())
          .withFileSystem(fs).withPath(path)
          .withTypeCount(orcReader.getSchema().getMaximumId() + 1)
          .withZeroCopy(useZeroCopy)
          .build());
  if (isOpen) {
    rawDataReader.open();
    isRawDataReaderOpen = true;
  }
  counters.incrTimeCounter(LlapIOCounters.HDFS_TIME_NS, startTime);
}