public static TypeDescription[] genIncludedTypes(TypeDescription fileSchema, List<Integer> included, Integer recursiveStruct) { TypeDescription[] result = new TypeDescription[included.size()]; List<TypeDescription> children = fileSchema.getChildren(); for (int columnNumber = 0; columnNumber < children.size(); ++columnNumber) { int indexInBatchCols = included.indexOf(columnNumber); if (indexInBatchCols >= 0) { result[indexInBatchCols] = children.get(columnNumber); } else if (recursiveStruct != null && recursiveStruct == columnNumber) { // This assumes all struct cols immediately follow struct List<TypeDescription> nestedChildren = children.get(columnNumber).getChildren(); for (int columnNumberDelta = 0; columnNumberDelta < nestedChildren.size(); ++columnNumberDelta) { int columnNumberNested = columnNumber + 1 + columnNumberDelta; int nestedIxInBatchCols = included.indexOf(columnNumberNested); if (nestedIxInBatchCols >= 0) { result[nestedIxInBatchCols] = nestedChildren.get(columnNumberDelta); } } } } return result; }
/**
 * Builds a converter for an ORC LIST column by creating a converter
 * for the list's single element type.
 *
 * @param schema the LIST type description; its first child is the element type.
 */
public ListColumnConverter(TypeDescription schema) {
    TypeDescription elementType = schema.getChildren().get(0);
    childrenConverter = createConverter(elementType);
}
/** * Fills an ORC batch into an array of Row. * * @param rows The batch of rows need to be filled. * @param schema The schema of the ORC data. * @param batch The ORC data. * @param selectedFields The list of selected ORC fields. * @return The number of rows that were filled. */ static int fillRows(Row[] rows, TypeDescription schema, VectorizedRowBatch batch, int[] selectedFields) { int rowsToRead = Math.min((int) batch.count(), rows.length); List<TypeDescription> fieldTypes = schema.getChildren(); // read each selected field for (int fieldIdx = 0; fieldIdx < selectedFields.length; fieldIdx++) { int orcIdx = selectedFields[fieldIdx]; readField(rows, fieldIdx, fieldTypes.get(orcIdx), batch.cols[orcIdx], rowsToRead); } return rowsToRead; }
public StructColumnConverter(TypeDescription schema) { List<TypeDescription> kids = schema.getChildren(); childrenConverters = new JsonConverter[kids.size()]; for (int c = 0; c < childrenConverters.length; ++c) { childrenConverters[c] = createConverter(kids.get(c)); } fieldNames = schema.getFieldNames(); }
/** * Creates an OrcRowInputFormat. * * @param path The path to read ORC files from. * @param orcSchema The schema of the ORC files as ORC TypeDescription. * @param orcConfig The configuration to read the ORC files with. * @param batchSize The number of Row objects to read in a batch. */ public OrcRowInputFormat(String path, TypeDescription orcSchema, Configuration orcConfig, int batchSize) { super(new Path(path)); // configure OrcRowInputFormat this.schema = orcSchema; this.rowType = (RowTypeInfo) OrcBatchReader.schemaToTypeInfo(schema); this.conf = orcConfig; this.batchSize = batchSize; // set default selection mask, i.e., all fields. this.selectedFields = new int[this.schema.getChildren().size()]; for (int i = 0; i < selectedFields.length; i++) { this.selectedFields[i] = i; } }
public static boolean[] genIncludedColumns(TypeDescription readerSchema, List<Integer> included, Integer recursiveStruct) { boolean[] result = new boolean[readerSchema.getMaximumId() + 1]; if (included == null) { Arrays.fill(result, true); return result; } result[0] = true; List<TypeDescription> children = readerSchema.getChildren(); for (int columnNumber = 0; columnNumber < children.size(); ++columnNumber) { if (included.contains(columnNumber)) { addColumnToIncludes(children.get(columnNumber), result); } else if (recursiveStruct != null && recursiveStruct == columnNumber) { // This assumes all struct cols immediately follow struct List<TypeDescription> nestedChildren = children.get(columnNumber).getChildren(); for (int columnNumberDelta = 0; columnNumberDelta < nestedChildren.size(); ++columnNumberDelta) { int columnNumberNested = columnNumber + 1 + columnNumberDelta; if (included.contains(columnNumberNested)) { addColumnToIncludes(nestedChildren.get(columnNumberDelta), result); } } } } return result; }
private static void readNonNullStructColumn(Object[] vals, int fieldIdx, StructColumnVector structVector, TypeDescription schema, int childCount) { List<TypeDescription> childrenTypes = schema.getChildren(); int numFields = childrenTypes.size(); // create a batch of Rows to read the structs Row[] structs = new Row[childCount]; // TODO: possible improvement: reuse existing Row objects for (int i = 0; i < childCount; i++) { structs[i] = new Row(numFields); } // read struct fields // we don't have to handle isRepeating because ORC assumes that it is propagated into the children. for (int i = 0; i < numFields; i++) { readField(structs, i, childrenTypes.get(i), structVector.fields[i], childCount); } if (fieldIdx == -1) { // set struct as an object System.arraycopy(structs, 0, vals, 0, childCount); } else { // set struct as a field of Row Row[] rows = (Row[]) vals; for (int i = 0; i < childCount; i++) { rows[i].setField(fieldIdx, structs[i]); } } }
return Row.class; case LIST: Class<?> childClass = getClassForType(schema.getChildren().get(0)); return Array.newInstance(childClass, 0).getClass(); case MAP:
/** * Computes the ORC projection mask of the fields to include from the selected fields.rowOrcInputFormat.nextRecord(null). * * @return The ORC projection mask. */ private boolean[] computeProjectionMask() { // mask with all fields of the schema boolean[] projectionMask = new boolean[schema.getMaximumId() + 1]; // for each selected field for (int inIdx : selectedFields) { // set all nested fields of a selected field to true TypeDescription fieldSchema = schema.getChildren().get(inIdx); for (int i = fieldSchema.getId(); i <= fieldSchema.getMaximumId(); i++) { projectionMask[i] = true; } } return projectionMask; }
return OrcBatchReader::copyBinary; case STRUCT: List<TypeDescription> fieldTypes = schema.getChildren(); Function<Object, Object>[] copyFields = new Function[fieldTypes.size()]; for (int i = 0; i < fieldTypes.size(); i++) { TypeDescription entryType = schema.getChildren().get(0); Function<Object, Object> copyEntry = getCopyFunction(entryType); Class entryClass = getClassForType(entryType); return new CopyList(copyEntry, entryClass); case MAP: TypeDescription keyType = schema.getChildren().get(0); TypeDescription valueType = schema.getChildren().get(1); Function<Object, Object> copyKey = getCopyFunction(keyType); Function<Object, Object> copyValue = getCopyFunction(valueType);
/**
 * Writes one LIST cell as a JSON array, emitting each element via
 * {@code setValue} with the list's element type.
 *
 * @param writer destination JSON writer.
 * @param vector the ORC list column.
 * @param schema the LIST type description; its first child is the element type.
 * @param row    the row to write.
 * @throws JSONException if the writer fails.
 */
private static void setList(JSONWriter writer, ListColumnVector vector, TypeDescription schema, int row) throws JSONException {
    writer.array();
    TypeDescription elementType = schema.getChildren().get(0);
    // Elements of this row live at [offsets[row], offsets[row] + lengths[row]) in the child vector.
    int start = (int) vector.offsets[row];
    for (int i = 0; i < vector.lengths[row]; ++i) {
        setValue(writer, vector.child, elementType, start + i);
    }
    writer.endArray();
}
/**
 * Builds the per-column-id include mask for an ORC read from a list of
 * selected top-level columns.
 *
 * @param readerSchema the root (struct) schema being read.
 * @param included     top-level column ids to include, or {@code null} to include everything.
 * @return a boolean per column id ({@code true} = read this column), sized to the
 *         schema's maximum id + 1.
 */
public static boolean[] genIncludedColumns(TypeDescription readerSchema, List<Integer> included) {
    boolean[] include = new boolean[readerSchema.getMaximumId() + 1];
    if (included == null) {
        // No selection supplied: read every column.
        Arrays.fill(include, true);
        return include;
    }
    // The root struct itself is always read.
    include[0] = true;
    List<TypeDescription> topLevel = readerSchema.getChildren();
    for (int col = 0; col < topLevel.size(); ++col) {
        if (!included.contains(col)) {
            continue;
        }
        // A field's subtree occupies the contiguous id range [getId(), getMaximumId()].
        TypeDescription selected = topLevel.get(col);
        for (int id = selected.getId(); id <= selected.getMaximumId(); ++id) {
            include[id] = true;
        }
    }
    return include;
}
return PrimitiveArrayTypeInfo.BYTE_PRIMITIVE_ARRAY_TYPE_INFO; case STRUCT: List<TypeDescription> fieldSchemas = schema.getChildren(); TypeInformation[] fieldTypes = new TypeInformation[fieldSchemas.size()]; for (int i = 0; i < fieldSchemas.size(); i++) { return new RowTypeInfo(fieldTypes, fieldNames); case LIST: TypeDescription elementSchema = schema.getChildren().get(0); TypeInformation<?> elementType = schemaToTypeInfo(elementSchema); TypeDescription keySchema = schema.getChildren().get(0); TypeDescription valSchema = schema.getChildren().get(1); TypeInformation<?> keyType = schemaToTypeInfo(keySchema); TypeInformation<?> valType = schemaToTypeInfo(valSchema);
/**
 * Writes one STRUCT cell as a JSON object: one key per field name, each value
 * emitted via {@code setValue} with the field's type.
 *
 * @param writer destination JSON writer.
 * @param batch  the ORC struct column.
 * @param schema the STRUCT type description.
 * @param row    the row to write.
 * @throws JSONException if the writer fails.
 */
private static void setStruct(JSONWriter writer, StructColumnVector batch, TypeDescription schema, int row) throws JSONException {
    writer.object();
    List<String> names = schema.getFieldNames();
    List<TypeDescription> types = schema.getChildren();
    for (int field = 0; field < types.size(); ++field) {
        writer.key(names.get(field));
        setValue(writer, batch.fields[field], types.get(field), row);
    }
    writer.endObject();
}
}
private static void readNonNullListColumn(Object[] vals, int fieldIdx, ListColumnVector list, TypeDescription schema, int childCount) { TypeDescription fieldType = schema.getChildren().get(0);
/**
 * Reads one UNION cell, reusing {@code previous} as the result holder when it
 * is already an {@code OrcUnion}.
 *
 * @param vector   the union column vector.
 * @param row      the row to read; collapsed to 0 when the vector repeats.
 * @param schema   the UNION type description.
 * @param previous previously returned object, reused when possible.
 * @return the populated union, or {@code null} when the cell is null.
 */
static OrcUnion nextUnion(ColumnVector vector, int row, TypeDescription schema, Object previous) {
    if (vector.isRepeating) {
        row = 0;
    }
    if (!vector.noNulls && vector.isNull[row]) {
        return null;
    }
    // Reuse the previous holder only when it is exactly an OrcUnion.
    OrcUnion result = (previous != null && previous.getClass() == OrcUnion.class)
            ? (OrcUnion) previous
            : new OrcUnion();
    UnionColumnVector unionVector = (UnionColumnVector) vector;
    // The tag selects which union branch holds this row's value.
    byte tag = (byte) unionVector.tags[row];
    List<TypeDescription> childTypes = schema.getChildren();
    result.set(tag, nextValue(unionVector.fields[tag], row, childTypes.get(tag), result.getObject()));
    return result;
}
/**
 * Creates an ORC writer for the topic's schema, with one JSON converter per
 * top-level column and a reusable row batch.
 *
 * @param logFilePath the log file location (also determines the topic/schema).
 * @param codec       compression codec to translate into an ORC compression kind.
 * @throws IOException if the ORC writer cannot be created.
 */
public JsonORCFileWriter(LogFilePath logFilePath, CompressionCodec codec) throws IOException {
    Configuration conf = new Configuration();
    Path path = new Path(logFilePath.getLogFilePath());
    schema = schemaProvider.getSchema(logFilePath.getTopic(), logFilePath);
    List<TypeDescription> columnTypes = schema.getChildren();
    converters = new JsonConverter[columnTypes.size()];
    for (int i = 0; i < converters.length; i++) {
        converters[i] = VectorColumnFiller.createConverter(columnTypes.get(i));
    }
    writer = OrcFile.createWriter(path,
            OrcFile.writerOptions(conf)
                    .compress(resolveCompression(codec))
                    .setSchema(schema));
    batch = schema.createRowBatch();
}
/**
 * Writes one batch row as JSON. A STRUCT root becomes a JSON object keyed by
 * field name; any other root type is written directly from the first column.
 *
 * @param writer destination JSON writer.
 * @param batch  the row batch being printed.
 * @param schema the root schema of the batch.
 * @param row    the row to write.
 * @throws JSONException if the writer fails.
 */
public static void processRow(JSONWriter writer, VectorizedRowBatch batch, TypeDescription schema, int row) throws JSONException {
    if (schema.getCategory() != TypeDescription.Category.STRUCT) {
        // Non-struct root: the single column holds the value directly.
        setValue(writer, batch.cols[0], schema, row);
        return;
    }
    List<TypeDescription> types = schema.getChildren();
    List<String> names = schema.getFieldNames();
    writer.object();
    for (int col = 0; col < batch.cols.length; ++col) {
        writer.key(names.get(col));
        setValue(writer, batch.cols[col], types.get(col), row);
    }
    writer.endObject();
}
private static void readNonNullMapColumn(Object[] vals, int fieldIdx, MapColumnVector mapsVector, TypeDescription schema, int childCount) { List<TypeDescription> fieldType = schema.getChildren(); TypeDescription keyType = fieldType.get(0); TypeDescription valueType = fieldType.get(1);
/**
 * Returns the next row, reusing {@code previous} as the result holder when
 * possible. For a STRUCT root an {@code OrcStruct} is (re)populated field by
 * field; otherwise the single column's value is returned directly.
 *
 * @param previous previously returned object, reused when it matches.
 * @return the next row's value, or {@code null} when no rows remain.
 * @throws IOException if advancing to the next batch fails.
 */
@Override
public Object next(Object previous) throws IOException {
    if (!ensureBatch()) {
        // No more batches to read.
        return null;
    }
    if (schema.getCategory() != TypeDescription.Category.STRUCT) {
        previous = nextValue(batch.cols[0], rowInBatch, schema, previous);
    } else {
        List<TypeDescription> children = schema.getChildren();
        int fieldCount = children.size();
        OrcStruct struct;
        // Reuse the previous holder only when it is exactly an OrcStruct.
        if (previous != null && previous.getClass() == OrcStruct.class) {
            struct = (OrcStruct) previous;
            if (struct.getNumFields() != fieldCount) {
                struct.setNumFields(fieldCount);
            }
        } else {
            struct = new OrcStruct(fieldCount);
            previous = struct;
        }
        for (int field = 0; field < fieldCount; ++field) {
            struct.setFieldValue(field,
                    nextValue(batch.cols[field], rowInBatch, children.get(field), struct.getFieldValue(field)));
        }
    }
    rowInBatch += 1;
    return previous;
}