public static MessageType convert(final List<String> columnNames, final List<TypeInfo> columnTypes) {
    return new MessageType("hive_schema", convertTypes(columnNames, columnTypes));
}
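A minimal usage sketch for the converter above, assuming Hive's TypeInfoFactory constants and a converter class named HiveSchemaConverter (the column names are illustrative, not from the source):

import java.util.Arrays;
import java.util.List;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfo;
import org.apache.hadoop.hive.serde2.typeinfo.TypeInfoFactory;
import org.apache.parquet.schema.MessageType;

public class HiveSchemaConverterExample {
    public static void main(String[] args) {
        // Two hypothetical Hive columns: id BIGINT, name STRING.
        List<String> columnNames = Arrays.asList("id", "name");
        List<TypeInfo> columnTypes = Arrays.asList(TypeInfoFactory.longTypeInfo, TypeInfoFactory.stringTypeInfo);
        MessageType schema = HiveSchemaConverter.convert(columnNames, columnTypes);
        // MessageType.toString() renders the schema in Parquet's textual format.
        System.out.println(schema);
    }
}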
// Fragment: the stream pipeline that produces 'fields' is elided in the source; the collected
// projection is wrapped in a new MessageType that reuses the file schema's name.
        .collect(toList());
MessageType requestedSchema = new MessageType(fileSchema.getName(), fields);
private Type buildSchema() {
    JsonArray inputSchema = this.jsonSchema.getDataTypeValues();
    List<Type> parquetTypes = new ArrayList<>();
    for (JsonElement element : inputSchema) {
        JsonObject map = (JsonObject) element;
        JsonSchema elementSchema = new JsonSchema(map);
        String columnName = elementSchema.getColumnName();
        JsonElementConverter converter = JsonElementConversionFactory.getConverter(elementSchema, false);
        Type schemaType = converter.schema();
        this.converters.put(columnName, converter);
        parquetTypes.add(schemaType);
    }
    String docName = this.jsonSchema.getColumnName();
    switch (recordType) {
        case ROOT:
            return new MessageType(docName, parquetTypes);
        case CHILD:
            return new GroupType(this.jsonSchema.optionalOrRequired(), docName, parquetTypes);
        default:
            throw new RuntimeException("Unsupported Record type");
    }
}
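For reference, the ROOT branch above wraps the per-column Parquet types in a named MessageType; a hand-built sketch of roughly the shape it produces for a hypothetical two-field JSON record (field names and primitive types are illustrative, and the exact mapping depends on the JsonElementConverter in use):

import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BINARY;
import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT64;
import static org.apache.parquet.schema.Type.Repetition.OPTIONAL;

import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.PrimitiveType;

// Roughly what the ROOT case could yield for a record like {"id": 1, "name": "x"}.
MessageType rootSchema = new MessageType("jsonRecord",
        new PrimitiveType(OPTIONAL, INT64, "id"),
        new PrimitiveType(OPTIONAL, BINARY, "name"));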
@Test
public void testParquetTupleDomainPrimitive() {
    HiveColumnHandle columnHandle = new HiveColumnHandle("my_primitive", HiveType.valueOf("bigint"), parseTypeSignature(StandardTypes.BIGINT), 0, REGULAR, Optional.empty());
    Domain singleValueDomain = Domain.singleValue(BIGINT, 123L);
    TupleDomain<HiveColumnHandle> domain = withColumnDomains(ImmutableMap.of(columnHandle, singleValueDomain));

    MessageType fileSchema = new MessageType("hive_schema",
            new PrimitiveType(OPTIONAL, INT64, "my_primitive"));
    Map<List<String>, RichColumnDescriptor> descriptorsByPath = getDescriptors(fileSchema, fileSchema);
    TupleDomain<ColumnDescriptor> tupleDomain = getParquetTupleDomain(descriptorsByPath, domain);

    assertEquals(tupleDomain.getDomains().get().size(), 1);
    ColumnDescriptor descriptor = tupleDomain.getDomains().get().keySet().iterator().next();
    assertEquals(descriptor.getPath().length, 1);
    assertEquals(descriptor.getPath()[0], "my_primitive");

    Domain predicateDomain = Iterables.getOnlyElement(tupleDomain.getDomains().get().values());
    assertEquals(predicateDomain, singleValueDomain);
}
@Test
public void testParquetTupleDomainStruct() {
    HiveColumnHandle columnHandle = new HiveColumnHandle("my_struct", HiveType.valueOf("struct<a:int,b:int>"), parseTypeSignature(StandardTypes.ROW), 0, REGULAR, Optional.empty());
    RowType.Field rowField = new RowType.Field(Optional.of("my_struct"), INTEGER);
    RowType rowType = RowType.from(ImmutableList.of(rowField));
    TupleDomain<HiveColumnHandle> domain = withColumnDomains(ImmutableMap.of(columnHandle, Domain.notNull(rowType)));

    MessageType fileSchema = new MessageType("hive_schema",
            new GroupType(OPTIONAL, "my_struct",
                    new PrimitiveType(OPTIONAL, INT32, "a"),
                    new PrimitiveType(OPTIONAL, INT32, "b")));
    Map<List<String>, RichColumnDescriptor> descriptorsByPath = getDescriptors(fileSchema, fileSchema);
    TupleDomain<ColumnDescriptor> tupleDomain = getParquetTupleDomain(descriptorsByPath, domain);

    assertTrue(tupleDomain.getDomains().get().isEmpty());
}
@Test
public void testParquetTupleDomainPrimitiveArray() {
    HiveColumnHandle columnHandle = new HiveColumnHandle("my_array", HiveType.valueOf("array<int>"), parseTypeSignature(StandardTypes.ARRAY), 0, REGULAR, Optional.empty());
    TupleDomain<HiveColumnHandle> domain = withColumnDomains(ImmutableMap.of(columnHandle, Domain.notNull(new ArrayType(INTEGER))));

    MessageType fileSchema = new MessageType("hive_schema",
            new GroupType(OPTIONAL, "my_array",
                    new GroupType(REPEATED, "bag",
                            new PrimitiveType(OPTIONAL, INT32, "array_element"))));
    Map<List<String>, RichColumnDescriptor> descriptorsByPath = getDescriptors(fileSchema, fileSchema);
    TupleDomain<ColumnDescriptor> tupleDomain = getParquetTupleDomain(descriptorsByPath, domain);

    assertTrue(tupleDomain.getDomains().get().isEmpty());
}
@Test
public void testParquetTupleDomainStructArray() {
    HiveColumnHandle columnHandle = new HiveColumnHandle("my_array_struct", HiveType.valueOf("array<struct<a:int>>"), parseTypeSignature(StandardTypes.ARRAY), 0, REGULAR, Optional.empty());
    RowType.Field rowField = new RowType.Field(Optional.of("a"), INTEGER);
    RowType rowType = RowType.from(ImmutableList.of(rowField));
    TupleDomain<HiveColumnHandle> domain = withColumnDomains(ImmutableMap.of(columnHandle, Domain.notNull(new ArrayType(rowType))));

    MessageType fileSchema = new MessageType("hive_schema",
            new GroupType(OPTIONAL, "my_array_struct",
                    new GroupType(REPEATED, "bag",
                            new GroupType(OPTIONAL, "array_element",
                                    new PrimitiveType(OPTIONAL, INT32, "a")))));
    Map<List<String>, RichColumnDescriptor> descriptorsByPath = getDescriptors(fileSchema, fileSchema);
    TupleDomain<ColumnDescriptor> tupleDomain = getParquetTupleDomain(descriptorsByPath, domain);

    assertTrue(tupleDomain.getDomains().get().isEmpty());
}
@Test
public void testParquetTupleDomainMap() {
    HiveColumnHandle columnHandle = new HiveColumnHandle("my_map", HiveType.valueOf("map<int,int>"), parseTypeSignature(StandardTypes.MAP), 0, REGULAR, Optional.empty());
    MapType mapType = new MapType(
            INTEGER,
            INTEGER,
            methodHandle(TestParquetPredicateUtils.class, "throwUnsupportedOperationException"),
            methodHandle(TestParquetPredicateUtils.class, "throwUnsupportedOperationException"),
            methodHandle(TestParquetPredicateUtils.class, "throwUnsupportedOperationException"),
            methodHandle(TestParquetPredicateUtils.class, "throwUnsupportedOperationException"));
    TupleDomain<HiveColumnHandle> domain = withColumnDomains(ImmutableMap.of(columnHandle, Domain.notNull(mapType)));

    MessageType fileSchema = new MessageType("hive_schema",
            new GroupType(OPTIONAL, "my_map",
                    new GroupType(REPEATED, "map",
                            new PrimitiveType(REQUIRED, INT32, "key"),
                            new PrimitiveType(OPTIONAL, INT32, "value"))));
    Map<List<String>, RichColumnDescriptor> descriptorsByPath = getDescriptors(fileSchema, fileSchema);
    TupleDomain<ColumnDescriptor> tupleDomain = getParquetTupleDomain(descriptorsByPath, domain);

    assertTrue(tupleDomain.getDomains().get().isEmpty());
}
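The file schemas used in these tests can equally be expressed in Parquet's textual schema syntax; a sketch of the map schema from the last test, parsed with MessageTypeParser instead of built through the constructors:

import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;

// Same structure as the constructor-built "hive_schema" above.
MessageType fileSchema = MessageTypeParser.parseMessageType(
        "message hive_schema {\n"
        + "  optional group my_map {\n"
        + "    repeated group map {\n"
        + "      required int32 key;\n"
        + "      optional int32 value;\n"
        + "    }\n"
        + "  }\n"
        + "}");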
public MessageType getConvertedMessageType() {
    // the root should be a GroupType
    if (currentType == null) {
        return new MessageType(currentName, new ArrayList<Type>());
    }
    GroupType rootType = currentType.asGroupType();
    return new MessageType(currentName, rootType.getFields());
}
/**
 * @param pigSchema the pig schema
 * @return the resulting Parquet schema
 */
public MessageType convert(Schema pigSchema) {
    return new MessageType("pig_schema", convertTypes(pigSchema));
}
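A hedged usage sketch for the Pig converter (assuming the parquet-pig PigSchemaConverter class and Pig's Utils.getSchemaFromString helper; the schema string is illustrative):

import org.apache.parquet.pig.PigSchemaConverter;
import org.apache.parquet.schema.MessageType;
import org.apache.pig.impl.util.Utils;

static MessageType pigToParquet() throws Exception {
    // Parse a Pig schema from its string form, then convert it to a "pig_schema" MessageType.
    return new PigSchemaConverter().convert(Utils.getSchemaFromString("id:long, name:chararray"));
}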
public static MessageType convert(final List<String> columnNames, final List<TypeInfo> columnTypes) {
    final MessageType schema = new MessageType("hive_schema", convertTypes(columnNames, columnTypes));
    return schema;
}
private MessageType computeSchema(DataModelDescriptor descriptor) {
    List<Type> fields = new ArrayList<>();
    for (PropertyDescriptor property : descriptor.getPropertyDescriptors()) {
        Type field = computeParquetType(property);
        fields.add(field);
    }
    return new MessageType(descriptor.getDataModelClass().getName(), fields);
}
public MessageType union(MessageType toMerge, boolean strict) {
    return new MessageType(this.getName(), mergeFields(toMerge, strict));
}
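A sketch of how this union behaves (the schemas are illustrative): fields present in both inputs must agree, fields that exist only in the merged-in schema are appended, and the result keeps the receiver's name.

import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.BINARY;
import static org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName.INT32;
import static org.apache.parquet.schema.Type.Repetition.OPTIONAL;
import static org.apache.parquet.schema.Type.Repetition.REQUIRED;

import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.PrimitiveType;

MessageType base = new MessageType("m",
        new PrimitiveType(REQUIRED, INT32, "id"));
MessageType wider = new MessageType("m",
        new PrimitiveType(REQUIRED, INT32, "id"),
        new PrimitiveType(OPTIONAL, BINARY, "name"));
// strict = true rejects conflicting types for fields that appear in both schemas;
// "name" exists only in 'wider', so it is appended to the merged result.
MessageType merged = base.union(wider, true);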
private static MessageType parse(String schemaString) {
    Tokenizer st = new Tokenizer(schemaString, " ;{}()\n\t");
    String t = st.nextToken();
    check(t, "message", "start with 'message'", st);
    String name = st.nextToken();
    Type[] fields = readGroupTypeFields(st.nextToken(), st);
    return new MessageType(name, fields);
}
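The string this parser consumes is Parquet's standard textual schema format; a short example through the public MessageTypeParser entry point (field names and the UTF8 annotation are illustrative):

import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;

MessageType schema = MessageTypeParser.parseMessageType(
        "message example_schema {\n"
        + "  required int64 id;\n"
        + "  optional binary name (UTF8);\n"
        + "  repeated group tags {\n"
        + "    required binary value (UTF8);\n"
        + "  }\n"
        + "}");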