/**
 * Looks up the Avro schema of a directory by delegating to
 * {@link AvroUtils#getDirectorySchema(Path, FileSystem, boolean)} with this instance's
 * file system and the {@code latest} flag set to {@code true}.
 *
 * @param directory the directory whose schema is requested
 * @return the schema returned by the delegate
 * @throws IOException if the delegate fails
 */
protected Schema getDirectorySchema(Path directory) throws IOException {
  final boolean useLatest = true;
  Schema directorySchema = AvroUtils.getDirectorySchema(directory, this.fs, useLatest);
  return directorySchema;
}
/**
 * Writes a schema to a file using the default permission
 * {@code FsPermission(FsAction.ALL, FsAction.ALL, FsAction.READ)}, i.e. full access for
 * user and group and read-only access for others.
 *
 * @param schema the schema to write
 * @param filePath destination of the schema file
 * @param fs the file system that holds {@code filePath}
 * @param overwrite passed through unchanged to the five-argument overload
 * @throws IOException if the five-argument overload fails
 */
public static void writeSchemaToFile(Schema schema, Path filePath, FileSystem fs, boolean overwrite)
    throws IOException {
  FsPermission defaultPermission = new FsPermission(FsAction.ALL, FsAction.ALL, FsAction.READ);
  writeSchemaToFile(schema, filePath, fs, overwrite, defaultPermission);
}
/**
 * Reads the schema stored at the given location by delegating to
 * {@code AvroUtils.parseSchemaFromFile}.
 *
 * @param schemaUrl location of the schema file
 * @param fs the file system that holds {@code schemaUrl}
 * @return the parsed schema
 * @throws IOException if the delegate fails
 */
public static Schema getSchemaFromUrl(Path schemaUrl, FileSystem fs) throws IOException {
  Schema parsedSchema = AvroUtils.parseSchemaFromFile(schemaUrl, fs);
  return parsedSchema;
}
if (dedupKeyOption == DedupKeyOption.ALL) { LOG.info("Using all attributes in the schema (except Map, Arrar and Enum fields) for compaction"); keySchema = AvroUtils.removeUncomparableFields(topicSchema).get(); } else if (dedupKeyOption == DedupKeyOption.KEY) { LOG.info("Using key attributes in the schema for compaction"); keySchema = AvroUtils.removeUncomparableFields(getKeySchema(topicSchema)).get(); } else if (keySchemaFileSpecified()) { Path keySchemaFile = getKeySchemaFile(); LOG.info("Using attributes specified in schema file " + keySchemaFile + " for compaction"); try { keySchema = AvroUtils.parseSchemaFromFile(keySchemaFile, this.fs); } catch (IOException e) { LOG.error("Failed to parse avro schema from " + keySchemaFile + ", using key attributes in the schema for compaction"); keySchema = AvroUtils.removeUncomparableFields(getKeySchema(topicSchema)).get(); keySchema = AvroUtils.removeUncomparableFields(getKeySchema(topicSchema)).get(); keySchema = AvroUtils.removeUncomparableFields(getKeySchema(topicSchema)).get();
val = getObjectFromMap((Map)data, pathList.get(field)); } else if (data instanceof List) { val = getObjectFromArray((List)data, Integer.parseInt(pathList.get(field))); } else { val = ((Record)data).get(pathList.get(field)); AvroUtils.getFieldHelper(retVal, getObjectFromMap((Map) data, pathList.get(field)), pathList, ++field); return; List<String> newPathList = new ArrayList<>(pathList); newPathList.set(field, String.valueOf(i)); AvroUtils.getFieldHelper(retVal, val, newPathList, field + 1); i++; .getFieldHelper(retVal, getObjectFromArray((List) data, Integer.parseInt(pathList.get(field))), pathList, ++field); AvroUtils.getFieldHelper(retVal, ((Record) data).get(pathList.get(field)), pathList, ++field); return;
/**
 * Strips map, array and enum fields from an Avro schema, together with union fields that
 * contain a map, array or enum. A schema that still has such fields cannot serve as the
 * Mapper key of a MapReduce job.
 *
 * <p>Starts the two-argument overload with an empty set of already-processed schemas.
 *
 * @param schema the schema to clean
 * @return the result of the two-argument overload
 */
public static Optional<Schema> removeUncomparableFields(Schema schema) {
  Set<Schema> processed = Sets.newHashSet();
  return removeUncomparableFields(schema, processed);
}
/**
 * Collects the values found at {@code fieldLocation} inside {@code record}.
 *
 * <p>The location is split on {@code FIELD_LOCATION_DELIMITER}; segments are trimmed and
 * empty segments are dropped. The actual traversal is done by the four-argument
 * {@code getFieldHelper} overload, which fills the returned map.
 *
 * @param record the record to read from; must not be null
 * @param fieldLocation the delimited path of the field(s); must not be null or empty
 * @return an immutable empty map when the location has no non-empty segment, otherwise the
 *         map populated by the helper
 */
public static Map<String, Object> getMultiFieldValue(GenericRecord record, String fieldLocation) {
  Preconditions.checkNotNull(record);
  Preconditions.checkArgument(!Strings.isNullOrEmpty(fieldLocation));

  List<String> pathList =
      Splitter.on(FIELD_LOCATION_DELIMITER).omitEmptyStrings().trimResults().splitToList(fieldLocation);
  if (pathList.isEmpty()) {
    return Collections.emptyMap();
  }

  HashMap<String, Object> collected = new HashMap<String, Object>();
  AvroUtils.getFieldHelper(collected, record, pathList, 0);
  return collected;
}
/**
 * Gathers every Avro file nested under {@code directory} and orders the result with
 * {@code FileListUtils.LATEST_MOD_TIME_ORDER}.
 *
 * @param directory the root directory to scan
 * @param fs the file system that holds {@code directory}
 * @return the ordered files; an empty list when the directory does not exist or holds no
 *         Avro file
 * @throws IOException if the file system cannot be queried
 */
private static List<FileStatus> getDirectorySchemaHelper(Path directory, FileSystem fs) throws IOException {
  List<FileStatus> avroFiles = Lists.newArrayList();
  if (!fs.exists(directory)) {
    return avroFiles;
  }

  getAllNestedAvroFiles(fs.getFileStatus(directory), avroFiles, fs);
  if (!avroFiles.isEmpty()) {
    Collections.sort(avroFiles, FileListUtils.LATEST_MOD_TIME_ORDER);
  }
  return avroFiles;
}
/**
 * Returns the schema of the nested field that {@code fieldLocation} points to inside
 * {@code schema}.
 *
 * <p>{@code fieldLocation} is an ordered, delimited path. For example,
 * {@code field1.nestedField1} takes the schema of the field "field1" and retrieves the
 * schema of "nestedField1" from it. Segments are trimmed and empty segments are dropped.
 *
 * @param schema the schema to search; must not be null
 * @param fieldLocation the location of the field; must not be null or empty
 * @return the schema of the field, or absent when the location has no non-empty segment or
 *         the helper finds no such field
 */
public static Optional<Schema> getFieldSchema(Schema schema, String fieldLocation) {
  Preconditions.checkNotNull(schema);
  Preconditions.checkArgument(!Strings.isNullOrEmpty(fieldLocation));

  Iterable<String> segments =
      Splitter.on(FIELD_LOCATION_DELIMITER).omitEmptyStrings().trimResults().split(fieldLocation);
  List<String> pathList = Lists.newArrayList(segments);
  if (pathList.isEmpty()) {
    return Optional.absent();
  }
  return AvroUtils.getFieldSchemaHelper(schema, pathList, 0);
}
@Override protected Object decodeRecord(ByteArrayBasedKafkaRecord messageAndOffset) throws IOException { Object deserialized = kafkaDeserializer.deserialize(this.topicName, messageAndOffset.getMessageBytes()); // For Confluent's Schema Registry the read schema is the latest registered schema to support schema evolution return (this.latestSchema == null) ? deserialized : AvroUtils.convertRecordSchema((GenericRecord) deserialized, this.latestSchema); }
/**
 * Reads the Avro schema of one file found under a directory.
 *
 * <p>The candidate files come from {@code getDirectorySchemaHelper}; the first entry of
 * that list is used when {@code latest} is true, the last entry otherwise.
 *
 * @param directory the input dir that contains avro files
 * @param fs the {@link FileSystem} for the given directory
 * @param latest true to return the latest schema, false to return the oldest schema
 * @return the schema of the selected file, or {@code null} when no avro file is found
 * @throws IOException wrapping any I/O failure, with the directory named in the message
 */
public static Schema getDirectorySchema(Path directory, FileSystem fs, boolean latest) throws IOException {
  try (Closer closer = Closer.create()) {
    List<FileStatus> files = getDirectorySchemaHelper(directory, fs);
    if (files == null || files.isEmpty()) {
      LOG.warn("There is no previous avro file in the directory: " + directory);
      return null;
    }

    int chosenIndex = latest ? 0 : files.size() - 1;
    FileStatus file = files.get(chosenIndex);
    LOG.debug("Path to get the avro schema: " + file);

    FsInput input = new FsInput(file.getPath(), fs.getConf());
    GenericDatumReader<GenericRecord> datumReader = new GenericDatumReader<>();
    // Registering the reader ensures it is closed when the try block exits.
    DataFileReader<GenericRecord> fileReader =
        closer.register(new DataFileReader<GenericRecord>(input, datumReader));
    return fileReader.getSchema();
  } catch (IOException ioe) {
    throw new IOException("Cannot get the schema for directory " + directory, ioe);
  }
}
if (dedupKeyOption == MRCompactorAvroKeyDedupJobRunner.DedupKeyOption.ALL) { log.info("Using all attributes in the schema (except Map, Arrar and Enum fields) for compaction"); keySchema = AvroUtils.removeUncomparableFields(topicSchema).get(); } else if (dedupKeyOption == MRCompactorAvroKeyDedupJobRunner.DedupKeyOption.KEY) { log.info("Using key attributes in the schema for compaction"); keySchema = AvroUtils.removeUncomparableFields(MRCompactorAvroKeyDedupJobRunner.getKeySchema(topicSchema)).get(); } else if (keySchemaFileSpecified) { Path keySchemaFile = new Path(state.getProp(MRCompactorAvroKeyDedupJobRunner.COMPACTION_JOB_AVRO_KEY_SCHEMA_LOC)); log.info("Using attributes specified in schema file " + keySchemaFile + " for compaction"); try { keySchema = AvroUtils.parseSchemaFromFile(keySchemaFile, this.fs); } catch (IOException e) { log.error("Failed to parse avro schema from " + keySchemaFile + ", using key attributes in the schema for compaction"); keySchema = AvroUtils.removeUncomparableFields(MRCompactorAvroKeyDedupJobRunner.getKeySchema(topicSchema)).get(); log.warn(String.format("Key schema %s is not compatible with record schema %s.", keySchema, topicSchema) + "Using key attributes in the schema for compaction"); keySchema = AvroUtils.removeUncomparableFields(MRCompactorAvroKeyDedupJobRunner.getKeySchema(topicSchema)).get(); keySchema = AvroUtils.removeUncomparableFields(MRCompactorAvroKeyDedupJobRunner.getKeySchema(topicSchema)).get();
private static Optional<Schema> removeUncomparableFieldsFromUnion(Schema union, Set<Schema> processed) { Preconditions.checkArgument(union.getType() == Schema.Type.UNION); if (processed.contains(union)) { return Optional.absent(); } processed.add(union); List<Schema> newUnion = Lists.newArrayList(); for (Schema unionType : union.getTypes()) { Optional<Schema> newType = removeUncomparableFields(unionType, processed); if (newType.isPresent()) { newUnion.add(newType.get()); } } // Discard the union field if one or more types are removed from the union. if (newUnion.size() != union.getTypes().size()) { return Optional.absent(); } return Optional.of(Schema.createUnion(newUnion)); }
/**
 * Returns the field that {@code fieldLocation} points to inside {@code schema}.
 *
 * <p>{@code fieldLocation} is an ordered, delimited path. For example,
 * {@code field1.nestedField1} takes field "field1" and retrieves "nestedField1" from it.
 * Segments are trimmed and empty segments are dropped.
 *
 * @param schema the schema to search; must not be null
 * @param fieldLocation the location of the field; must not be null or empty
 * @return the field, or absent when the location has no non-empty segment or the helper
 *         finds no such field
 */
public static Optional<Field> getField(Schema schema, String fieldLocation) {
  Preconditions.checkNotNull(schema);
  Preconditions.checkArgument(!Strings.isNullOrEmpty(fieldLocation));

  Iterable<String> segments =
      Splitter.on(FIELD_LOCATION_DELIMITER).omitEmptyStrings().trimResults().split(fieldLocation);
  List<String> pathList = Lists.newArrayList(segments);
  if (pathList.isEmpty()) {
    return Optional.absent();
  }
  return AvroUtils.getFieldHelper(schema, pathList, 0);
}
/**
 * Recursively appends to {@code files} every file under {@code dir} whose name ends with
 * {@code AVRO_SUFFIX}.
 *
 * @param dir the file or directory to inspect
 * @param files accumulator that receives the matching files
 * @param fs the file system used to list directories
 * @throws IOException if a directory cannot be listed
 */
private static void getAllNestedAvroFiles(FileStatus dir, List<FileStatus> files, FileSystem fs)
    throws IOException {
  if (!dir.isDirectory()) {
    // Plain file: keep it only if it carries the avro suffix.
    if (dir.getPath().getName().endsWith(AVRO_SUFFIX)) {
      files.add(dir);
    }
    return;
  }

  FileStatus[] children = fs.listStatus(dir.getPath());
  if (children == null) {
    return;
  }
  for (FileStatus child : children) {
    getAllNestedAvroFiles(child, files, fs);
  }
}
/**
 * Helper method that does the actual work for {@link #getFieldSchema(Schema, String)}.
 *
 * <p>For a record, the current path segment is looked up as a field name. For a map, the
 * current path segment is not looked up in the schema and traversal continues with the
 * map's value type.
 *
 * @param schema passed from {@link #getFieldSchema(Schema, String)}
 * @param pathList passed from {@link #getFieldSchema(Schema, String)}
 * @param field keeps track of the index used to access the list pathList
 * @return the schema of the field, or absent when a record has no field with the current
 *         segment's name
 * @throws AvroRuntimeException when the schema is a union or any type other than map or
 *         record
 */
private static Optional<Schema> getFieldSchemaHelper(Schema schema, List<String> pathList, int field) {
  switch (schema.getType()) {
    case UNION:
      throw new AvroRuntimeException("Union of complex types cannot be handled : " + schema);
    case MAP: {
      Schema valueSchema = schema.getValueType();
      if ((field + 1) == pathList.size()) {
        return Optional.fromNullable(valueSchema);
      }
      return AvroUtils.getFieldSchemaHelper(valueSchema, pathList, field + 1);
    }
    case RECORD: {
      Field recordField = schema.getField(pathList.get(field));
      if (recordField == null) {
        return Optional.absent();
      }
      Schema nestedSchema = recordField.schema();
      if ((field + 1) == pathList.size()) {
        return Optional.fromNullable(nestedSchema);
      }
      return AvroUtils.getFieldSchemaHelper(nestedSchema, pathList, field + 1);
    }
    default:
      throw new AvroRuntimeException("Invalid type in schema : " + schema);
  }
}
/**
 * Gets the avro schema for a directory, resolving the file system from a configuration
 * via {@code FileSystem.get(conf)} before delegating to the {@link FileSystem}-based
 * overload.
 *
 * @param directory the input dir that contains avro files
 * @param conf configuration used to obtain the file system
 * @param latest true to return the latest schema, false to return the oldest schema
 * @return the latest/oldest schema in the directory
 * @throws IOException if the file system cannot be obtained or the overload fails
 */
public static Schema getDirectorySchema(Path directory, Configuration conf, boolean latest)
    throws IOException {
  FileSystem resolvedFs = FileSystem.get(conf);
  return getDirectorySchema(directory, resolvedFs, latest);
}
/**
 * Returns the path of the schema file for {@code schema}, writing a new file under
 * {@code schemaDir} the first time a given schema is seen.
 *
 * <p>Schemas are identified by the SHA-256 hash of their string form. The file name is
 * the current time in milliseconds followed by that hash, so two different schemas
 * generated within the same millisecond can no longer map to the same file and overwrite
 * each other.
 *
 * @param schema the schema to look up or persist; must not be null
 * @return the path recorded in {@code schemaPaths} for this schema
 * @throws IOException if the schema file cannot be written
 */
private Path getOrGenerateSchemaFile(Schema schema) throws IOException {
  Preconditions.checkNotNull(schema, "Avro Schema should not be null");

  String hashedSchema = Hashing.sha256().hashString(schema.toString(), StandardCharsets.UTF_8).toString();

  if (!this.schemaPaths.containsKey(hashedSchema)) {
    // A timestamp alone is not unique per schema; the hash suffix makes the name collision-free.
    String fileName = System.currentTimeMillis() + "_" + hashedSchema + ".avsc";
    Path schemaFilePath = new Path(this.schemaDir, fileName);
    AvroUtils.writeSchemaToFile(schema, schemaFilePath, fs, true);
    this.schemaPaths.put(hashedSchema, schemaFilePath);
  }

  return this.schemaPaths.get(hashedSchema);
}
}
/**
 * Builds a copy of a record schema that keeps only the fields whose schema survives
 * {@code removeUncomparableFields}.
 *
 * <p>The copy has the same name, doc and namespace as the original and is created as a
 * non-error record. Kept fields retain their name, doc and default value.
 *
 * @param record the record schema; its type must be {@code RECORD}
 * @param processed schemas already visited; a record found in this set yields absent, and
 *        the record is added to the set otherwise
 * @return the new record schema, or absent when the record was already processed
 */
private static Optional<Schema> removeUncomparableFieldsFromRecord(Schema record, Set<Schema> processed) {
  Preconditions.checkArgument(record.getType() == Schema.Type.RECORD);

  // add() returns false when the record was already in the set.
  if (!processed.add(record)) {
    return Optional.absent();
  }

  List<Field> keptFields = Lists.newArrayList();
  for (Field original : record.getFields()) {
    Optional<Schema> cleaned = removeUncomparableFields(original.schema(), processed);
    if (!cleaned.isPresent()) {
      continue;
    }
    keptFields.add(new Field(original.name(), cleaned.get(), original.doc(), original.defaultValue()));
  }

  Schema stripped = Schema.createRecord(record.getName(), record.getDoc(), record.getNamespace(), false);
  stripped.setFields(keptFields);
  return Optional.of(stripped);
}
/**
 * Helper method that does the actual work for {@link #getField(Schema, String)}.
 *
 * <p>The current path segment is looked up as a field of {@code schema}. If it is the last
 * segment the field is returned. Otherwise traversal continues into the field's schema:
 * directly for a record, or into the value type for a map.
 *
 * @param schema passed from {@link #getField(Schema, String)}
 * @param pathList passed from {@link #getField(Schema, String)}
 * @param field keeps track of the index used to access the list pathList
 * @return the field, or absent when any segment of the path names a field that does not
 *         exist
 * @throws AvroRuntimeException when an intermediate field is a union or any type other
 *         than map or record
 */
private static Optional<Field> getFieldHelper(Schema schema, List<String> pathList, int field) {
  Field curField = schema.getField(pathList.get(field));
  if (field + 1 == pathList.size()) {
    return Optional.fromNullable(curField);
  }

  // An intermediate segment that does not resolve means the requested field cannot exist;
  // report absent instead of failing with a NullPointerException below.
  if (curField == null) {
    return Optional.absent();
  }

  Schema fieldSchema = curField.schema();
  switch (fieldSchema.getType()) {
    case UNION:
      throw new AvroRuntimeException("Union of complex types cannot be handled : " + schema);
    case MAP:
      return AvroUtils.getFieldHelper(fieldSchema.getValueType(), pathList, field + 1);
    case RECORD:
      return AvroUtils.getFieldHelper(fieldSchema, pathList, field + 1);
    default:
      throw new AvroRuntimeException("Invalid type in schema : " + schema);
  }
}