gobblin.util.AvroUtils java code examples

/**
 * Resolves the Avro schema of a directory on this instance's file system by delegating to
 * {@link AvroUtils#getDirectorySchema(Path, FileSystem, boolean)}, always asking for the latest schema.
 *
 * @param directory the directory whose schema is requested
 * @return the schema reported by the delegate for {@code directory}
 * @throws IOException if the delegate fails to read the schema
 */
protected Schema getDirectorySchema(Path directory) throws IOException {
 final boolean useLatest = true;
 Schema directorySchema = AvroUtils.getDirectorySchema(directory, this.fs, useLatest);
 return directorySchema;
}

/**
 * Writes {@code schema} to {@code filePath} using a default permission set, by delegating to the
 * overload that takes an explicit {@link FsPermission}.
 *
 * @param schema the schema to write
 * @param filePath destination of the schema file
 * @param fs the {@link FileSystem} to write to
 * @param overwrite forwarded unchanged to the permission-taking overload
 * @throws IOException if the delegate fails
 */
public static void writeSchemaToFile(Schema schema, Path filePath, FileSystem fs, boolean overwrite)
  throws IOException {
 // Default permissions: ALL, ALL, READ (in FsPermission's constructor order).
 FsPermission defaultPermission = new FsPermission(FsAction.ALL, FsAction.ALL, FsAction.READ);
 writeSchemaToFile(schema, filePath, fs, overwrite, defaultPermission);
}

/**
 * Loads an Avro schema from the file at {@code schemaUrl} via
 * {@code AvroUtils.parseSchemaFromFile}.
 *
 * @param schemaUrl path of the schema file
 * @param fs the {@link FileSystem} that holds the schema file
 * @return the parsed schema
 * @throws IOException if the schema file cannot be parsed
 */
public static Schema getSchemaFromUrl(Path schemaUrl, FileSystem fs) throws IOException {
 Schema parsedSchema = AvroUtils.parseSchemaFromFile(schemaUrl, fs);
 return parsedSchema;
}

if (dedupKeyOption == DedupKeyOption.ALL) {
 LOG.info("Using all attributes in the schema (except Map, Arrar and Enum fields) for compaction");
 keySchema = AvroUtils.removeUncomparableFields(topicSchema).get();
} else if (dedupKeyOption == DedupKeyOption.KEY) {
 LOG.info("Using key attributes in the schema for compaction");
 keySchema = AvroUtils.removeUncomparableFields(getKeySchema(topicSchema)).get();
} else if (keySchemaFileSpecified()) {
 Path keySchemaFile = getKeySchemaFile();
 LOG.info("Using attributes specified in schema file " + keySchemaFile + " for compaction");
 try {
  keySchema = AvroUtils.parseSchemaFromFile(keySchemaFile, this.fs);
 } catch (IOException e) {
  LOG.error("Failed to parse avro schema from " + keySchemaFile
    + ", using key attributes in the schema for compaction");
  keySchema = AvroUtils.removeUncomparableFields(getKeySchema(topicSchema)).get();
  keySchema = AvroUtils.removeUncomparableFields(getKeySchema(topicSchema)).get();
 keySchema = AvroUtils.removeUncomparableFields(getKeySchema(topicSchema)).get();

  val = getObjectFromMap((Map)data, pathList.get(field));
 } else if (data instanceof List) {
  val = getObjectFromArray((List)data, Integer.parseInt(pathList.get(field)));
 } else {
  val = ((Record)data).get(pathList.get(field));
 AvroUtils.getFieldHelper(retVal, getObjectFromMap((Map) data, pathList.get(field)), pathList, ++field);
 return;
   List<String> newPathList = new ArrayList<>(pathList);
   newPathList.set(field, String.valueOf(i));
   AvroUtils.getFieldHelper(retVal, val, newPathList, field + 1);
   i++;
    .getFieldHelper(retVal, getObjectFromArray((List) data, Integer.parseInt(pathList.get(field))), pathList, ++field);
AvroUtils.getFieldHelper(retVal, ((Record) data).get(pathList.get(field)), pathList, ++field);
return;

/**
 * Strips map, array and enum fields (and union fields containing any of those) from an Avro
 * schema. Schemas that still contain such fields cannot serve as the Mapper key of a MapReduce job.
 *
 * <p>Entry point for the recursive overload; starts with an empty set of already-processed schemas.
 *
 * @param schema the schema to clean up
 * @return the result of the recursive overload for {@code schema}
 */
public static Optional<Schema> removeUncomparableFields(Schema schema) {
 Set<Schema> alreadyProcessed = Sets.<Schema> newHashSet();
 return removeUncomparableFields(schema, alreadyProcessed);
}

/**
 * Collects the value(s) located at {@code fieldLocation} inside {@code record}.
 *
 * <p>The location is split on {@code FIELD_LOCATION_DELIMITER}; segments are trimmed and empty
 * segments are dropped. The resulting path is handed to {@code getFieldHelper}, which fills the
 * returned map.
 *
 * @param record the record to read from; must not be null
 * @param fieldLocation the delimited path of the field; must not be null or empty
 * @return an empty map when the location contains no usable segment, otherwise the map
 *         populated by {@code getFieldHelper}
 */
public static Map<String, Object> getMultiFieldValue(GenericRecord record, String fieldLocation) {
 Preconditions.checkNotNull(record);
 Preconditions.checkArgument(!Strings.isNullOrEmpty(fieldLocation));

 List<String> pathList =
   Splitter.on(FIELD_LOCATION_DELIMITER).omitEmptyStrings().trimResults().splitToList(fieldLocation);
 if (pathList.isEmpty()) {
  return Collections.emptyMap();
 }

 HashMap<String, Object> collected = new HashMap<>();
 AvroUtils.getFieldHelper(collected, record, pathList, 0);
 return collected;
}

/**
 * Gathers every Avro file nested under {@code directory}, ordered by
 * {@code FileListUtils.LATEST_MOD_TIME_ORDER}.
 *
 * @param directory the root directory to scan
 * @param fs the {@link FileSystem} that holds {@code directory}
 * @return the ordered Avro files; empty when the directory does not exist or holds no Avro file
 * @throws IOException if the file system cannot be queried
 */
private static List<FileStatus> getDirectorySchemaHelper(Path directory, FileSystem fs) throws IOException {
 List<FileStatus> avroFiles = Lists.newArrayList();
 if (!fs.exists(directory)) {
  return avroFiles;
 }

 getAllNestedAvroFiles(fs.getFileStatus(directory), avroFiles, fs);
 if (!avroFiles.isEmpty()) {
  Collections.sort(avroFiles, FileListUtils.LATEST_MOD_TIME_ORDER);
 }
 return avroFiles;
}

/**
 * Returns the schema of a (possibly nested) field of {@code schema}. The {@code fieldLocation}
 * parameter is an ordered, delimited path to the field. For example, field1.nestedField1 takes
 * the schema of the field "field1" and retrieves the schema of "nestedField1" from it.
 *
 * @param schema the schema to look the field up in; must not be null
 * @param fieldLocation the location of the field; must not be null or empty
 * @return the schema of the field, or absent when the location contains no usable path segment
 *         or the lookup helper finds nothing
 */
public static Optional<Schema> getFieldSchema(Schema schema, String fieldLocation) {
 Preconditions.checkNotNull(schema);
 Preconditions.checkArgument(!Strings.isNullOrEmpty(fieldLocation));

 Iterable<String> segments =
   Splitter.on(FIELD_LOCATION_DELIMITER).omitEmptyStrings().trimResults().split(fieldLocation);
 List<String> pathList = Lists.newArrayList(segments);
 if (pathList.isEmpty()) {
  return Optional.absent();
 }
 return AvroUtils.getFieldSchemaHelper(schema, pathList, 0);
}

/**
 * Deserializes the payload of a Kafka record and, when a latest schema is known, converts the
 * result to that schema.
 *
 * @param messageAndOffset the Kafka record whose message bytes are decoded
 * @return the deserialized object, converted to {@code latestSchema} if one is set
 * @throws IOException if the schema conversion fails
 */
@Override
protected Object decodeRecord(ByteArrayBasedKafkaRecord messageAndOffset) throws IOException {
 Object decoded = kafkaDeserializer.deserialize(this.topicName, messageAndOffset.getMessageBytes());
 if (this.latestSchema == null) {
  return decoded;
 }

 // For Confluent's Schema Registry the read schema is the latest registered schema to support schema evolution
 return AvroUtils.convertRecordSchema((GenericRecord) decoded, this.latestSchema);
}

/**
 * Get the latest (or oldest) avro schema for a directory.
 *
 * <p>The schema is read from the header of the first or last file in the list returned by
 * {@code getDirectorySchemaHelper}: the first entry when {@code latest} is true, the last
 * entry otherwise.
 *
 * @param directory the input dir that contains avro files
 * @param fs the {@link FileSystem} for the given directory.
 * @param latest true to return latest schema, false to return oldest schema
 * @return the latest/oldest schema in the directory, or {@code null} when the directory contains
 *         no avro file
 * @throws IOException if listing the directory or reading the avro file fails; the original
 *         exception is preserved as the cause
 */
public static Schema getDirectorySchema(Path directory, FileSystem fs, boolean latest) throws IOException {
 Schema schema = null;
 try (Closer closer = Closer.create()) {
  List<FileStatus> files = getDirectorySchemaHelper(directory, fs);
  if (files == null || files.isEmpty()) {
   LOG.warn("There is no previous avro file in the directory: " + directory);
  } else {
   FileStatus file = latest ? files.get(0) : files.get(files.size() - 1);
   LOG.debug("Path to get the avro schema: " + file);
   // Register the raw input as well: if the DataFileReader constructor throws (e.g. the file is
   // not a valid avro container) the reader is never registered and the stream would leak.
   FsInput fi = closer.register(new FsInput(file.getPath(), fs.getConf()));
   GenericDatumReader<GenericRecord> genReader = new GenericDatumReader<>();
   DataFileReader<GenericRecord> reader = closer.register(new DataFileReader<>(fi, genReader));
   schema = reader.getSchema();
  }
 } catch (IOException ioe) {
  throw new IOException("Cannot get the schema for directory " + directory, ioe);
 }
 return schema;
}

if (dedupKeyOption == MRCompactorAvroKeyDedupJobRunner.DedupKeyOption.ALL) {
 log.info("Using all attributes in the schema (except Map, Arrar and Enum fields) for compaction");
 keySchema = AvroUtils.removeUncomparableFields(topicSchema).get();
} else if (dedupKeyOption == MRCompactorAvroKeyDedupJobRunner.DedupKeyOption.KEY) {
 log.info("Using key attributes in the schema for compaction");
 keySchema = AvroUtils.removeUncomparableFields(MRCompactorAvroKeyDedupJobRunner.getKeySchema(topicSchema)).get();
} else if (keySchemaFileSpecified) {
 Path keySchemaFile = new Path(state.getProp(MRCompactorAvroKeyDedupJobRunner.COMPACTION_JOB_AVRO_KEY_SCHEMA_LOC));
 log.info("Using attributes specified in schema file " + keySchemaFile + " for compaction");
 try {
  keySchema = AvroUtils.parseSchemaFromFile(keySchemaFile, this.fs);
 } catch (IOException e) {
  log.error("Failed to parse avro schema from " + keySchemaFile
      + ", using key attributes in the schema for compaction");
  keySchema = AvroUtils.removeUncomparableFields(MRCompactorAvroKeyDedupJobRunner.getKeySchema(topicSchema)).get();
  log.warn(String.format("Key schema %s is not compatible with record schema %s.", keySchema, topicSchema)
      + "Using key attributes in the schema for compaction");
  keySchema = AvroUtils.removeUncomparableFields(MRCompactorAvroKeyDedupJobRunner.getKeySchema(topicSchema)).get();
 keySchema = AvroUtils.removeUncomparableFields(MRCompactorAvroKeyDedupJobRunner.getKeySchema(topicSchema)).get();

/**
 * Applies {@code removeUncomparableFields} to every member type of a union.
 *
 * @param union a schema of type UNION
 * @param processed schemas already visited; {@code union} is added to it
 * @return absent when {@code union} was already visited or when any member type was removed;
 *         otherwise a new union built from the cleaned member types
 */
private static Optional<Schema> removeUncomparableFieldsFromUnion(Schema union, Set<Schema> processed) {
 Preconditions.checkArgument(union.getType() == Schema.Type.UNION);

 // Set.add reports false when the union was already in the set, i.e. already visited.
 if (!processed.add(union)) {
  return Optional.absent();
 }

 List<Schema> memberTypes = union.getTypes();
 List<Schema> keptTypes = Lists.newArrayList();
 for (Schema memberType : memberTypes) {
  Optional<Schema> cleaned = removeUncomparableFields(memberType, processed);
  if (cleaned.isPresent()) {
   keptTypes.add(cleaned.get());
  }
 }

 // Discard the union field if one or more types are removed from the union.
 if (keptTypes.size() == memberTypes.size()) {
  return Optional.of(Schema.createUnion(keptTypes));
 }
 return Optional.absent();
}

/**
 * Returns a (possibly nested) field of {@code schema}. The {@code fieldLocation} parameter is an
 * ordered, delimited path to the field. For example, field1.nestedField1 takes field "field1"
 * and retrieves "nestedField1" from it.
 *
 * @param schema the schema to look the field up in; must not be null
 * @param fieldLocation the location of the field; must not be null or empty
 * @return the field, or absent when the location contains no usable path segment or the lookup
 *         helper finds nothing
 */
public static Optional<Field> getField(Schema schema, String fieldLocation) {
 Preconditions.checkNotNull(schema);
 Preconditions.checkArgument(!Strings.isNullOrEmpty(fieldLocation));

 Iterable<String> segments =
   Splitter.on(FIELD_LOCATION_DELIMITER).omitEmptyStrings().trimResults().split(fieldLocation);
 List<String> pathList = Lists.newArrayList(segments);
 if (pathList.isEmpty()) {
  return Optional.absent();
 }
 return AvroUtils.getFieldHelper(schema, pathList, 0);
}

/**
 * Recursively walks {@code dir} and appends every file whose name ends with {@code AVRO_SUFFIX}
 * to {@code files}.
 *
 * @param dir the directory (or plain file) to inspect
 * @param files accumulator that receives the matching files
 * @param fs the {@link FileSystem} used to list directories
 * @throws IOException if a directory cannot be listed
 */
private static void getAllNestedAvroFiles(FileStatus dir, List<FileStatus> files, FileSystem fs) throws IOException {
 if (!dir.isDirectory()) {
  // Plain file: keep it only when it carries the avro suffix.
  if (dir.getPath().getName().endsWith(AVRO_SUFFIX)) {
   files.add(dir);
  }
  return;
 }

 FileStatus[] children = fs.listStatus(dir.getPath());
 if (children == null) {
  return;
 }
 for (FileStatus child : children) {
  getAllNestedAvroFiles(child, files, fs);
 }
}

/**
 * Recursive worker behind {@link #getFieldSchema(Schema, String)}: resolves one path segment per
 * call and descends until the last segment is reached.
 *
 * @param schema the schema to resolve the current segment against
 * @param pathList the path segments, as produced by {@link #getFieldSchema(Schema, String)}
 * @param field index into {@code pathList} of the segment handled by this call
 * @return the schema of the field; absent when a record does not declare the requested field
 * @throws AvroRuntimeException if {@code schema} is a union or any type other than map or record
 */
private static Optional<Schema> getFieldSchemaHelper(Schema schema, List<String> pathList, int field) {
 switch (schema.getType()) {
  case RECORD: {
   Schema.Field member = schema.getField(pathList.get(field));
   if (member == null) {
    return Optional.absent();
   }
   Schema memberSchema = member.schema();
   if (field + 1 == pathList.size()) {
    return Optional.fromNullable(memberSchema);
   }
   return AvroUtils.getFieldSchemaHelper(memberSchema, pathList, field + 1);
  }
  case MAP: {
   // A map consumes one path segment and continues with its value schema.
   Schema valueSchema = schema.getValueType();
   if (field + 1 == pathList.size()) {
    return Optional.fromNullable(valueSchema);
   }
   return AvroUtils.getFieldSchemaHelper(valueSchema, pathList, field + 1);
  }
  case UNION:
   throw new AvroRuntimeException("Union of complex types cannot be handled : " + schema);
  default:
   throw new AvroRuntimeException("Invalid type in schema : " + schema);
 }
}

/**
 * Get the latest (or oldest) avro schema for a directory, resolving the {@link FileSystem} from
 * a Hadoop configuration and delegating to the {@link FileSystem}-based overload.
 *
 * @param directory the input dir that contains avro files
 * @param conf configuration used to obtain the {@link FileSystem}
 * @param latest true to return latest schema, false to return oldest schema
 * @return the latest/oldest schema in the directory
 * @throws IOException if the file system cannot be obtained or the delegate fails
 */
public static Schema getDirectorySchema(Path directory, Configuration conf, boolean latest) throws IOException {
 FileSystem fileSystem = FileSystem.get(conf);
 return getDirectorySchema(directory, fileSystem, latest);
}

 /**
  * If a schema file was already generated for this schema, return its path. If not, create a new
  * temporary schema file and return its path.
  *
  * <p>Schemas are identified by the SHA-256 hash of their string form. The hash is also part of
  * the generated file name: a name made of the current timestamp alone would let two different
  * schemas generated within the same millisecond overwrite each other's file.
  *
  * @param schema the Avro schema to persist; must not be null
  * @return the path of the schema file for {@code schema}
  * @throws IOException if the schema file cannot be written
  */
 private Path getOrGenerateSchemaFile(Schema schema) throws IOException {

  Preconditions.checkNotNull(schema, "Avro Schema should not be null");

  String hashedSchema = Hashing.sha256().hashString(schema.toString(), StandardCharsets.UTF_8).toString();

  Path schemaFilePath = this.schemaPaths.get(hashedSchema);
  if (schemaFilePath == null) {

   String schemaFileName = System.currentTimeMillis() + "-" + hashedSchema + ".avsc";
   schemaFilePath = new Path(this.schemaDir, schemaFileName);
   AvroUtils.writeSchemaToFile(schema, schemaFilePath, fs, true);

   this.schemaPaths.put(hashedSchema, schemaFilePath);
  }

  return schemaFilePath;
 }
}

/**
 * Builds a copy of a record schema that keeps only the fields whose schema survives
 * {@code removeUncomparableFields}.
 *
 * @param record a schema of type RECORD
 * @param processed schemas already visited; {@code record} is added to it
 * @return absent when {@code record} was already visited; otherwise a new record schema with the
 *         same name, doc and namespace and the surviving fields
 */
private static Optional<Schema> removeUncomparableFieldsFromRecord(Schema record, Set<Schema> processed) {
 Preconditions.checkArgument(record.getType() == Schema.Type.RECORD);

 // Set.add reports false when the record was already in the set, i.e. already visited.
 if (!processed.add(record)) {
  return Optional.absent();
 }

 List<Field> keptFields = Lists.newArrayList();
 for (Field original : record.getFields()) {
  Optional<Schema> cleaned = removeUncomparableFields(original.schema(), processed);
  if (!cleaned.isPresent()) {
   continue;
  }
  keptFields.add(new Field(original.name(), cleaned.get(), original.doc(), original.defaultValue()));
 }

 Schema cleanedRecord = Schema.createRecord(record.getName(), record.getDoc(), record.getNamespace(), false);
 cleanedRecord.setFields(keptFields);
 return Optional.of(cleanedRecord);
}

/**
 * Helper method that does the actual work for {@link #getField(Schema, String)}
 *
 * <p>Resolves one path segment per call. A segment that names a record field descends into that
 * record; a segment that names a map field descends into the map's value schema.
 *
 * @param schema passed from {@link #getField(Schema, String)}
 * @param pathList passed from {@link #getField(Schema, String)}
 * @param field keeps track of the index used to access the list pathList
 * @return the field, or absent when any segment of the path names a field that does not exist
 * @throws AvroRuntimeException if an intermediate field is a union or any type other than map
 *         or record
 */
private static Optional<Field> getFieldHelper(Schema schema, List<String> pathList, int field) {
 Field curField = schema.getField(pathList.get(field));
 if (field + 1 == pathList.size()) {
  return Optional.fromNullable(curField);
 }
 // A missing intermediate field means the path cannot be resolved; report it as absent
 // (consistent with getFieldSchemaHelper) instead of failing with a NullPointerException.
 if (curField == null) {
  return Optional.absent();
 }
 Schema fieldSchema = curField.schema();
 switch (fieldSchema.getType()) {
  case UNION:
   throw new AvroRuntimeException("Union of complex types cannot be handled : " + schema);
  case MAP:
   return AvroUtils.getFieldHelper(fieldSchema.getValueType(), pathList, field + 1);
  case RECORD:
   return AvroUtils.getFieldHelper(fieldSchema, pathList, field + 1);
  default:
   throw new AvroRuntimeException("Invalid type in schema : " + schema);
 }
}

Javadoc

A Utils class for dealing with Avro objects

Most used methods

getDirectorySchema
Get the latest avro schema for a directory
writeSchemaToFile
parseSchemaFromFile
Parse Avro schema from a schema file.
removeUncomparableFields
convertRecordSchema
Change the schema of an Avro record.
getAllNestedAvroFiles
getDirectorySchemaHelper
getFieldHelper
Helper method that does the actual work for #getField(Schema,String)
getFieldSchemaHelper
Helper method that does the actual work for #getFieldSchema(Schema,String)
getMultiFieldValue
getObjectFromArray
Get an object from an array given an index.
getObjectFromMap
This method is to get object from map given a key as string. Avro persists string as Utf8

Popular in Java

Finding current android device location
getContentResolver (Context)
compareTo (BigDecimal)
addToBackStack (FragmentTransaction)
SocketException (java.net)
This SocketException may be thrown during socket creation or setting options, and is the superclass
ByteBuffer (java.nio)
A buffer for bytes. A byte buffer can be created in either one of the following ways: * #allocate
Calendar (java.util)
Calendar is an abstract base class for converting between a Date object and a set of integer fields
BlockingQueue (java.util.concurrent)
A java.util.Queue that additionally supports operations that wait for the queue to become non-empty
BorderLayout (java.awt)
A border layout lays out a container, arranging and resizing its components to fit in five regions:
Get (org.apache.hadoop.hbase.client)
Used to perform Get operations on a single row. To get everything for a row, instantiate a Get objec
Top plugins for WebStorm

How to use AvroUtils in gobblin.util

Best Java code snippets using gobblin.util.AvroUtils (Showing top 20 results out of 315)

How to use
AvroUtils
in
gobblin.util