/**
 * Builds a parser whose avro field names are derived through the supplied field name mapper.
 * <p>
 * The mapper is not stored directly: it is first passed through {@code memoize} (defined elsewhere in this class,
 * presumably a caching wrapper) and the result of that call is what the parser keeps.
 *
 * @param dimensionFieldNameMapper  Object that defines the dimension field name transformations
 */
public AvroDimensionRowParser(DimensionFieldNameMapper dimensionFieldNameMapper) {
    this.dimensionFieldNameMapper = memoize(dimensionFieldNameMapper);
}
/** * Returns a stream which parses avro records into dimension rows. * * @param dataFileReader An open file reader for avro records * @param dimension The dimension object used to configure the dimension * * @return A stream over the open file which produces dimension rows * * @throws IllegalArgumentException thrown if JSON object `fields` is not present */ private Stream<DimensionRow> streamDimensionRows( DataFileReader<GenericRecord> dataFileReader, Dimension dimension ) throws IllegalArgumentException { // Validate Schema if (!doesSchemaContainAllDimensionFields(dimension, dataFileReader.getSchema())) { String msg = "The AVRO schema file does not contain all the configured dimension fields"; LOG.error(msg); throw new IllegalArgumentException(msg); } // Generates a set of dimension Rows after retrieving the appropriate fields return StreamSupport.stream(dataFileReader.spliterator(), false) .map(record -> recordToMap(record, dimension)) .map(dimension::parseDimensionRow); }
/** * Parses the avro file and sends dimension rows to a consumer. * * @param dimension The dimension object used to configure the dimension * @param avroFilePath The path of the AVRO data file (.avro) * @param consumer A consumer to process records from the avro file * * @throws IllegalArgumentException thrown if JSON object `fields` is not present */ public void parseAvroFileDimensionRows(Dimension dimension, String avroFilePath, Consumer<DimensionRow> consumer) throws IllegalArgumentException { GenericDatumReader datumReader = new GenericDatumReader(); // Creates an AVRO DataFileReader object that reads the AVRO data file one record at a time try (DataFileReader<GenericRecord> dataFileReader = new DataFileReader<>(new File(avroFilePath), datumReader)) { streamDimensionRows(dataFileReader, dimension).forEach(consumer); } catch (IOException e) { String msg = String.format("Unable to process the file, at the location %s", avroFilePath); LOG.error(msg, e); throw new IllegalArgumentException(msg, e); } }
/**
 * Converts one avro generic record into the field/value map used to build a dimension row.
 * <p>
 * The map has one entry per dimension field of the given dimension. The key is the dimension field's name; the
 * value is resolved from the record using the avro field name produced by the field name mapper.
 *
 * @param genericRecord  The avro record being read
 * @param dimension  The dimension for the row being loaded
 *
 * @return A map of fields and values for a dimension row
 */
private Map<String, String> recordToMap(GenericRecord genericRecord, Dimension dimension) {
    return dimension.getDimensionFields().stream().collect(
            Collectors.toMap(
                    // Key: the dimension field's own name
                    DimensionField::getName,
                    // Value: looked up in the record under the mapper-converted field name
                    field -> resolveRecordValue(genericRecord, dimensionFieldNameMapper.convert(dimension, field))
            )
    );
}
/** * Parses the avro file and returns the dimension rows. * * @param dimension The dimension object used to configure the dimension * @param avroFilePath The path of the AVRO data file (.avro) * * @return A set of dimension rows * * @throws IllegalArgumentException thrown if JSON object `fields` is not present */ public Set<DimensionRow> parseAvroFileDimensionRows(Dimension dimension, String avroFilePath) throws IllegalArgumentException { GenericDatumReader datumReader = new GenericDatumReader(); // Creates an AVRO DataFileReader object that reads the AVRO data file one record at a time try (DataFileReader<GenericRecord> dataFileReader = new DataFileReader<>(new File(avroFilePath), datumReader)) { return streamDimensionRows(dataFileReader, dimension).collect(Collectors.toSet()); } catch (IOException e) { String msg = String.format("Unable to process the file, at the location %s", avroFilePath); LOG.error(msg, e); throw new IllegalArgumentException(msg, e); } }