/**
 * Converts an Avro {@link GenericRecord} into a BigQuery {@link TableRow}, using the field list
 * of the supplied {@link TableSchema} to drive the conversion.
 *
 * <p>See <a href="https://cloud.google.com/bigquery/exporting-data-from-bigquery#config">"Avro
 * format"</a> for more information.
 *
 * @param record the Avro record to convert
 * @param schema the BigQuery table schema whose fields are looked up in {@code record}
 * @return the converted row
 */
static TableRow convertGenericRecordToTableRow(GenericRecord record, TableSchema schema) {
  // Delegate to the field-list overload, which also handles nested records.
  List<TableFieldSchema> tableFields = schema.getFields();
  return convertGenericRecordToTableRow(record, tableFields);
}
/**
 * Builds an Avro record {@link Schema} from a list of BigQuery field schemas.
 *
 * <p>Each BigQuery field is translated with {@code convertField}; the resulting Avro fields are
 * assembled, in the same order, into a non-error record named {@code schemaName}.
 *
 * @param schemaName name given to the generated Avro record
 * @param fieldSchemas BigQuery fields to translate
 * @return the Avro record schema
 */
static Schema toGenericAvroSchema(String schemaName, List<TableFieldSchema> fieldSchemas) {
  List<Field> avroFields = new ArrayList<>();
  for (TableFieldSchema bigQueryField : fieldSchemas) {
    avroFields.add(convertField(bigQueryField));
  }
  // Schema.createRecord takes (name, doc, namespace, isError, fields): the human-readable
  // description is the doc string and the package name is the namespace.
  return Schema.createRecord(
      schemaName,
      "Translated Avro Schema for " + schemaName,
      "org.apache.beam.sdk.io.gcp.bigquery",
      false,
      avroFields);
}
@Nullable private static Object getTypedCellValue(Schema schema, TableFieldSchema fieldSchema, Object v) { // Per https://cloud.google.com/bigquery/docs/reference/v2/tables#schema, the mode field // is optional (and so it may be null), but defaults to "NULLABLE". String mode = firstNonNull(fieldSchema.getMode(), "NULLABLE"); switch (mode) { case "REQUIRED": return convertRequiredField(schema.getType(), schema.getLogicalType(), fieldSchema, v); case "REPEATED": return convertRepeatedField(schema, fieldSchema, v); case "NULLABLE": return convertNullableField(schema, fieldSchema, v); default: throw new UnsupportedOperationException( "Parsing a field with BigQuery field schema mode " + fieldSchema.getMode()); } }
return formatTimestamp(doubleValue.toString()); case "RECORD": verify(v instanceof GenericRecord, "Expected GenericRecord, got %s", v.getClass()); return convertGenericRecordToTableRow((GenericRecord) v, fieldSchema.getFields()); case "BYTES": verify(v instanceof ByteBuffer, "Expected ByteBuffer, got %s", v.getClass());
/**
 * Splits {@code rows} into consecutive batches of at most five rows and passes each batch to
 * {@code writeRowsHelper} together with an increasing shard index.
 *
 * @param tableId name used for the Avro schema generated from {@code schema}
 * @param rows rows to write, in order
 * @param schema BigQuery table schema translated to Avro for writing
 * @param destinationPattern forwarded unchanged to {@code writeRowsHelper}
 * @return the number of shards (batches) handed to {@code writeRowsHelper}
 * @throws IOException declared for failures propagated from {@code writeRowsHelper}
 */
private long writeRows(
    String tableId, List<TableRow> rows, TableSchema schema, String destinationPattern)
    throws IOException {
  Schema avroSchema = BigQueryAvroUtils.toGenericAvroSchema(tableId, schema.getFields());
  List<TableRow> pending = Lists.newArrayList();
  int shardCount = 0;

  for (TableRow current : rows) {
    pending.add(current);
    if (pending.size() != 5) {
      continue;
    }
    // Batch is full: flush it as its own shard and reuse the buffer.
    writeRowsHelper(pending, avroSchema, destinationPattern, shardCount);
    shardCount++;
    pending.clear();
  }

  // Flush the trailing partial batch, if any.
  if (pending.size() > 0) {
    writeRowsHelper(pending, avroSchema, destinationPattern, shardCount);
    shardCount++;
  }
  return shardCount;
}
return BigQueryAvroUtils.formatTimestamp((String) v);
@Nullable private static Object convertNullableField( Schema avroSchema, TableFieldSchema fieldSchema, Object v) { // NULLABLE fields are represented as an Avro Union of the corresponding type and "null". verify( avroSchema.getType() == Type.UNION, "Expected Avro schema type UNION, not %s, for BigQuery NULLABLE field %s", avroSchema.getType(), fieldSchema.getName()); List<Schema> unionTypes = avroSchema.getTypes(); verify( unionTypes.size() == 2, "BigQuery NULLABLE field %s should be an Avro UNION of NULL and another type, not %s", fieldSchema.getName(), unionTypes); if (v == null) { return null; } Type firstType = unionTypes.get(0).getType(); if (!firstType.equals(Type.NULL)) { return convertRequiredField(firstType, unionTypes.get(0).getLogicalType(), fieldSchema, v); } return convertRequiredField( unionTypes.get(1).getType(), unionTypes.get(1).getLogicalType(), fieldSchema, v); }
private static TableRow convertGenericRecordToTableRow( GenericRecord record, List<TableFieldSchema> fields) { TableRow row = new TableRow(); for (TableFieldSchema subSchema : fields) { // Per https://cloud.google.com/bigquery/docs/reference/v2/tables#schema, the name field // is required, so it may not be null. Field field = record.getSchema().getField(subSchema.getName()); Object convertedValue = getTypedCellValue(field.schema(), subSchema, record.get(field.name())); if (convertedValue != null) { // To match the JSON files exported by BigQuery, do not include null values in the output. row.set(field.name(), convertedValue); } } return row; }
/**
 * Translates one BigQuery field schema into the equivalent Avro {@link Field}.
 *
 * <p>RECORD fields are translated recursively into a nested Avro record named after the field.
 * The field mode then decides the wrapping: NULLABLE (or unset) becomes a union with NULL,
 * REQUIRED uses the element schema directly, and REPEATED becomes an Avro array.
 *
 * @param bigQueryField the BigQuery field to translate
 * @return the Avro field, carrying the BigQuery field's name and description
 * @throws IllegalArgumentException if the field's type has no Avro mapping or its mode is
 *     unknown
 */
private static Field convertField(TableFieldSchema bigQueryField) {
  Type avroType = BIG_QUERY_TO_AVRO_TYPES.get(bigQueryField.getType());
  if (avroType == null) {
    // Without this check an unmapped type surfaces as an opaque NullPointerException from
    // Schema.create; report it the same way an unknown mode is reported below.
    throw new IllegalArgumentException(
        String.format("Unknown BigQuery Field Type: %s", bigQueryField.getType()));
  }

  Schema elementSchema;
  if (avroType == Type.RECORD) {
    elementSchema = toGenericAvroSchema(bigQueryField.getName(), bigQueryField.getFields());
  } else {
    elementSchema = Schema.create(avroType);
  }

  Schema fieldSchema;
  String mode = bigQueryField.getMode();
  if (mode == null || "NULLABLE".equals(mode)) {
    fieldSchema = Schema.createUnion(Schema.create(Type.NULL), elementSchema);
  } else if ("REQUIRED".equals(mode)) {
    fieldSchema = elementSchema;
  } else if ("REPEATED".equals(mode)) {
    fieldSchema = Schema.createArray(elementSchema);
  } else {
    throw new IllegalArgumentException(
        String.format("Unknown BigQuery Field Mode: %s", mode));
  }

  return new Field(
      bigQueryField.getName(),
      fieldSchema,
      bigQueryField.getDescription(),
      (Object) null /* Cast to avoid deprecated JsonNode constructor. */);
}
}
private static List<Object> convertRepeatedField( Schema schema, TableFieldSchema fieldSchema, Object v) { Type arrayType = schema.getType(); verify( arrayType == Type.ARRAY, "BigQuery REPEATED field %s should be Avro ARRAY, not %s", fieldSchema.getName(), arrayType); // REPEATED fields are represented as Avro arrays. if (v == null) { // Handle the case of an empty repeated field. return new ArrayList<>(); } @SuppressWarnings("unchecked") List<Object> elements = (List<Object>) v; ArrayList<Object> values = new ArrayList<>(); Type elementType = schema.getElementType().getType(); LogicalType elementLogicalType = schema.getElementType().getLogicalType(); for (Object element : elements) { values.add(convertRequiredField(elementType, elementLogicalType, fieldSchema, element)); } return values; }
/**
 * Converts the Avro record carried by {@code schemaAndRecord} into a {@link TableRow} by
 * delegating to {@code BigQueryAvroUtils.convertGenericRecordToTableRow} with the record and
 * its accompanying table schema.
 *
 * @param schemaAndRecord holder supplying the record and the table schema
 * @return the converted row
 */
@Override public TableRow apply(SchemaAndRecord schemaAndRecord) { return BigQueryAvroUtils.convertGenericRecordToTableRow( schemaAndRecord.getRecord(), schemaAndRecord.getTableSchema()); } }
tableSchema.setFields(fields); Schema avroSchema = BigQueryAvroUtils.toGenericAvroSchema("testSchema", tableSchema.getFields());
TableRow convertedRow = BigQueryAvroUtils.convertGenericRecordToTableRow(record, tableSchema); TableRow row = new TableRow().set("number", "5").set("associates", new ArrayList<TableRow>()); assertEquals(row, convertedRow); record.put("anniversaryDatetime", new String("2000-01-01 00:00:00.000005")); record.put("anniversaryTime", new Utf8("00:00:00.000005")); TableRow convertedRow = BigQueryAvroUtils.convertGenericRecordToTableRow(record, tableSchema); TableRow row = new TableRow() record.put("associates", Lists.newArrayList(nestedRecord)); record.put("birthdayMoney", ByteBuffer.wrap(birthdayMoneyBytes)); TableRow convertedRow = BigQueryAvroUtils.convertGenericRecordToTableRow(record, tableSchema); TableRow row = new TableRow()