@Override
public void write(InternalRow record) {
  appender.add(record);
}
@Override
public void add(ManifestFile file) {
  writer.add(file);
}
default void addAll(Iterator<D> values) {
  while (values.hasNext()) {
    add(values.next());
  }
}
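// A minimal usage sketch for the addAll default above; the appender and schema
// variables are hypothetical, assuming D = Record as in the tests below:
List<Record> records = RandomGenericData.generate(schema, 10, 0L);
appender.addAll(records.iterator()); // equivalent to calling add() once per record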
@Override
public void write(InternalRow row) throws IOException {
  key.partition(row);

  if (!key.equals(currentKey)) {
    closeCurrent();

    if (completedPartitions.contains(key)) {
      // if rows are not correctly grouped, detect and fail the write
      PartitionKey existingKey = Iterables.find(completedPartitions, key::equals, null);
      LOG.warn("Duplicate key: {} == {}", existingKey, key);
      throw new IllegalStateException("Already closed file for partition: " + key.toPath());
    }

    this.currentKey = key.copy();
    this.currentPath = outputPathFunc.apply(currentKey);
    OutputFile file = HadoopOutputFile.fromPath(currentPath, conf);
    this.currentAppender = factory.newAppender(file, format);
  }

  currentAppender.add(row);
}
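// This writer closes a file whenever the partition key changes and fails if a
// key reappears, so rows must arrive clustered by partition key. A hedged
// caller-side sketch, assuming a Spark Dataset<Row> named df and hypothetical
// partition columns "date" and "hour":
Dataset<Row> clustered = df.sortWithinPartitions("date", "hour");
// each task now sees its rows grouped by key, so write() opens each file exactly once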
public void add(ManifestEntry entry) {
  switch (entry.status()) {
    case ADDED:
      addedFiles += 1;
      break;
    case EXISTING:
      existingFiles += 1;
      break;
    case DELETED:
      deletedFiles += 1;
      break;
  }

  stats.update(entry.file().partition());

  writer.add(entry);
}
protected GenericData.Record writeAndRead(String desc, Schema writeSchema, Schema readSchema,
                                          GenericData.Record record) throws IOException {
  File file = temp.newFile(desc + ".avro");
  file.delete();

  try (FileAppender<GenericData.Record> appender = Avro.write(Files.localOutput(file))
      .schema(writeSchema)
      .build()) {
    appender.add(record);
  }

  Iterable<GenericData.Record> records = Avro.read(Files.localInput(file))
      .project(readSchema)
      .build();

  return Iterables.getOnlyElement(records);
}
protected void writeAndValidate(Schema schema) throws IOException {
  List<Record> expected = RandomAvroData.generate(schema, 100, 0L);

  File testFile = temp.newFile();
  Assert.assertTrue("Delete should succeed", testFile.delete());

  try (FileAppender<Record> writer = Avro.write(Files.localOutput(testFile))
      .schema(schema)
      .named("test")
      .build()) {
    for (Record rec : expected) {
      writer.add(rec);
    }
  }

  List<Record> rows;
  try (AvroIterable<Record> reader = Avro.read(Files.localInput(testFile))
      .project(schema)
      .build()) {
    rows = Lists.newArrayList(reader);
  }

  for (int i = 0; i < expected.size(); i += 1) {
    AvroTestHelpers.assertEquals(schema.asStruct(), expected.get(i), rows.get(i));
  }
}
protected Record writeAndRead(String desc, Schema writeSchema, Schema readSchema, Record record)
    throws IOException {
  File file = temp.newFile(desc + ".avro");
  file.delete();

  try (FileAppender<Record> appender = Avro.write(Files.localOutput(file))
      .schema(writeSchema)
      .createWriterFunc(DataWriter::create)
      .build()) {
    appender.add(record);
  }

  Iterable<Record> records = Avro.read(Files.localInput(file))
      .project(readSchema)
      .createReaderFunc(DataReader::create)
      .build();

  return Iterables.getOnlyElement(records);
}
protected GenericData.Record writeAndRead(String desc, Schema writeSchema, Schema readSchema,
                                          GenericData.Record record) throws IOException {
  File file = temp.newFile(desc + ".parquet");
  file.delete();

  try (FileAppender<GenericData.Record> appender = Parquet.write(Files.localOutput(file))
      .schema(writeSchema)
      .build()) {
    appender.add(record);
  }

  Iterable<GenericData.Record> records = Parquet.read(Files.localInput(file))
      .project(readSchema)
      .callInit()
      .build();

  return Iterables.getOnlyElement(records);
}
protected void writeAndValidate(Schema schema) throws IOException {
  List<Record> expected = RandomData.generateList(schema, 100, 0L);

  File testFile = temp.newFile();
  Assert.assertTrue("Delete should succeed", testFile.delete());

  try (FileAppender<Record> writer = Avro.write(Files.localOutput(testFile))
      .schema(schema)
      .named("test")
      .build()) {
    for (Record rec : expected) {
      writer.add(rec);
    }
  }

  List<InternalRow> rows;
  try (AvroIterable<InternalRow> reader = Avro.read(Files.localInput(testFile))
      .createReaderFunc(SparkAvroReader::new)
      .project(schema)
      .build()) {
    rows = Lists.newArrayList(reader);
  }

  for (int i = 0; i < expected.size(); i += 1) {
    assertEqualsUnsafe(schema.asStruct(), expected.get(i), rows.get(i));
  }
}
protected void writeAndValidate(Schema schema) throws IOException {
  List<Record> expected = RandomGenericData.generate(schema, 100, 0L);

  File testFile = temp.newFile();
  Assert.assertTrue("Delete should succeed", testFile.delete());

  try (FileAppender<Record> writer = Avro.write(Files.localOutput(testFile))
      .schema(schema)
      .createWriterFunc(DataWriter::create)
      .named("test")
      .build()) {
    for (Record rec : expected) {
      writer.add(rec);
    }
  }

  List<Record> rows;
  try (AvroIterable<Record> reader = Avro.read(Files.localInput(testFile))
      .project(schema)
      .createReaderFunc(DataReader::create)
      .build()) {
    rows = Lists.newArrayList(reader);
  }

  for (int i = 0; i < expected.size(); i += 1) {
    DataTestHelpers.assertEquals(schema.asStruct(), expected.get(i), rows.get(i));
  }
}
protected Record writeAndRead(String desc, Schema writeSchema, Schema readSchema, Record record)
    throws IOException {
  File file = temp.newFile(desc + ".parquet");
  file.delete();

  try (FileAppender<Record> appender = Parquet.write(Files.localOutput(file))
      .schema(writeSchema)
      .createWriterFunc(GenericParquetWriter::buildWriter)
      .build()) {
    appender.add(record);
  }

  Iterable<Record> records = Parquet.read(Files.localInput(file))
      .project(readSchema)
      .createReaderFunc(fileSchema -> GenericParquetReaders.buildReader(readSchema, fileSchema))
      .build();

  return Iterables.getOnlyElement(records);
}
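// A hedged example of calling the writeAndRead helpers above in a projection
// test; the schemas, field ids, and values here are hypothetical:
Schema writeSchema = new Schema(
    Types.NestedField.required(1, "id", Types.LongType.get()),
    Types.NestedField.optional(2, "data", Types.StringType.get()));
Schema readSchema = new Schema(
    Types.NestedField.required(1, "id", Types.LongType.get()));

Record record = GenericRecord.create(writeSchema);
record.setField("id", 34L);
record.setField("data", "test");

// round-trip through a file, reading back only the projected id column
Record projected = writeAndRead("basic_projection", writeSchema, readSchema, record);
Assert.assertEquals("Should read back projected id", 34L, projected.getField("id"));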
    .build()) {
  for (Record rec : records) {
    writer.add(rec);
  }
}
builder.set("_no_nulls", ""); // optional, but always non-null builder.set("_non_dict", UUID.randomUUID().toString()); // not dictionary-encoded appender.add(builder.build());
@BeforeClass
public static void createInputFile() throws IOException {
  if (PARQUET_FILE.exists()) {
    Assert.assertTrue(PARQUET_FILE.delete());
  }

  OutputFile outFile = Files.localOutput(PARQUET_FILE);
  try (FileAppender<Record> appender = Parquet.write(outFile)
      .schema(FILE_SCHEMA)
      .build()) {
    GenericRecordBuilder builder = new GenericRecordBuilder(convert(FILE_SCHEMA, "table"));
    // create 50 records
    for (int i = 0; i < 50; i += 1) {
      builder.set("_id", 30 + i); // min=30, max=79, num-nulls=0
      builder.set("_no_stats", TOO_LONG_FOR_STATS); // value longer than 4k will produce no stats
      builder.set("_required", "req"); // required, always non-null
      builder.set("_all_nulls", null); // never non-null
      builder.set("_some_nulls", (i % 10 == 0) ? null : "some"); // includes some null values
      builder.set("_no_nulls", ""); // optional, but always non-null
      appender.add(builder.build());
    }
  }

  InputFile inFile = Files.localInput(PARQUET_FILE);
  try (ParquetFileReader reader = ParquetFileReader.open(ParquetIO.file(inFile))) {
    Assert.assertEquals("Should create only one row group", 1, reader.getRowGroups().size());
    ROW_GROUP_METADATA = reader.getRowGroups().get(0);
    PARQUET_SCHEMA = reader.getFileMetaData().getSchema();
  }

  PARQUET_FILE.deleteOnExit();
}
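// A hedged sketch of inspecting the captured row-group metadata with the
// standard parquet-mr API (ColumnChunkMetaData from
// org.apache.parquet.hadoop.metadata, Statistics from
// org.apache.parquet.column.statistics):
for (ColumnChunkMetaData column : ROW_GROUP_METADATA.getColumns()) {
  Statistics<?> stats = column.getStatistics();
  if (stats != null && !stats.isEmpty()) {
    System.out.println(column.getPath() + ": null count = " + stats.getNumNulls());
  }
}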