/** Creates a definition with no target class, null values allowed, and the default Avro format. */
public DatasetDefinition() {
    this(null, true, Formats.AVRO.getName());
}

/** Creates a partitioned definition for the given target class, using the default Avro format. */
public DatasetDefinition(Class<?> targetClass, PartitionStrategy partitionStrategy) {
    this(targetClass, Formats.AVRO.getName(), partitionStrategy);
}

/** Creates a definition with no target class and the default Avro format, controlling whether null values are allowed. */
public DatasetDefinition(boolean allowNullValues) {
    this(null, allowNullValues, Formats.AVRO.getName());
}
/**
 * Instantiates a new {@code DataStoreWriter} for writing Parquet records to a
 * {@code org.kitesdk.data.Dataset}.
 *
 * @param entityClass the {@code Class} that the writer will write to the Dataset
 * @param datasetRepositoryFactory the {@code DatasetRepositoryFactory} to be used for the writer
 */
public ParquetDatasetStoreWriter(Class<T> entityClass, DatasetRepositoryFactory datasetRepositoryFactory) {
    this(entityClass, datasetRepositoryFactory,
            new DatasetDefinition(entityClass, false, Formats.PARQUET.getName()));
}
/**
 * Instantiates a new {@code DataStoreWriter} for writing Avro records to a
 * {@code org.kitesdk.data.Dataset}.
 *
 * @param entityClass the {@code Class} that the writer will write to the Dataset
 * @param datasetRepositoryFactory the {@code DatasetRepositoryFactory} to be used for the writer
 */
public AvroPojoDatasetStoreWriter(Class<T> entityClass, DatasetRepositoryFactory datasetRepositoryFactory) {
    this(entityClass, datasetRepositoryFactory,
            new DatasetDefinition(entityClass, false, Formats.AVRO.getName()));
}
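// Usage sketch, not from the source: writes one FileInfo POJO through the Avro
// writer declared above, mirroring the write/flush/close pattern used elsewhere
// in this code. The configured repository factory and the populated FileInfo
// instance are assumed for illustration.
void writeFileInfo(DatasetRepositoryFactory datasetRepositoryFactory, FileInfo fileInfo) {
    AvroPojoDatasetStoreWriter<FileInfo> writer =
            new AvroPojoDatasetStoreWriter<FileInfo>(FileInfo.class, datasetRepositoryFactory);
    try {
        writer.write(fileInfo);
        writer.flush();
    } catch (IOException e) {
        throw new StoreException("Error writing " + FileInfo.class.getName(), e);
    } finally {
        try {
            writer.close();
        } catch (IOException ignore) {
        }
    }
}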
private static void checkCompressionType(Format format, @Nullable CompressionType compressionType) {
    if (compressionType == null) {
        return;
    }
    ValidationException.check(format.getSupportedCompressionTypes().contains(compressionType),
            "Format %s doesn't support compression format %s",
            format.getName(), compressionType.getName());
}
@Bean
public DatasetDefinition fileInfoDatasetDefinition() {
    DatasetDefinition definition = new DatasetDefinition();
    definition.setFormat(Formats.AVRO.getName());
    definition.setTargetClass(FileInfo.class);
    definition.setAllowNullValues(false);
    return definition;
}
@Override
public void initialize() {
    Preconditions.checkState(state.equals(ReaderWriterState.NEW),
            "A reader may not be opened more than once - current state:%s", state);
    final Format format = descriptor.getFormat();
    if (!SUPPORTED_FORMATS.contains(format)) {
        throw new UnknownFormatException("Cannot open format:" + format.getName());
    }
    this.state = ReaderWriterState.OPEN;
}
@Override
public <T> void write(Collection<T> records) {
    if (records == null || records.size() < 1) {
        return;
    }
    // TODO: add support for using Spring Data Commons MappingContext
    @SuppressWarnings("unchecked")
    Class<T> pojoClass = (Class<T>) records.iterator().next().getClass();
    DatasetDefinition datasetDefinition = getDatasetDefinitionToUseFor(pojoClass);
    DataStoreWriter<T> writer;
    if (Formats.PARQUET.getName().equals(datasetDefinition.getFormat().getName())) {
        writer = new ParquetDatasetStoreWriter<T>(pojoClass, dsFactory, datasetDefinition);
    } else {
        writer = new AvroPojoDatasetStoreWriter<T>(pojoClass, dsFactory, datasetDefinition);
    }
    try {
        for (T rec : records) {
            writer.write(rec);
        }
        writer.flush();
    } catch (IOException e) {
        throw new StoreException("Error writing " + pojoClass.getName(), e);
    } finally {
        try {
            writer.close();
        } catch (IOException ignore) {
        }
    }
}
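// Usage sketch, not from the source: a caller hands a homogeneous batch of POJOs
// to the format-dispatching write(Collection) above. "datasetOperations" is an
// assumed name for whatever object exposes that method; the batch contents are
// assumed populated instances.
void writeBatch(List<FileInfo> batch) {
    // Parquet writer is chosen if the dataset definition says 'parquet', Avro otherwise
    datasetOperations.write(batch);
}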
@Override
protected DatasetWriter<T> createWriter() {
    if (Formats.AVRO.getName().equals(getDatasetDefinition().getFormat().getName())) {
        Dataset<T> dataset = DatasetUtils.getOrCreateDataset(getDatasetRepositoryFactory(),
                getDatasetDefinition(), getEntityClass(), getEntityClass());
        return dataset.newWriter();
    } else {
        throw new StoreException("Invalid format " + getDatasetDefinition().getFormat()
                + " specified, you must use 'avro' with " + this.getClass().getSimpleName() + ".");
    }
}
@Override
protected DatasetWriter<GenericRecord> createWriter() {
    if (Formats.PARQUET.getName().equals(getDatasetDefinition().getFormat().getName())) {
        Dataset<GenericRecord> dataset = DatasetUtils.getOrCreateDataset(getDatasetRepositoryFactory(),
                getDatasetDefinition(), getEntityClass(), GenericRecord.class);
        schema = dataset.getDescriptor().getSchema();
        return dataset.newWriter();
    } else {
        throw new StoreException("Invalid format " + getDatasetDefinition().getFormat()
                + " specified, you must use 'parquet' with " + this.getClass().getSimpleName() + ".");
    }
}
private DatasetWriter<GenericRecord> newWriter(final UserGroupInformation login, final URI uri) {
    View<GenericRecord> view = KerberosUtil.runPrivileged(login,
            new PrivilegedExceptionAction<Dataset<GenericRecord>>() {
                @Override
                public Dataset<GenericRecord> run() {
                    return Datasets.load(uri);
                }
            });

    DatasetDescriptor descriptor = view.getDataset().getDescriptor();
    String formatName = descriptor.getFormat().getName();
    Preconditions.checkArgument(allowedFormats().contains(formatName),
            "Unsupported format: " + formatName);

    Schema newSchema = descriptor.getSchema();
    if (targetSchema == null || !newSchema.equals(targetSchema)) {
        this.targetSchema = descriptor.getSchema();
        // target dataset schema has changed, invalidate all readers based on it
        readers.invalidateAll();
    }

    this.reuseDatum = !("parquet".equals(formatName));
    this.datasetName = view.getDataset().getName();

    return view.newWriter();
}
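// Minimal sketch, not from the source: the plain (non-Kerberos) form of the Kite
// load call wrapped by the privileged action above. The dataset URI is an example
// value, not taken from the original code.
Dataset<GenericRecord> events = Datasets.load("dataset:hdfs:/datasets/events");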
properties.setProperty(FORMAT_FIELD_NAME, descriptor.getFormat().getName());
properties.setProperty(COMPRESSION_TYPE_FIELD_NAME, descriptor.getCompressionType().getName());
} else {
    throw new UnknownFormatException("No known serde for format:" + format.getName());
ValidationException.check(Formats.fromString(format).equals(existingFormat),
        "Found %s data, but --format is %s", existingFormat.getName(), format);
descriptorBuilder.format(existingFormat);