// Reconstructed from a truncated excerpt. The method signature, the guard on
// the first branch, the body of the resource branch, and the fall-through are
// assumptions; the original shows only the two schema-resolution paths.
private static Schema schemaFor(URI uri) throws IOException {
  if ("dataset".equals(uri.getScheme()) || "view".equals(uri.getScheme())) {
    return Datasets.load(uri).getDataset().getDescriptor().getSchema();
  } else if ("resource".equals(uri.getScheme())) {
    try (InputStream in = Resources.getResource(uri.getSchemeSpecificPart()).openStream()) {
      // assumption: the classpath resource holds an Avro schema definition
      return new Schema.Parser().parse(in);
    }
  }
  throw new IllegalArgumentException("Unknown schema URI scheme: " + uri);
}
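// Usage sketch for the reconstructed helper above. The method name schemaFor
// and both URIs are illustrative assumptions, not part of the source.
Schema fromDataset = schemaFor(URI.create("dataset:hdfs:/data/examples/events"));
Schema fromClasspath = schemaFor(URI.create("resource:schemas/event.avsc"));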
@Override
public void sync() throws EventDeliveryException {
  if (nEventsHandled > 0) {
    if (Formats.PARQUET.equals(
        dataset.getDataset().getDescriptor().getFormat())) {
      // We need to close the writer on sync if we're writing to a Parquet
      // dataset
      close();
    } else if (writer instanceof Syncable) {
      ((Syncable) writer).sync();
    }
  }
}
DatasetDescriptor descriptor = view.getDataset().getDescriptor();
Format format = descriptor.getFormat();
// The excerpt is truncated after the condition; the error message below is a
// reconstructed placeholder, not the original text.
Preconditions.checkArgument(allowedFormats().contains(format.getName()),
    "Unsupported format: %s", format.getName());
final Schema schema = target.getDataset().getDescriptor().getSchema();
public CSVRecordParser(CSVProperties props, View<E> view,
                       @Nullable List<String> header) {
  this(props, view.getDataset().getDescriptor().getSchema(), view.getType(),
      header);
}
protected void checkSchemaForRead() {
  IncompatibleSchemaException.check(canRead,
      "Cannot read data with this view's schema:\n" +
      "Current schema: %s\nDataset schema: %s",
      dataset.getDescriptor().getSchema(), getSchema());
}
public DatasetWriterCacheLoader(FileSystemView<E> view, ConfAccessor conf) {
  this.view = view;
  this.convert = new PathConversion(
      view.getDataset().getDescriptor().getSchema());
  this.conf = conf;
}
protected void checkSchemaForWrite() {
  IncompatibleSchemaException.check(canWrite,
      "Cannot write data with this view's schema, " +
      "it cannot be read with the dataset's schema:\n" +
      "Current schema: %s\nDataset schema: %s",
      getSchema(), dataset.getDescriptor().getSchema());
}
public IncrementalDatasetWriterCacheLoader(FileSystemView<E> view,
                                           ConfAccessor conf) {
  this.view = view;
  this.convert = new PathConversion(
      view.getDataset().getDescriptor().getSchema());
  this.conf = conf;
}
@SuppressWarnings("unchecked") private static <T> AvroType<T> ptype(View<T> view) { Class<T> recordClass = view.getType(); if (GenericRecord.class.isAssignableFrom(recordClass)) { return (AvroType<T>) Avros.generics( view.getDataset().getDescriptor().getSchema()); } else { return Avros.records(recordClass); } }
PathIterator pathIterator() {
  if (dataset.getDescriptor().isPartitioned()) {
    return new PathIterator(fs, root, partitionIterator());
  } else {
    return new PathIterator(fs, root, null);
  }
}
@Override
public void setConf(Configuration configuration) {
  conf = configuration;
  View<E> view = load(configuration);
  String partitionDir = conf.get(KITE_PARTITION_DIR);
  if (view.getDataset().getDescriptor().isPartitioned() && partitionDir != null) {
    delegate = getDelegateInputFormatForPartition(view.getDataset(),
        partitionDir, conf);
  } else {
    delegate = getDelegateInputFormat(view, conf);
  }
}
public DatasetRecordWriter(View<E> view, boolean copyRecords) {
  this.datasetWriter = view.newWriter();
  this.schema = view.getDataset().getDescriptor().getSchema();
  this.dataModel = DataModelUtil.getDataModelForType(view.getType());
  this.copyRecords = copyRecords;
}
@Override
public DatasetReader<E> newReader() {
  checkSchemaForRead();
  AbstractDatasetReader<E> reader = new MultiFileDatasetReader<E>(
      fs, pathIterator(), dataset.getDescriptor(), constraints, getAccessor());
  reader.initialize();
  return reader;
}
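// A sketch of how a reader produced by newReader() is typically consumed
// ('view' and the processing step are illustrative):
DatasetReader<GenericRecord> reader = null;
try {
  reader = view.newReader();
  while (reader.hasNext()) {
    GenericRecord record = reader.next();
    // handle the record here
  }
} finally {
  if (reader != null) {
    reader.close();
  }
}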
@Override
protected DatasetWriter<GenericRecord> createWriter() {
  if (Formats.PARQUET.getName().equals(getDatasetDefinition().getFormat().getName())) {
    Dataset<GenericRecord> dataset = DatasetUtils.getOrCreateDataset(
        getDatasetRepositoryFactory(), getDatasetDefinition(),
        getEntityClass(), GenericRecord.class);
    schema = dataset.getDescriptor().getSchema();
    return dataset.newWriter();
  } else {
    throw new StoreException("Invalid format " + getDatasetDefinition().getFormat()
        + " specified, you must use 'parquet' with "
        + this.getClass().getSimpleName() + ".");
  }
}
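// The matching write-side sketch for the writer created above ('record' is
// an illustrative GenericRecord; error handling is elided):
DatasetWriter<GenericRecord> writer = null;
try {
  writer = dataset.newWriter();
  writer.write(record);
} finally {
  if (writer != null) {
    writer.close();
  }
}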
@Override
public <T> View<T> asType(Class<T> type) {
  if (DataModelUtil.isGeneric(type)) {
    // if the type is generic, don't reset the schema
    return project(getSchema(), type);
  }
  // otherwise, the type determines the schema
  return project(getDataset().getDescriptor().getSchema(), type);
}
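// A sketch of the projection above: a generic target type keeps the view's
// current schema, while a specific type brings its own generated schema.
// ExampleRecord is a hypothetical Avro specific record class.
View<GenericRecord> genericView = view.asType(GenericRecord.class);
View<ExampleRecord> specificView = view.asType(ExampleRecord.class);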
@Override
protected DatasetReader<GenericRecord> createReader() {
  Dataset<GenericRecord> dataset = DatasetUtils.getOrCreateDataset(
      getDatasetRepositoryFactory(), getDatasetDefinition(),
      getEntityClass(), GenericRecord.class);
  schema = dataset.getDescriptor().getSchema();
  return dataset.newReader();
}
private static <E> Dataset<E> loadOrCreateTaskAttemptDataset(
    TaskAttemptContext taskContext) {
  String taskAttemptDatasetName = getTaskAttemptDatasetName(taskContext);
  DatasetRepository repo = getDatasetRepository(taskContext);
  Dataset<E> jobDataset = loadJobDataset(taskContext);
  if (repo.exists(TEMP_NAMESPACE, taskAttemptDatasetName)) {
    return repo.load(TEMP_NAMESPACE, taskAttemptDatasetName);
  } else {
    return repo.create(TEMP_NAMESPACE, taskAttemptDatasetName,
        copy(jobDataset.getDescriptor()));
  }
}
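// One plausible shape for the copy(...) helper used above (an assumption;
// the real helper is not shown and may set additional properties). Clearing
// the location lets the task-attempt dataset get its own directory instead
// of reusing the job dataset's path.
private static DatasetDescriptor copy(DatasetDescriptor descriptor) {
  return new DatasetDescriptor.Builder(descriptor)
      .location((URI) null)
      .build();
}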
@Test
public void testDeleteRemovesDatasetPath() throws IOException {
  ensureCreated();
  Dataset<Record> dataset = repo.load(NAMESPACE, NAME);
  Path dataPath = new Path(dataset.getDescriptor().getLocation());
  Assert.assertTrue(fileSystem.exists(dataPath));
  repo.delete(NAMESPACE, NAME);
  Assert.assertFalse(fileSystem.exists(dataPath));
}
@Test
public void testLoad() {
  ensureCreated();
  Dataset dataset = repo.load(NAMESPACE, NAME);
  Assert.assertNotNull("Dataset is loaded and produced", dataset);
  Assert.assertEquals("Dataset name is propagated", NAME, dataset.getName());
  Assert.assertEquals("Dataset schema is loaded", testSchema,
      dataset.getDescriptor().getSchema());
}