public FileSystemViewKeyInputFormat(FileSystemDataset<E> dataset,
                                    Configuration conf) {
  this.dataset = dataset;
  this.view = null;
  LOG.debug("Dataset: {}", dataset);
  Format format = dataset.getDescriptor().getFormat();
  setConfigProperties(conf, format, dataset.getSchema(), dataset.getType());
}
public FileSystemViewKeyInputFormat(FileSystemView<E> view,
                                    Configuration conf) {
  this.dataset = (FileSystemDataset<E>) view.getDataset();
  this.view = view;
  LOG.debug("View: {}", view);
  Format format = dataset.getDescriptor().getFormat();
  setConfigProperties(conf, format, view.getSchema(), view.getType());
}
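Only one of the two constructors runs for a given job: the dataset form leaves `view` null, while the view form keeps the view so readers can filter entities later. A minimal sketch of the choice a caller might make, assuming `view` is a `View<Record>` loaded elsewhere and `conf` is the job's `Configuration` (this wiring is an assumption, not the class's actual caller):

  // Hedged sketch: select the constructor based on whether the view
  // covers the whole dataset or restricts it.
  FileSystemViewKeyInputFormat<Record> format;
  if (view instanceof FileSystemDataset) {
    // whole dataset: no per-entity filtering needed
    format = new FileSystemViewKeyInputFormat<Record>(
        (FileSystemDataset<Record>) view, conf);
  } else {
    // restricted view: keep it for later filtering
    format = new FileSystemViewKeyInputFormat<Record>(
        (FileSystemView<Record>) view, conf);
  }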
/**
 * Adds configuration for {@code DatasetKeyOutputFormat} to write to the
 * given {@link Dataset} or {@link View} instance.
 *
 * @param view a dataset or view
 * @return this for method chaining
 */
public ConfigBuilder writeTo(View<?> view) {
  if (view instanceof FileSystemDataset) {
    FileSystemDataset dataset = (FileSystemDataset) view;
    conf.set(KITE_PARTITION_DIR,
        String.valueOf(dataset.getDescriptor().getLocation()));
  }
  withType(view.getType());
  return writeTo(view.getUri());
}
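Callers normally reach this builder through the output format's static `configure` entry point; a short usage sketch in the usual Kite style (the `users` view is assumed to be loaded already):

  Job job = Job.getInstance(new Configuration());
  job.setOutputFormatClass(DatasetKeyOutputFormat.class);

  // Route the job's output records into the dataset or view:
  DatasetKeyOutputFormat.configure(job).writeTo(users);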
@Override
public void merge(FileSystemDataset<E> update) {
  DatasetDescriptor updateDescriptor = update.getDescriptor();
@Override
@SuppressWarnings({"unchecked", "deprecation"})
public List<InputSplit> getSplits(JobContext jobContext) throws IOException {
  Configuration conf = Hadoop.JobContext.getConfiguration.invoke(jobContext);
  Job job = new Job(conf);
  Format format = dataset.getDescriptor().getFormat();

  if (setInputPaths(jobContext, job)) {
    if (Formats.AVRO.equals(format)) {
      AvroJob.setInputKeySchema(job, dataset.getDescriptor().getSchema());
      AvroCombineInputFormat<E> delegate = new AvroCombineInputFormat<E>();
      return delegate.getSplits(jobContext);
    } else if (Formats.PARQUET.equals(format)) {
      AvroParquetCombineInputFormat delegate = new AvroParquetCombineInputFormat();
      return delegate.getSplits(jobContext);
    } else if (Formats.JSON.equals(format)) {
      return new JSONInputFormat().getSplits(jobContext);
    } else if (Formats.CSV.equals(format)) {
      // raw CSVInputFormat produces an unchecked warning, suppressed above
      return new CSVInputFormat().getSplits(jobContext);
    } else if (Formats.INPUTFORMAT.equals(format)) {
      return InputFormatUtil.newInputFormatInstance(dataset.getDescriptor())
          .getSplits(jobContext);
    } else {
      throw new UnsupportedOperationException(
          "Not a supported format: " + format);
    }
  } else {
    return ImmutableList.of();
  }
}
@SuppressWarnings("unchecked") private RecordReader<E, Void> createUnfilteredRecordReader(InputSplit inputSplit, TaskAttemptContext taskAttemptContext) throws IOException, InterruptedException { Format format = dataset.getDescriptor().getFormat(); if (Formats.AVRO.equals(format)) { return new AvroKeyReaderWrapper(new AvroCombineInputFormat<E>()); } else if (Formats.PARQUET.equals(format)) { return new ValueReaderWrapper(new AvroParquetCombineInputFormat()); } else if (Formats.JSON.equals(format)) { JSONInputFormat<E> delegate = new JSONInputFormat<E>(); delegate.setView(view != null ? view : dataset); return delegate.createRecordReader(inputSplit, taskAttemptContext); } else if (Formats.CSV.equals(format)) { CSVInputFormat<E> delegate = new CSVInputFormat<E>(); delegate.setView(view != null ? view : dataset); return delegate.createRecordReader(inputSplit, taskAttemptContext); } else if (Formats.INPUTFORMAT.equals(format)) { return InputFormatUtil.newRecordReader(dataset.getDescriptor()); } else { throw new UnsupportedOperationException( "Not a supported format: " + format); } }
@Override
@edu.umd.cs.findbugs.annotations.SuppressWarnings(
    value = "BC_UNCONFIRMED_CAST_OF_RETURN_VALUE",
    justification = "Writer is known to be IncrementalWriter")
public FileSystemWriter.IncrementalWriter<E> load(StorageKey key)
    throws Exception {
  Preconditions.checkState(view.getDataset() instanceof FileSystemDataset,
      "FileSystemWriters cannot create writer for " + view.getDataset());
  FileSystemDataset dataset = (FileSystemDataset) view.getDataset();
  Path partition = convert.fromKey(key);
  FileSystemWriter<E> writer = FileSystemWriter.newWriter(
      dataset.getFileSystem(),
      new Path(dataset.getDirectory(), partition),
      conf.getRollIntervalMillis(), conf.getTargetFileSize(),
      dataset.getDescriptor(), view.getAccessor().getWriteSchema());

  PartitionListener listener = dataset.getPartitionListener();
  if (listener != null) {
    listener.partitionAdded(
        dataset.getNamespace(), dataset.getName(), partition.toString());
  }

  // initialize the writer after calling the listener
  // this lets the listener decide if and how to create the
  // partition directory
  writer.initialize();

  return (FileSystemWriter.IncrementalWriter<E>) writer;
}
@Override
public FileSystemWriter<E> load(StorageKey key) throws Exception {
  Preconditions.checkState(view.getDataset() instanceof FileSystemDataset,
      "FileSystemWriters cannot create writer for " + view.getDataset());
  FileSystemDataset dataset = (FileSystemDataset) view.getDataset();
  Path partition = convert.fromKey(key);
  FileSystemWriter<E> writer = FileSystemWriter.newWriter(
      dataset.getFileSystem(),
      new Path(dataset.getDirectory(), partition),
      conf.getRollIntervalMillis(), conf.getTargetFileSize(),
      dataset.getDescriptor(), view.getAccessor().getWriteSchema());

  PartitionListener listener = dataset.getPartitionListener();
  if (listener != null) {
    listener.partitionAdded(
        dataset.getNamespace(), dataset.getName(), partition.toString());
  }

  // initialize the writer after calling the listener
  // this lets the listener decide if and how to create the
  // partition directory
  writer.initialize();

  return writer;
}
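Both `load` overrides have the shape of a Guava `CacheLoader`, which suggests the partitioned writer keeps one open writer per `StorageKey` and creates them on demand. A minimal sketch of that wiring, assuming the `load(...)` above lives in a `CacheLoader` subclass (the `WriterCacheLoader` name and the size limit are assumptions):

  import com.google.common.cache.CacheBuilder;
  import com.google.common.cache.LoadingCache;

  // Hypothetical wiring: one writer per partition key, created on demand.
  LoadingCache<StorageKey, FileSystemWriter<E>> writers =
      CacheBuilder.newBuilder()
          .maximumSize(10) // assumed cap on concurrently open writers
          .build(new WriterCacheLoader<E>(view, conf)); // the load(...) above

  // Writing an entity then becomes a cache lookup plus a write:
  writers.getUnchecked(key).write(entity);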
@Test
public void testUnpartitionedReplace() {
  // recreate temporary without a partition strategy
  Datasets.delete("dataset:file:/tmp/datasets/temporary");
  DatasetDescriptor descriptor = new DatasetDescriptor
      .Builder(unpartitioned.getDescriptor())
      .location((URI) null) // clear the location
      .build();
  temporary = Datasets.create("dataset:file:/tmp/datasets/temporary",
      descriptor, TestRecord.class);

  Assert.assertTrue("Should allow replacing an unpartitioned dataset",
      unpartitioned.canReplace(unpartitioned));

  // make sure there are multiple files
  writeTestRecords(unpartitioned);
  writeTestRecords(unpartitioned);
  writeTestRecords(temporary);
  writeTestRecords(temporary);

  Set<String> originalFiles = Sets.newHashSet(
      Iterators.transform(unpartitioned.pathIterator(), new GetFilename()));
  Set<String> replacementFiles = Sets.newHashSet(
      Iterators.transform(temporary.pathIterator(), new GetFilename()));
  Assert.assertFalse("Sanity check", originalFiles.equals(replacementFiles));

  unpartitioned.replace(unpartitioned, temporary);

  Set<String> replacedFiles = Sets.newHashSet(
      Iterators.transform(unpartitioned.pathIterator(), new GetFilename()));
  Assert.assertEquals("Should contain the replacement files",
      replacementFiles, replacedFiles);
}
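The `canReplace`/`replace` pair exercised above comes from a replacement contract that `FileSystemDataset` implements. Its shape, inferred only from these calls (treat the interface name, generics, and javadoc as assumptions, not confirmed API):

  // Hedged sketch, inferred from usage in the test above.
  public interface Replaceable<V> {
    /** Returns true if {@code part} of this dataset may be swapped out. */
    boolean canReplace(V part);
    /** Substitutes {@code replacement}'s data files for {@code part}'s. */
    void replace(V part, V replacement);
  }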
    .build();

Assert.assertTrue("Dataset is partitioned", ds.getDescriptor()
    .isPartitioned());
Assert.assertEquals(partitionStrategy, ds.getDescriptor()
    .getPartitionStrategy());
@Test
public void testWriteAndRead() throws IOException {
  FileSystemDataset<Record> ds = new FileSystemDataset.Builder<Record>()
      .namespace("ns")
      .name("test")
      .configuration(getConfiguration())
      .descriptor(new DatasetDescriptor.Builder()
          .schemaUri(USER_SCHEMA_URL)
          .format(format)
          .compressionType(compressionType)
          .location(testDirectory)
          .build())
      .type(Record.class)
      .build();

  Assert.assertFalse("Dataset is not partitioned", ds.getDescriptor()
      .isPartitioned());

  writeTestUsers(ds, 10);
  checkTestUsers(ds, 10);
}
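`writeTestUsers` and `checkTestUsers` are test helpers not shown in this excerpt. A plausible sketch of the writer half, assuming the user schema has `username` and `email` fields (both field names are assumptions) and using Avro's `GenericRecordBuilder`:

  // Hypothetical helper: write `count` generic user records to the dataset.
  private void writeTestUsers(Dataset<Record> ds, int count) throws IOException {
    DatasetWriter<Record> writer = null;
    try {
      writer = ds.newWriter();
      for (int i = 0; i < count; i++) {
        Record user = new GenericRecordBuilder(ds.getDescriptor().getSchema())
            .set("username", "test-" + i)   // assumed field
            .set("email", "email-" + i)     // assumed field
            .build();
        writer.write(user);
      }
    } finally {
      if (writer != null) {
        writer.close();
      }
    }
  }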