/**
 * Creates a source/target over the given dataset view, materializing records
 * as the supplied Avro type.
 *
 * @param view the dataset view to read from
 * @param avroType the Avro type used to materialize records from the view
 */
public DatasetSourceTarget(View<E> view, AvroType<E> avroType) {
  super(view);
  this.view = view;
  this.avroType = avroType;
  // Stage the input-format settings in an empty Configuration so that only
  // the keys written by DatasetKeyInputFormat are captured in the bundle.
  Configuration staging = new Configuration(false /* use an empty conf */);
  DatasetKeyInputFormat.configure(staging).readFrom(view);
  this.formatBundle = inputBundle(staging);
}
/**
 * Stores the configuration and selects the delegate input format: a
 * partition-scoped delegate when the dataset is partitioned and a partition
 * directory is configured, otherwise the plain view delegate.
 */
@Override
public void setConf(Configuration configuration) {
  conf = configuration;
  // Renamed from "view" to avoid shadowing the field of the same name.
  View<E> loaded = load(configuration);
  String partitionDir = conf.get(KITE_PARTITION_DIR);
  boolean partitioned = loaded.getDataset().getDescriptor().isPartitioned();
  delegate = (partitioned && partitionDir != null)
      ? getDelegateInputFormatForPartition(loaded.getDataset(), partitionDir, conf)
      : getDelegateInputFormat(loaded, conf);
}
/**
 * Adds settings to {@code Configuration} to use {@code DatasetKeyInputFormat}
 * and returns a helper to add further configuration.
 *
 * @param conf a {@code Configuration}
 * @return a {@link ConfigBuilder} for chaining further input settings
 *
 * @since 0.15.0
 */
public static ConfigBuilder configure(Configuration conf) {
  setInputFormatClass(conf);
  return new ConfigBuilder(conf);
}
/**
 * Resolves the delegate input format for a single partition of a
 * {@link FileSystemDataset}, identified by its partition directory.
 *
 * @param dataset the dataset to read; must be a {@code FileSystemDataset}
 * @param partitionDir the directory of the partition to read
 * @param conf the job configuration
 * @return an input format scoped to the requested partition
 * @throws UnsupportedOperationException if the dataset is not file-system backed
 * @throws DatasetException if no partition exists for the directory
 */
private InputFormat<E, Void> getDelegateInputFormatForPartition(Dataset<E> dataset,
    String partitionDir, Configuration conf) {
  if (!(dataset instanceof FileSystemDataset)) {
    throw new UnsupportedOperationException("Partitions only supported for " +
        "FileSystemDataset. Dataset: " + dataset);
  }
  FileSystemDataset<E> fsDataset = (FileSystemDataset<E>) dataset;
  LOG.debug("Getting delegate input format for dataset {} with partition directory {}",
      dataset, partitionDir);
  PartitionKey key = fsDataset.keyFromDirectory(new Path(partitionDir));
  LOG.debug("Partition key: {}", key);
  // Guard clause: fail fast when the directory maps to no partition key.
  if (key == null) {
    throw new DatasetException("Cannot find partition " + partitionDir);
  }
  PartitionedDataset<E> partition = fsDataset.getPartition(key, false);
  LOG.debug("Partition: {}", partition);
  return getDelegateInputFormat(partition, conf);
}
/**
 * Looks up the partition named by {@code partitionDir} on a file-system
 * dataset and returns a delegate input format restricted to it.
 *
 * @param dataset the dataset to read; must be a {@code FileSystemDataset}
 * @param partitionDir the directory of the partition to read
 * @param conf the job configuration
 * @return an input format scoped to the requested partition
 * @throws UnsupportedOperationException if the dataset is not file-system backed
 * @throws DatasetException if no partition exists for the directory
 */
private InputFormat<E, Void> getDelegateInputFormatForPartition(Dataset<E> dataset,
    String partitionDir, Configuration conf) {
  if (!(dataset instanceof FileSystemDataset)) {
    throw new UnsupportedOperationException("Partitions only supported for " +
        "FileSystemDataset. Dataset: " + dataset);
  }
  FileSystemDataset<E> fsDataset = (FileSystemDataset<E>) dataset;
  LOG.debug("Getting delegate input format for dataset {} with partition directory {}",
      dataset, partitionDir);
  PartitionKey key = fsDataset.keyFromDirectory(new Path(partitionDir));
  LOG.debug("Partition key: {}", key);
  // Fail fast when the directory does not correspond to any partition.
  if (key == null) {
    throw new DatasetException("Cannot find partition " + partitionDir);
  }
  PartitionedDataset<E> partition = fsDataset.getPartition(key, false);
  LOG.debug("Partition: {}", partition);
  return getDelegateInputFormat(partition, conf);
}
/**
 * Builds a word-count style MapReduce job that reads generic records from the
 * input dataset and writes aggregated stats to the output dataset.
 *
 * @return a configured, unsubmitted {@link Job}
 * @throws Exception if job construction fails
 */
@SuppressWarnings("deprecation")
private Job createJob() throws Exception {
  Job job = new Job();

  // Input: generic Avro records from the input dataset.
  DatasetKeyInputFormat.configure(job).readFrom(inputDataset)
      .withType(GenericData.Record.class);

  job.setMapperClass(LineCountMapper.class);
  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(IntWritable.class);
  job.setReducerClass(GenericStatsReducer.class);

  // Output: generic Avro records into the output dataset.
  DatasetKeyOutputFormat.configure(job).writeTo(outputDataset)
      .withType(GenericData.Record.class);

  return job;
}
/**
 * Captures the configuration and chooses the delegate input format. When the
 * dataset is partitioned and {@code KITE_PARTITION_DIR} is set, the delegate
 * reads only that partition; otherwise it reads the whole view.
 */
@Override
public void setConf(Configuration configuration) {
  conf = configuration;
  // Local renamed to avoid shadowing the "view" field.
  View<E> configuredView = load(configuration);
  String partitionDir = conf.get(KITE_PARTITION_DIR);
  if (configuredView.getDataset().getDescriptor().isPartitioned()
      && partitionDir != null) {
    delegate = getDelegateInputFormatForPartition(
        configuredView.getDataset(), partitionDir, conf);
  } else {
    delegate = getDelegateInputFormat(configuredView, conf);
  }
}
/**
 * Adds settings to {@code Configuration} to use {@code DatasetKeyInputFormat}
 * and returns a helper to add further configuration.
 *
 * @param conf a {@code Configuration}
 * @return a {@link ConfigBuilder} to continue configuring the input
 *
 * @since 0.15.0
 */
public static ConfigBuilder configure(Configuration conf) {
  setInputFormatClass(conf);
  return new ConfigBuilder(conf);
}
@Test @SuppressWarnings("deprecation") public void testJobAppend() throws Exception { populateInputDataset(); populateOutputDataset(); // existing output will be overwritten Job job = new Job(); DatasetKeyInputFormat.configure(job).readFrom(inputDataset).withType(GenericData.Record.class); job.setMapperClass(LineCountMapper.class); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(IntWritable.class); job.setReducerClass(GenericStatsReducer.class); DatasetKeyOutputFormat.configure(job).appendTo(outputDataset).withType(GenericData.Record.class); Assert.assertTrue(job.waitForCompletion(true)); checkOutput(true); }
@Test @SuppressWarnings("deprecation") public void testJobOverwrite() throws Exception { populateInputDataset(); populateOutputDataset(); // existing output will be overwritten Job job = new Job(); DatasetKeyInputFormat.configure(job).readFrom(inputDataset).withType(GenericData.Record.class); job.setMapperClass(LineCountMapper.class); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(IntWritable.class); job.setReducerClass(GenericStatsReducer.class); DatasetKeyOutputFormat.configure(job).overwrite(outputDataset).withType(GenericData.Record.class); Assert.assertTrue(job.waitForCompletion(true)); checkOutput(false); }
// Create the output dataset in the "default" namespace with the given descriptor.
Dataset<GenericRecord> outputDataset = repo.create("default", datasetName, descriptor);
// Point the job's input format at the input dataset.
DatasetKeyInputFormat.configure(job).readFrom(inputDataset);
@Test @SuppressWarnings("deprecation") public void testJobOutputDatasetSignaledReady() throws Exception { Assume.assumeTrue(!Hadoop.isHadoop1()); populateInputDataset(); populateOutputDataset(); // existing output will be overwritten Job job = new Job(); DatasetKeyInputFormat.configure(job).readFrom(inputDataset).withType(GenericData.Record.class); job.setMapperClass(LineCountMapper.class); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(IntWritable.class); job.setReducerClass(GenericStatsReducer.class); DatasetKeyOutputFormat.configure(job).overwrite(outputDataset).withType(GenericData.Record.class); Assert.assertTrue(job.waitForCompletion(true)); Assert.assertTrue("Output dataset should be signaled ready", ((Signalable)outputDataset).isReady()); }
@Test @SuppressWarnings("deprecation") public void testSignalReadyOutputView() throws Exception { Assume.assumeTrue(!Hadoop.isHadoop1()); populateInputDataset(); populateOutputDataset(); // existing output will be overwritten Job job = new Job(); DatasetKeyInputFormat.configure(job).readFrom(inputDataset).withType(GenericData.Record.class); job.setMapperClass(LineCountMapper.class); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(IntWritable.class); job.setReducerClass(GenericStatsReducer.class); View<Record> outputView = outputDataset.with("name", "apple", "banana", "carrot"); DatasetKeyOutputFormat.configure(job).appendTo(outputView).withType(GenericData.Record.class); Assert.assertTrue(job.waitForCompletion(true)); Assert.assertFalse("Output dataset should not be signaled ready", ((Signalable)outputDataset).isReady()); Assert.assertTrue("Output view should be signaled ready", ((Signalable)outputView).isReady()); }
// Configure the job to read its input records from the input dataset.
DatasetKeyInputFormat.configure(job).readFrom(inputDataset);
// NOTE(review): this method is truncated in the visible chunk — the rest of
// the body (and closing brace) lies outside this view.
public void run() throws IOException {
  Configuration conf = new Configuration();
  // Input: StandardEvent records from the events dataset URI.
  DatasetKeyInputFormat.configure(conf).readFrom(eventsUri).withType(StandardEvent.class);
  // Output: CorrelatedEvents records written to the correlated-events URI.
  DatasetKeyOutputFormat.configure(conf).writeTo(correlatedEventsUri).withType(CorrelatedEvents.class);