/**
 * Creates a source/target over the given dataset view, materializing records
 * as the supplied Avro type.
 *
 * @param view the dataset view to read from
 * @param avroType the Avro type used to materialize records from the view
 */
public DatasetSourceTarget(View<E> view, AvroType<E> avroType) {
  super(view);
  this.view = view;
  this.avroType = avroType;
  // Stage the input-format settings in an empty Configuration so that only
  // the keys written by DatasetKeyInputFormat are captured in the bundle.
  Configuration staging = new Configuration(false /* use an empty conf */);
  DatasetKeyInputFormat.configure(staging).readFrom(view);
  this.formatBundle = inputBundle(staging);
}
/**
 * Stores the configuration and selects the delegate input format: a
 * partition-scoped delegate when the dataset is partitioned and a partition
 * directory is configured, otherwise the plain view delegate.
 */
@Override
public void setConf(Configuration configuration) {
  conf = configuration;
  // Renamed from "view" to avoid shadowing the field of the same name.
  View<E> loaded = load(configuration);
  String partitionDir = conf.get(KITE_PARTITION_DIR);
  boolean partitioned = loaded.getDataset().getDescriptor().isPartitioned();
  delegate = (partitioned && partitionDir != null)
      ? getDelegateInputFormatForPartition(loaded.getDataset(), partitionDir, conf)
      : getDelegateInputFormat(loaded, conf);
}
/**
 * Adds settings to {@code Configuration} to use {@code DatasetKeyInputFormat}
 * and returns a helper to add further configuration.
 *
 * @param conf a {@code Configuration}
 * @return a {@link ConfigBuilder} for chaining further input settings
 *
 * @since 0.15.0
 */
public static ConfigBuilder configure(Configuration conf) {
  setInputFormatClass(conf);
  return new ConfigBuilder(conf);
}
/**
 * Resolves the delegate input format for a single partition of a
 * {@link FileSystemDataset}, identified by its partition directory.
 *
 * @param dataset the dataset to read; must be a {@code FileSystemDataset}
 * @param partitionDir the directory of the partition to read
 * @param conf the job configuration
 * @return an input format scoped to the requested partition
 * @throws UnsupportedOperationException if the dataset is not file-system backed
 * @throws DatasetException if no partition exists for the directory
 */
private InputFormat<E, Void> getDelegateInputFormatForPartition(Dataset<E> dataset,
    String partitionDir, Configuration conf) {
  if (!(dataset instanceof FileSystemDataset)) {
    throw new UnsupportedOperationException("Partitions only supported for " +
        "FileSystemDataset. Dataset: " + dataset);
  }
  FileSystemDataset<E> fsDataset = (FileSystemDataset<E>) dataset;
  LOG.debug("Getting delegate input format for dataset {} with partition directory {}",
      dataset, partitionDir);
  PartitionKey key = fsDataset.keyFromDirectory(new Path(partitionDir));
  LOG.debug("Partition key: {}", key);
  // Guard clause: fail fast when the directory maps to no partition key.
  if (key == null) {
    throw new DatasetException("Cannot find partition " + partitionDir);
  }
  PartitionedDataset<E> partition = fsDataset.getPartition(key, false);
  LOG.debug("Partition: {}", partition);
  return getDelegateInputFormat(partition, conf);
}
/**
 * Looks up the partition named by {@code partitionDir} on a file-system
 * dataset and returns a delegate input format restricted to it.
 *
 * @param dataset the dataset to read; must be a {@code FileSystemDataset}
 * @param partitionDir the directory of the partition to read
 * @param conf the job configuration
 * @return an input format scoped to the requested partition
 * @throws UnsupportedOperationException if the dataset is not file-system backed
 * @throws DatasetException if no partition exists for the directory
 */
private InputFormat<E, Void> getDelegateInputFormatForPartition(Dataset<E> dataset,
    String partitionDir, Configuration conf) {
  if (!(dataset instanceof FileSystemDataset)) {
    throw new UnsupportedOperationException("Partitions only supported for " +
        "FileSystemDataset. Dataset: " + dataset);
  }
  FileSystemDataset<E> fsDataset = (FileSystemDataset<E>) dataset;
  LOG.debug("Getting delegate input format for dataset {} with partition directory {}",
      dataset, partitionDir);
  PartitionKey key = fsDataset.keyFromDirectory(new Path(partitionDir));
  LOG.debug("Partition key: {}", key);
  // Fail fast when the directory does not correspond to any partition.
  if (key == null) {
    throw new DatasetException("Cannot find partition " + partitionDir);
  }
  PartitionedDataset<E> partition = fsDataset.getPartition(key, false);
  LOG.debug("Partition: {}", partition);
  return getDelegateInputFormat(partition, conf);
}
/**
 * Builds a word-count style MapReduce job that reads generic records from the
 * input dataset and writes aggregated stats to the output dataset.
 *
 * @return a configured, unsubmitted {@link Job}
 * @throws Exception if job construction fails
 */
@SuppressWarnings("deprecation")
private Job createJob() throws Exception {
  Job job = new Job();

  // Input: generic Avro records from the input dataset.
  DatasetKeyInputFormat.configure(job).readFrom(inputDataset)
      .withType(GenericData.Record.class);

  job.setMapperClass(LineCountMapper.class);
  job.setMapOutputKeyClass(Text.class);
  job.setMapOutputValueClass(IntWritable.class);
  job.setReducerClass(GenericStatsReducer.class);

  // Output: generic Avro records into the output dataset.
  DatasetKeyOutputFormat.configure(job).writeTo(outputDataset)
      .withType(GenericData.Record.class);

  return job;
}
/**
 * Captures the configuration and chooses the delegate input format. When the
 * dataset is partitioned and {@code KITE_PARTITION_DIR} is set, the delegate
 * reads only that partition; otherwise it reads the whole view.
 */
@Override
public void setConf(Configuration configuration) {
  conf = configuration;
  // Local renamed to avoid shadowing the "view" field.
  View<E> configuredView = load(configuration);
  String partitionDir = conf.get(KITE_PARTITION_DIR);
  if (configuredView.getDataset().getDescriptor().isPartitioned()
      && partitionDir != null) {
    delegate = getDelegateInputFormatForPartition(
        configuredView.getDataset(), partitionDir, conf);
  } else {
    delegate = getDelegateInputFormat(configuredView, conf);
  }
}
/**
 * Adds settings to {@code Configuration} to use {@code DatasetKeyInputFormat}
 * and returns a helper to add further configuration.
 *
 * @param conf a {@code Configuration}
 * @return a {@link ConfigBuilder} to continue configuring the input
 *
 * @since 0.15.0
 */
public static ConfigBuilder configure(Configuration conf) {
  setInputFormatClass(conf);
  return new ConfigBuilder(conf);
}
@Test @SuppressWarnings("deprecation") public void testJobAppend() throws Exception { populateInputDataset(); populateOutputDataset(); // existing output will be overwritten Job job = new Job(); DatasetKeyInputFormat.configure(job).readFrom(inputDataset).withType(GenericData.Record.class); job.setMapperClass(LineCountMapper.class); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(IntWritable.class); job.setReducerClass(GenericStatsReducer.class); DatasetKeyOutputFormat.configure(job).appendTo(outputDataset).withType(GenericData.Record.class); Assert.assertTrue(job.waitForCompletion(true)); checkOutput(true); }
@Test @SuppressWarnings("deprecation") public void testJobOverwrite() throws Exception { populateInputDataset(); populateOutputDataset(); // existing output will be overwritten Job job = new Job(); DatasetKeyInputFormat.configure(job).readFrom(inputDataset).withType(GenericData.Record.class); job.setMapperClass(LineCountMapper.class); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(IntWritable.class); job.setReducerClass(GenericStatsReducer.class); DatasetKeyOutputFormat.configure(job).overwrite(outputDataset).withType(GenericData.Record.class); Assert.assertTrue(job.waitForCompletion(true)); checkOutput(false); }
// Create the output dataset in the "default" namespace with the given descriptor.
Dataset<GenericRecord> outputDataset = repo.create("default", datasetName, descriptor);
// Point the job's input format at the input dataset.
DatasetKeyInputFormat.configure(job).readFrom(inputDataset);
@Test @SuppressWarnings("deprecation") public void testJobOutputDatasetSignaledReady() throws Exception { Assume.assumeTrue(!Hadoop.isHadoop1()); populateInputDataset(); populateOutputDataset(); // existing output will be overwritten Job job = new Job(); DatasetKeyInputFormat.configure(job).readFrom(inputDataset).withType(GenericData.Record.class); job.setMapperClass(LineCountMapper.class); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(IntWritable.class); job.setReducerClass(GenericStatsReducer.class); DatasetKeyOutputFormat.configure(job).overwrite(outputDataset).withType(GenericData.Record.class); Assert.assertTrue(job.waitForCompletion(true)); Assert.assertTrue("Output dataset should be signaled ready", ((Signalable)outputDataset).isReady()); }
@Test @SuppressWarnings("deprecation") public void testSignalReadyOutputView() throws Exception { Assume.assumeTrue(!Hadoop.isHadoop1()); populateInputDataset(); populateOutputDataset(); // existing output will be overwritten Job job = new Job(); DatasetKeyInputFormat.configure(job).readFrom(inputDataset).withType(GenericData.Record.class); job.setMapperClass(LineCountMapper.class); job.setMapOutputKeyClass(Text.class); job.setMapOutputValueClass(IntWritable.class); job.setReducerClass(GenericStatsReducer.class); View<Record> outputView = outputDataset.with("name", "apple", "banana", "carrot"); DatasetKeyOutputFormat.configure(job).appendTo(outputView).withType(GenericData.Record.class); Assert.assertTrue(job.waitForCompletion(true)); Assert.assertFalse("Output dataset should not be signaled ready", ((Signalable)outputDataset).isReady()); Assert.assertTrue("Output view should be signaled ready", ((Signalable)outputView).isReady()); }
// Configure the job to read its input records from the input dataset.
DatasetKeyInputFormat.configure(job).readFrom(inputDataset);
// NOTE(review): this method is truncated in the visible chunk — the rest of
// the body (and closing brace) lies outside this view.
public void run() throws IOException {
  Configuration conf = new Configuration();
  // Input: StandardEvent records from the events dataset URI.
  DatasetKeyInputFormat.configure(conf).readFrom(eventsUri).withType(StandardEvent.class);
  // Output: CorrelatedEvents records written to the correlated-events URI.
  DatasetKeyOutputFormat.configure(conf).writeTo(correlatedEventsUri).withType(CorrelatedEvents.class);