/**
 * Returns the repository that owns this job's output dataset.
 * <p>
 * When the repository supports temporary datasets, the returned instance is
 * a temporary repository scoped to this job's dataset name so task output
 * can be staged there and merged on commit.
 */
private static DatasetRepository getDatasetRepository(JobContext jobContext) {
  Configuration conf = Hadoop.JobContext.getConfiguration.invoke(jobContext);
  DatasetRepository repo =
      DatasetRepositories.repositoryFor(conf.get(KITE_OUTPUT_URI));
  if (!(repo instanceof TemporaryDatasetRepositoryAccessor)) {
    return repo;
  }
  // Stage job output in a temporary repository under the target's namespace.
  String namespace = load(jobContext).getDataset().getNamespace();
  return ((TemporaryDatasetRepositoryAccessor) repo)
      .getTemporaryRepository(namespace, getJobDatasetName(jobContext));
}
/**
 * Creates a record writer for this task attempt.
 * <p>
 * Writes go to a per-task-attempt view when supported (so failed attempts
 * can be discarded); otherwise directly to the target view. When the
 * dataset is partitioned and {@code KITE_PARTITION_DIR} is set, output is
 * restricted to the partition named by that directory.
 *
 * @param taskAttemptContext the context for the current task attempt
 * @return a writer for entities of type {@code E}
 * @throws UnsupportedOperationException if a partition directory is
 *     configured but the target is not a {@code FileSystemDataset}
 */
@Override
@SuppressWarnings("unchecked")
public RecordWriter<E, Void> getRecordWriter(TaskAttemptContext taskAttemptContext) {
  Configuration conf = Hadoop.TaskAttemptContext
      .getConfiguration.invoke(taskAttemptContext);
  View<E> target = load(taskAttemptContext);
  View<E> working;
  if (usePerTaskAttemptDatasets(target, conf)) {
    working = loadOrCreateTaskAttemptView(taskAttemptContext);
  } else {
    working = target;
  }

  boolean copyRecords = conf.getBoolean(KITE_COPY_RECORDS, false);
  String partitionDir = conf.get(KITE_PARTITION_DIR);
  if (working.getDataset().getDescriptor().isPartitioned() &&
      partitionDir != null) {
    // NOTE(review): the partitioned check is on working but the instanceof
    // check and the partition lookup use target — confirm this asymmetry is
    // intentional when per-task-attempt datasets are in use.
    if (!(target instanceof FileSystemDataset)) {
      throw new UnsupportedOperationException("Partitions only supported for " +
          "FileSystemDataset. Dataset: " + target);
    }
    FileSystemDataset fsDataset = (FileSystemDataset) target;
    PartitionKey key = fsDataset.keyFromDirectory(new Path(partitionDir));
    if (key != null && !key.getValues().isEmpty()) {
      working = fsDataset.getPartition(key, true);
    }
  }
  // Both the partitioned and non-partitioned paths previously returned an
  // identical writer from separate branches; folded into a single return.
  return new DatasetRecordWriter<E>(working, copyRecords);
}
/**
 * Resolves the dataset repository for this job's output URI, switching to a
 * job-scoped temporary repository when the underlying repository supports
 * staging via {@code TemporaryDatasetRepositoryAccessor}.
 */
private static DatasetRepository getDatasetRepository(JobContext jobContext) {
  Configuration conf = Hadoop.JobContext.getConfiguration.invoke(jobContext);
  DatasetRepository outputRepo =
      DatasetRepositories.repositoryFor(conf.get(KITE_OUTPUT_URI));
  if (outputRepo instanceof TemporaryDatasetRepositoryAccessor) {
    Dataset<Object> targetDataset = load(jobContext).getDataset();
    TemporaryDatasetRepositoryAccessor accessor =
        (TemporaryDatasetRepositoryAccessor) outputRepo;
    outputRepo = accessor.getTemporaryRepository(
        targetDataset.getNamespace(), getJobDatasetName(jobContext));
  }
  return outputRepo;
}
/**
 * Creates a record writer for this task attempt.
 * <p>
 * Writes go to a per-task-attempt view when supported (so failed attempts
 * can be discarded); otherwise directly to the target view. When the
 * dataset is partitioned and {@code KITE_PARTITION_DIR} is set, output is
 * restricted to the partition named by that directory.
 *
 * @param taskAttemptContext the context for the current task attempt
 * @return a writer for entities of type {@code E}
 * @throws UnsupportedOperationException if a partition directory is
 *     configured but the target is not a {@code FileSystemDataset}
 */
@Override
@SuppressWarnings("unchecked")
public RecordWriter<E, Void> getRecordWriter(TaskAttemptContext taskAttemptContext) {
  Configuration conf = Hadoop.TaskAttemptContext
      .getConfiguration.invoke(taskAttemptContext);
  View<E> target = load(taskAttemptContext);
  View<E> working;
  if (usePerTaskAttemptDatasets(target)) {
    working = loadOrCreateTaskAttemptView(taskAttemptContext);
  } else {
    working = target;
  }

  boolean copyRecords = conf.getBoolean(KITE_COPY_RECORDS, false);
  String partitionDir = conf.get(KITE_PARTITION_DIR);
  if (working.getDataset().getDescriptor().isPartitioned() &&
      partitionDir != null) {
    // NOTE(review): the partitioned check is on working but the instanceof
    // check and the partition lookup use target — confirm this asymmetry is
    // intentional when per-task-attempt datasets are in use.
    if (!(target instanceof FileSystemDataset)) {
      throw new UnsupportedOperationException("Partitions only supported for " +
          "FileSystemDataset. Dataset: " + target);
    }
    FileSystemDataset fsDataset = (FileSystemDataset) target;
    PartitionKey key = fsDataset.keyFromDirectory(new Path(partitionDir));
    if (key != null && !key.getValues().isEmpty()) {
      working = fsDataset.getPartition(key, true);
    }
  }
  // Both the partitioned and non-partitioned paths previously returned an
  // identical writer from separate branches; folded into a single return.
  return new DatasetRecordWriter<E>(working, copyRecords);
}
/**
 * Selects the output committer for this task attempt: a merging committer
 * when per-task-attempt datasets are in use, otherwise a no-op committer.
 */
@Override
public OutputCommitter getOutputCommitter(TaskAttemptContext taskAttemptContext) {
  Configuration conf =
      Hadoop.TaskAttemptContext.getConfiguration.invoke(taskAttemptContext);
  DefaultConfiguration.init(conf);
  View<E> view = load(taskAttemptContext);
  if (usePerTaskAttemptDatasets(view, conf)) {
    // Per-attempt output must be merged into the job dataset on commit.
    return new MergeOutputCommitter<E>();
  }
  return new NullOutputCommitter();
}
/**
 * Returns the committer used to finalize this task attempt's output:
 * a merge committer when writes are staged per task attempt, otherwise a
 * committer that does nothing.
 */
@Override
public OutputCommitter getOutputCommitter(TaskAttemptContext taskAttemptContext) {
  Configuration conf =
      Hadoop.TaskAttemptContext.getConfiguration.invoke(taskAttemptContext);
  DefaultConfiguration.init(conf);
  View<E> outputView = load(taskAttemptContext);
  boolean stagedPerAttempt = usePerTaskAttemptDatasets(outputView);
  if (stagedPerAttempt) {
    return new MergeOutputCommitter<E>();
  }
  return new NullOutputCommitter();
}
@Override public void checkOutputSpecs(JobContext jobContext) { // The committer setup will fail if the output dataset does not exist View<E> target = load(jobContext); Configuration conf = Hadoop.JobContext.getConfiguration.invoke(jobContext); switch (conf.getEnum(KITE_WRITE_MODE, WriteMode.DEFAULT)) { case APPEND: break; case OVERWRITE: // if the merge won't use replace, then delete the existing data if (!canReplace(target)) { target.deleteAll(); } break; default: case DEFAULT: boolean isReady = false; if (target instanceof Signalable) { isReady = ((Signalable)target).isReady(); } if (isReady || !target.isEmpty()) { throw new DatasetException( "View is not empty or has been signaled as ready: " + target); } break; } }
/**
 * Validates that the job's output view can be written to, honoring the
 * write mode configured under {@code KITE_WRITE_MODE}.
 * <p>
 * APPEND requires no check; OVERWRITE deletes existing data when the merge
 * cannot replace it; DEFAULT (also the fall-through for any other mode)
 * fails if the view is non-empty or has been signaled as ready.
 *
 * @param jobContext the job being validated
 * @throws DatasetException in DEFAULT mode when the view already has data
 *     or has been signaled ready
 */
@Override
public void checkOutputSpecs(JobContext jobContext) {
  // The committer setup will fail if the output dataset does not exist
  View<E> target = load(jobContext);
  Configuration conf = Hadoop.JobContext.getConfiguration.invoke(jobContext);
  switch (conf.getEnum(KITE_WRITE_MODE, WriteMode.DEFAULT)) {
    case APPEND:
      break;
    case OVERWRITE:
      // if the merge won't use replace, then delete the existing data
      if (!canReplace(target)) {
        target.deleteAll();
      }
      break;
    // default precedes DEFAULT so unrecognized modes share the strict check
    default:
    case DEFAULT:
      boolean isReady = false;
      if (target instanceof Signalable) {
        isReady = ((Signalable)target).isReady();
      }
      if (isReady || !target.isEmpty()) {
        throw new DatasetException(
            "View is not empty or has been signaled as ready: " + target);
      }
      break;
  }
}
/**
 * Loads the temporary job dataset, creating it if necessary.
 * <p>
 * The job dataset may already exist if the ApplicationMaster was restarted;
 * a compatible leftover dataset is reused so prior task output survives.
 *
 * @param jobContext the job whose temporary dataset is needed
 * @return the temporary dataset for this job
 */
@SuppressWarnings("unchecked")
private static <E> Dataset<E> loadOrCreateJobDataset(JobContext jobContext) {
  Dataset<Object> dataset = load(jobContext).getDataset();
  String jobDatasetName = getJobDatasetName(jobContext);
  DatasetRepository repo = getDatasetRepository(jobContext);
  if (repo.exists(TEMP_NAMESPACE, jobDatasetName)) {
    Dataset<E> tempDataset = repo.load(TEMP_NAMESPACE, jobDatasetName,
        DatasetKeyOutputFormat.<E>getType(jobContext));
    try {
      Compatibility.checkCompatible(dataset.getDescriptor(),
          tempDataset.getDescriptor());
      return tempDataset;
    } catch (RuntimeException ex) {
      // swallow: an incompatible leftover dataset is deliberately ignored
      // and a fresh one is created below.
      // NOTE(review): the incompatible dataset still exists at this point,
      // so the create() below may fail — confirm whether it should be
      // deleted first.
    }
  }
  return repo.create(TEMP_NAMESPACE, jobDatasetName,
      copy(dataset.getDescriptor()),
      DatasetKeyOutputFormat.<E>getType(jobContext));
}
/** * The job dataset may already exist if the ApplicationMaster was restarted */ @SuppressWarnings("unchecked") private static <E> Dataset<E> loadOrCreateJobDataset(JobContext jobContext) { Dataset<Object> dataset = load(jobContext).getDataset(); String jobDatasetName = getJobDatasetName(jobContext); DatasetRepository repo = getDatasetRepository(jobContext); if (repo.exists(TEMP_NAMESPACE, jobDatasetName)) { Dataset<E> tempDataset = repo.load(TEMP_NAMESPACE, jobDatasetName, DatasetKeyOutputFormat.<E>getType(jobContext)); try { Compatibility.checkCompatible(dataset.getDescriptor(), tempDataset.getDescriptor()); return tempDataset; } catch (RuntimeException ex) { // swallow } } return repo.create(TEMP_NAMESPACE, jobDatasetName, copy(dataset.getDescriptor()), DatasetKeyOutputFormat.<E>getType(jobContext)); }