@Override
public JsonElement serialize(OutputFormatProvider src, Type typeOfSrc, JsonSerializationContext context) {
  JsonObject jsonObj = new JsonObject();
  jsonObj.addProperty("outputFormatClass", src.getOutputFormatClassName());
  jsonObj.add("outputFormatConfig", context.serialize(src.getOutputFormatConfiguration()));
  return jsonObj;
}
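// A hedged companion sketch: the matching Gson deserializer for the serializer above, assuming the
// codec also implements JsonDeserializer<OutputFormatProvider>. It reads back the same
// "outputFormatClass" and "outputFormatConfig" fields; rebuilding the result with the
// BasicOutputFormatProvider seen later in this section is an assumption, not the project's actual code.
@Override
public OutputFormatProvider deserialize(JsonElement json, Type typeOfT, JsonDeserializationContext context)
  throws JsonParseException {
  JsonObject jsonObj = json.getAsJsonObject();
  String outputFormatClassName = jsonObj.get("outputFormatClass").getAsString();
  Map<String, String> outputFormatConfig = context.deserialize(
    jsonObj.get("outputFormatConfig"), new TypeToken<Map<String, String>>() { }.getType());
  return new BasicOutputFormatProvider(outputFormatClassName, outputFormatConfig);
}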
public DatasetOutputFormatProvider(String namespace, String datasetName,
                                   Map<String, String> datasetArgs, Dataset dataset) {
  if (dataset instanceof OutputFormatProvider) {
    this.outputFormatClassName = ((OutputFormatProvider) dataset).getOutputFormatClassName();
    this.configuration = ((OutputFormatProvider) dataset).getOutputFormatConfiguration();
  } else if (dataset instanceof BatchWritable) {
    this.outputFormatClassName = MapReduceBatchWritableOutputFormat.class.getName();
    this.configuration = createDatasetConfiguration(namespace, datasetName, datasetArgs);
  } else {
    throw new IllegalArgumentException(
      "Dataset '" + dataset + "' is neither OutputFormatProvider nor BatchWritable.");
  }
  this.dataset = dataset;
}
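// A minimal sketch of the OutputFormatProvider contract the constructor above branches on: a provider
// simply names a Hadoop OutputFormat class and supplies its configuration as a string map. The class,
// the use of org.apache.hadoop.mapreduce.lib.output.TextOutputFormat, and the output path are
// illustrative assumptions, not part of the original code.
public class TextFileOutputFormatProvider implements OutputFormatProvider {
  private final String outputDir;

  public TextFileOutputFormatProvider(String outputDir) {
    this.outputDir = outputDir;
  }

  @Override
  public String getOutputFormatClassName() {
    return TextOutputFormat.class.getName();
  }

  @Override
  public Map<String, String> getOutputFormatConfiguration() {
    // FileOutputFormat.OUTDIR is the standard Hadoop property naming the output directory
    return Collections.singletonMap(FileOutputFormat.OUTDIR, outputDir);
  }
}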
public ProvidedOutput(Output originalOutput, OutputFormatProvider outputFormatProvider) {
  this.output = originalOutput;
  this.outputFormatProvider = outputFormatProvider;
  this.outputFormatClassName = outputFormatProvider.getOutputFormatClassName();
  this.outputFormatConfiguration = outputFormatProvider.getOutputFormatConfiguration();
  if (outputFormatClassName == null) {
    throw new IllegalArgumentException(
      String.format("Output '%s' provided null as the output format", output.getAlias()));
  }
  if (outputFormatConfiguration == null) {
    throw new IllegalArgumentException(
      String.format("Output '%s' provided null as the output format configuration", output.getAlias()));
  }
}
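// Hedged usage sketch for the constructor above: pair an Output with the provider it wraps. The alias,
// the path, and BasicOutputFormatProvider (whose assumed shape is sketched later in this section) are
// illustrative, not taken from the original code.
OutputFormatProvider provider = new BasicOutputFormatProvider(
  TextOutputFormat.class.getName(),
  Collections.singletonMap(FileOutputFormat.OUTDIR, "/tmp/example-output")); // hypothetical path
ProvidedOutput providedOutput = new ProvidedOutput(Output.of("exampleAlias", provider), provider);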
public <K, V> void writeFromRDD(JavaPairRDD<K, V> rdd, JavaSparkExecutionContext sec, String sinkName,
                                Class<K> keyClass, Class<V> valueClass) {
  Set<String> outputNames = sinkOutputs.get(sinkName);
  if (outputNames == null || outputNames.isEmpty()) {
    // should never happen if validation happened correctly at pipeline configure time
    throw new IllegalArgumentException(sinkName + " has no outputs. "
      + "Please check that the sink calls addOutput at some point.");
  }
  for (String outputName : outputNames) {
    OutputFormatProvider outputFormatProvider = outputFormatProviders.get(outputName);
    if (outputFormatProvider != null) {
      Configuration hConf = new Configuration();
      hConf.clear();
      for (Map.Entry<String, String> entry : outputFormatProvider.getOutputFormatConfiguration().entrySet()) {
        hConf.set(entry.getKey(), entry.getValue());
      }
      hConf.set(MRJobConfig.OUTPUT_FORMAT_CLASS_ATTR, outputFormatProvider.getOutputFormatClassName());
      rdd.saveAsNewAPIHadoopDataset(hConf);
    }
    DatasetInfo datasetInfo = datasetInfos.get(outputName);
    if (datasetInfo != null) {
      sec.saveAsDataset(rdd, datasetInfo.getDatasetName(), datasetInfo.getDatasetArgs());
    }
  }
}
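// For context, a hedged sketch of the minimal Configuration that rdd.saveAsNewAPIHadoopDataset expects.
// Because the configuration above is built solely from the provider's map, that map must also carry the
// key/value classes and, for file-based formats, the output directory. The concrete format, the
// org.apache.hadoop.io.Text key/value classes, and the path below are illustrative assumptions.
Configuration hConf = new Configuration();
hConf.set(MRJobConfig.OUTPUT_FORMAT_CLASS_ATTR, TextOutputFormat.class.getName());
hConf.set(MRJobConfig.OUTPUT_KEY_CLASS, Text.class.getName());
hConf.set(MRJobConfig.OUTPUT_VALUE_CLASS, Text.class.getName());
hConf.set(FileOutputFormat.OUTDIR, "/tmp/example-output"); // hypothetical path
rdd.saveAsNewAPIHadoopDataset(hConf);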
private DatasetProperties createProperties(OutputFormatProvider outputFormatProvider) {
  FileSetProperties.Builder fileProperties = SnapshotFileSet.getBaseProperties(config);
  addFileProperties(fileProperties);
  fileProperties.setOutputFormat(outputFormatProvider.getOutputFormatClassName());
  for (Map.Entry<String, String> formatProperty : outputFormatProvider.getOutputFormatConfiguration().entrySet()) {
    fileProperties.setOutputProperty(formatProperty.getKey(), formatProperty.getValue());
  }
  return fileProperties.build();
}
private DatasetProperties createProperties(OutputFormatProvider outputFormatProvider) {
  FileSetProperties.Builder properties = FileSetProperties.builder();
  if (!Strings.isNullOrEmpty(tpfsSinkConfig.basePath)) {
    properties.setBasePath(tpfsSinkConfig.basePath);
  }
  properties.setOutputFormat(outputFormatProvider.getOutputFormatClassName());
  for (Map.Entry<String, String> formatProperty : outputFormatProvider.getOutputFormatConfiguration().entrySet()) {
    properties.setOutputProperty(formatProperty.getKey(), formatProperty.getValue());
  }
  addFileSetProperties(properties);
  return properties.build();
}
/**
 * Sets output formats and corresponding properties.
 *
 * @param job hadoop job on which configurations will be set
 * @param outputsMap list of outputs
 */
public static void setOutputs(Job job, List<ProvidedOutput> outputsMap) throws ClassNotFoundException {
  OutputFormatProvider rootOutputFormatProvider = getRootOutputFormatProvider(job, outputsMap);
  // set the root output format and its configuration for the Hadoop job
  job.getConfiguration().set(ROOT_OUTPUT_FORMAT, rootOutputFormatProvider.getOutputFormatClassName());
  for (Map.Entry<String, String> confEntry : rootOutputFormatProvider.getOutputFormatConfiguration().entrySet()) {
    job.getConfiguration().set(confEntry.getKey(), confEntry.getValue());
  }
  for (ProvidedOutput output : outputsMap) {
    MultipleOutputs.addNamedOutput(job, output.getOutput().getAlias(), output.getOutputFormatClassName(),
                                   job.getOutputKeyClass(), job.getOutputValueClass(),
                                   output.getOutputFormatConfiguration());
  }
}
void addOutput(String stageName, Output output) {
  if (output instanceof Output.DatasetOutput) {
    // note: if an output format provider is trackable, it comes in as a DatasetOutput
    Output.DatasetOutput datasetOutput = (Output.DatasetOutput) output;
    addOutput(stageName, datasetOutput.getName(), datasetOutput.getAlias(), datasetOutput.getArguments());
  } else if (output instanceof Output.OutputFormatProviderOutput) {
    Output.OutputFormatProviderOutput ofpOutput = (Output.OutputFormatProviderOutput) output;
    addOutput(stageName, ofpOutput.getAlias(),
              new BasicOutputFormatProvider(ofpOutput.getOutputFormatProvider().getOutputFormatClassName(),
                                            ofpOutput.getOutputFormatProvider().getOutputFormatConfiguration()));
  } else {
    throw new IllegalArgumentException("Unknown output format type: " + output.getClass().getCanonicalName());
  }
}
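// BasicOutputFormatProvider is not shown in this section; inferred from its usage above, a plausible
// minimal shape is a plain value holder for the class name and configuration map. This sketch is an
// assumption about its implementation, not the actual source.
public class BasicOutputFormatProvider implements OutputFormatProvider {
  private final String outputFormatClassName;
  private final Map<String, String> configuration;

  public BasicOutputFormatProvider(String outputFormatClassName, Map<String, String> configuration) {
    this.outputFormatClassName = outputFormatClassName;
    this.configuration = configuration;
  }

  @Override
  public String getOutputFormatClassName() {
    return outputFormatClassName;
  }

  @Override
  public Map<String, String> getOutputFormatConfiguration() {
    return configuration;
  }
}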
@Override
public void configurePipeline(PipelineConfigurer pipelineConfigurer) {
  tpfsSinkConfig.validate();
  String outputFormatName = getOutputFormatName();
  OutputFormatProvider outputFormatProvider = pipelineConfigurer.usePlugin("outputformat", outputFormatName,
                                                                           FORMAT_PLUGIN_ID,
                                                                           tpfsSinkConfig.getProperties());
  if (outputFormatProvider == null) {
    throw new IllegalArgumentException(
      String.format("Could not find the '%s' output format plugin. "
                      + "Please ensure the '%s' format plugin is installed.", outputFormatName, outputFormatName));
  }
  // fetch the output format configuration to give the output format plugin a chance to validate its config
  // and fail pipeline deployment if it is invalid
  outputFormatProvider.getOutputFormatConfiguration();
  // create the dataset at configure time if no macros were provided on necessary fields
  if (!tpfsSinkConfig.containsMacro("name") && !tpfsSinkConfig.containsMacro("basePath")
      && !tpfsSinkConfig.containsMacro("schema")) {
    pipelineConfigurer.createDataset(tpfsSinkConfig.name, TimePartitionedFileSet.class.getName(),
                                     createProperties(outputFormatProvider));
  }
}
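// A hedged sketch of what an "outputformat" plugin resolved by usePlugin above might look like,
// assuming the standard CDAP @Plugin/@Name annotations apply to this plugin type. The "text" name
// and the TextOutputFormat class are illustrative assumptions, not taken from the original code.
@Plugin(type = "outputformat")
@Name("text")
public class TextOutputFormatPlugin implements OutputFormatProvider {

  @Override
  public String getOutputFormatClassName() {
    return TextOutputFormat.class.getName();
  }

  @Override
  public Map<String, String> getOutputFormatConfiguration() {
    return Collections.emptyMap();
  }
}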
@Override
public final void prepareRun(BatchSinkContext context) throws InstantiationException {
  config.validate();
  // set format-specific properties
  OutputFormatProvider outputFormatProvider = context.newPluginInstance(FORMAT_PLUGIN_ID);
  // record field-level lineage information; this needs to happen before context.addOutput(),
  // otherwise an external dataset without schema will be created
  Schema schema = config.getSchema();
  if (schema == null) {
    schema = context.getInputSchema();
  }
  LineageRecorder lineageRecorder = new LineageRecorder(context, config.getReferenceName());
  lineageRecorder.createExternalDataset(schema);
  if (schema != null && schema.getFields() != null && !schema.getFields().isEmpty()) {
    recordLineage(lineageRecorder,
                  schema.getFields().stream().map(Schema.Field::getName).collect(Collectors.toList()));
  }
  Map<String, String> outputProperties = new HashMap<>(outputFormatProvider.getOutputFormatConfiguration());
  outputProperties.putAll(getFileSystemProperties(context));
  outputProperties.put(FileOutputFormat.OUTDIR, getOutputDir(context.getLogicalStartTime()));
  context.addOutput(Output.of(config.getReferenceName(),
                              new SinkOutputFormatProvider(outputFormatProvider.getOutputFormatClassName(),
                                                           outputProperties)));
}
@Override
public void configurePipeline(PipelineConfigurer pipelineConfigurer) {
  config.validate();
  String outputFormatName = getOutputFormatPlugin();
  OutputFormatProvider outputFormatProvider = pipelineConfigurer.usePlugin("outputformat", outputFormatName,
                                                                           FORMAT_PLUGIN_ID,
                                                                           config.getProperties());
  if (outputFormatProvider == null) {
    throw new IllegalArgumentException(
      String.format("Could not find the '%s' output format plugin. "
                      + "Please ensure the '%s' format plugin is installed.", outputFormatName, outputFormatName));
  }
  // fetch the output format configuration to give the plugin a chance to validate its config properties
  outputFormatProvider.getOutputFormatConfiguration();
  if (!config.containsMacro("name") && !config.containsMacro("basePath")
      && !config.containsMacro("fileProperties")) {
    // createProperties() already applies SnapshotFileSet.getBaseProperties() and addFileProperties(),
    // so the dataset properties are built in one place
    pipelineConfigurer.createDataset(config.getName(), PartitionedFileSet.class,
                                     createProperties(outputFormatProvider));
  }
}