/**
 * Sets the path format used for each partition when using TimePartitionedFileSet as an
 * OutputFormatProvider. It's used to generate the end of the output file path for each partition.
 *
 * <p>This overload delegates to the three-argument {@code setOutputPathFormat}, passing
 * {@code null} as the third argument.
 *
 * @param arguments the argument map that the path format is recorded in
 * @param pathFormat The format for the path; for example: 'yyyy-MM-dd/HH-mm,America/Los_Angeles',
 *                   which will create a file path ending in the format of 2015-01-01/20-42,
 *                   with the time of the partition being the time in the timezone of Los Angeles (PST or PDT).
 *                   The pathFormat will be the format provided to
 *                   {@link java.text.SimpleDateFormat}. If left blank, then the partitions will be of the form
 *                   2015-01-01/20-42.142017372000, with the time being the time UTC.
 *                   Note that each partition must have a unique file path or a runtime exception will be thrown.
 *                   NOTE(review): the comma-separated time zone in the example and the "time UTC" statement
 *                   cannot be confirmed from this method alone - verify against the code that renders the path.
 */
public static void setOutputPathFormat(Map<String, String> arguments, String pathFormat) {
  // No time zone is supplied by this overload, hence the null third argument.
  setOutputPathFormat(arguments, pathFormat, null);
}
@Override @Nullable protected Collection<PartitionKey> computeInputKeys() { Long startTime = TimePartitionedFileSetArguments.getInputStartTime(getRuntimeArguments()); Long endTime = TimePartitionedFileSetArguments.getInputEndTime(getRuntimeArguments()); if (startTime == null && endTime == null) { // no times specified; perhaps a partition filter was specified. super will deal with that return super.computeInputKeys(); } if (startTime == null) { throw new DataSetException("Start time for input time range must be given as argument."); } if (endTime == null) { throw new DataSetException("End time for input time range must be given as argument."); } return getPartitionPathsByTime(startTime, endTime); }
protected Map<String, String> updateArgumentsIfNeeded(Map<String, String> arguments) { Long time = TimePartitionedFileSetArguments.getOutputPartitionTime(arguments); if (time != null) { // set the output path according to partition time if (FileSetArguments.getOutputPath(arguments) == null) { String outputPathFormat = TimePartitionedFileSetArguments.getOutputPathFormat(arguments); String path; if (Strings.isNullOrEmpty(outputPathFormat)) { path = String.format("%tF/%tH-%tM.%d", time, time, time, time); } else { SimpleDateFormat format = new SimpleDateFormat(outputPathFormat); String timeZoneID = TimePartitionedFileSetArguments.getOutputPathTimeZone(arguments); if (!Strings.isNullOrEmpty(timeZoneID)) { format.setTimeZone(TimeZone.getTimeZone(timeZoneID)); } path = format.format(new Date(time)); } arguments = Maps.newHashMap(arguments); FileSetArguments.setOutputPath(arguments, path); } // add the corresponding partition key to the arguments PartitionKey outputKey = TimePartitionedFileSetDataset.partitionKeyForTime(time); PartitionedFileSetArguments.setOutputPartitionKey(arguments, outputKey); } // delegate to super class for anything it needs to do return updateArgumentsIfNeeded(arguments, TimePartitionedFileSetDataset.PARTITIONING); } }
@Override public void run(JavaSparkExecutionContext sec) throws Exception { JavaSparkContext jsc = new JavaSparkContext(); String input = sec.getRuntimeArguments().get("input"); String output = sec.getRuntimeArguments().get("output"); // read the dataset JavaPairRDD<Long, String> inputData = sec.fromDataset(input); JavaPairRDD<String, Integer> stringLengths = transformRDD(inputData); // write the character count to dataset sec.saveAsDataset(stringLengths, output); String inputPartitionTime = sec.getRuntimeArguments().get("inputKey"); String outputPartitionTime = sec.getRuntimeArguments().get("outputKey"); // read and write datasets with dataset arguments if (inputPartitionTime != null && outputPartitionTime != null) { Map<String, String> inputArgs = new HashMap<>(); TimePartitionedFileSetArguments.setInputStartTime(inputArgs, Long.parseLong(inputPartitionTime) - 100); TimePartitionedFileSetArguments.setInputEndTime(inputArgs, Long.parseLong(inputPartitionTime) + 100); // read the dataset with user custom dataset args JavaPairRDD<Long, String> customPartitionData = sec.fromDataset(input, inputArgs); // create a new RDD with the same key but with a new value which is the length of the string JavaPairRDD<String, Integer> customPartitionStringLengths = transformRDD(customPartitionData); // write the character count to dataset with user custom dataset args Map<String, String> outputArgs = new HashMap<>(); TimePartitionedFileSetArguments.setOutputPartitionTime(outputArgs, Long.parseLong(outputPartitionTime)); sec.saveAsDataset(customPartitionStringLengths, output, outputArgs); } }
TimePartitionedFileSetArguments.setOutputPartitionTime(outputArgs, time); final ImmutableMap<String, String> assignedMetadata = ImmutableMap.of("region", "13", "data.source.name", "input", "data.source.type", "table"); TimePartitionedFileSetArguments.setOutputPartitionMetadata(outputArgs, assignedMetadata); runtimeArguments.putAll(RuntimeArguments.addScope(Scope.DATASET, TIME_PARTITIONED, outputArgs)); Assert.assertTrue( TimePartitionedFileSetArguments.setOutputPartitionTime(outputArgs, time5); runtimeArguments.putAll(RuntimeArguments.addScope(Scope.DATASET, TIME_PARTITIONED, outputArgs)); TimePartitionedFileSetArguments.setInputStartTime(inputArgs, time - TimeUnit.MINUTES.toMillis(5)); TimePartitionedFileSetArguments.setInputEndTime(inputArgs, time5 + TimeUnit.MINUTES.toMillis(5)); runtimeArguments.putAll(RuntimeArguments.addScope(Scope.DATASET, TIME_PARTITIONED, inputArgs)); runtimeArguments.put(AppWithTimePartitionedFileSet.ROW_TO_WRITE, "a"); TimePartitionedFileSetArguments.setInputStartTime(inputArgs, time - TimeUnit.MINUTES.toMillis(5)); TimePartitionedFileSetArguments.setInputEndTime(inputArgs, time + TimeUnit.MINUTES.toMillis(2)); runtimeArguments.putAll(RuntimeArguments.addScope(Scope.DATASET, TIME_PARTITIONED, inputArgs)); runtimeArguments.put(AppWithTimePartitionedFileSet.ROW_TO_WRITE, "b"); TimePartitionedFileSetArguments.setInputStartTime(inputArgs, time - TimeUnit.MINUTES.toMillis(10)); TimePartitionedFileSetArguments.setInputEndTime(inputArgs, time - TimeUnit.MINUTES.toMillis(9)); runtimeArguments.putAll(RuntimeArguments.addScope(Scope.DATASET, TIME_PARTITIONED, inputArgs)); runtimeArguments.put(AppWithTimePartitionedFileSet.ROW_TO_WRITE, "n");
throws IOException, DatasetManagementException, InterruptedException, TransactionFailureException { Map<String, String> arguments = Maps.newHashMap(); TimePartitionedFileSetArguments.setInputStartTime(arguments, time + start * MINUTE); TimePartitionedFileSetArguments.setInputEndTime(arguments, time + end * MINUTE); final TimePartitionedFileSet tpfs = dsFrameworkUtil.getInstance(TPFS_INSTANCE, arguments); TransactionAware txAwareDataset = (TransactionAware) tpfs;
TimePartitionedFileSetArguments.setInputStartTime(arguments, time8 - 30 * MINUTE); TimePartitionedFileSetArguments.setInputEndTime(arguments, time8 + 30 * MINUTE); testInputConfiguration(arguments, path8); TimePartitionedFileSetArguments.setInputPartitionFilter(arguments, filter9); testInputConfiguration(arguments, path8); TimePartitionedFileSetArguments.setInputPartitionFilter(arguments, filter9); testInputConfiguration(arguments, path9); TimePartitionedFileSetArguments.setInputStartTime(arguments, time8 + 30 * MINUTE); testInputConfigurationFailure(arguments, " with only a start time"); arguments.clear(); TimePartitionedFileSetArguments.setInputEndTime(arguments, time8 + 30 * MINUTE); testInputConfigurationFailure(arguments, " with only an end time");
TimePartitionedFileSetArguments.setOutputPartitionTime(args, date.getTime()); TimeZone timeZone = Calendar.getInstance().getTimeZone(); TimePartitionedFileSetArguments.setOutputPathFormat(args, "yyyy-MM-dd/HH_mm", timeZone.getID()); TimePartitionedFileSet ds = dsFrameworkUtil.getInstance(TPFS_INSTANCE, args); .build(); TimePartitionedFileSet ds1 = dsFrameworkUtil.getInstance(TPFS_INSTANCE, args); TimePartitionedFileSetArguments.setOutputPartitionKey(args, key); outputConfig = ds1.getOutputFormatConfiguration(); Assert.assertTrue(outputConfig.get(FileOutputFormat.OUTDIR).endsWith("2015-01-01/20_42")); TimePartitionedFileSetArguments.setOutputPartitionKey(args, key); TimePartitionedFileSet ds2 = dsFrameworkUtil.getInstance(TPFS_INSTANCE, args); outputConfig = ds2.getOutputFormatConfiguration();
TimePartitionedFileSetArguments.setOutputPartitionTime(sinkArgs, outputPartitionTime); if (!Strings.isNullOrEmpty(tpfsSinkConfig.filePathFormat)) { TimePartitionedFileSetArguments.setOutputPathFormat(sinkArgs, tpfsSinkConfig.filePathFormat, tpfsSinkConfig.timeZone);
TimePartitionedFileSetArguments.setInputStartTime(inputArgs, inputTime - 100); TimePartitionedFileSetArguments.setInputEndTime(inputArgs, inputTime + 100); Map<String, String> outputArgs = new HashMap<>(); TimePartitionedFileSetArguments.setOutputPartitionTime(outputArgs, outputTime); Map<String, String> args = new HashMap<>(); args.putAll(RuntimeArguments.addScope(Scope.DATASET, "tpfs", inputArgs));
@Override public void prepareRun(BatchSourceContext context) throws DatasetManagementException, InstantiationException { config.validate(); InputFormatProvider inputFormatProvider = context.newPluginInstance(FORMAT_PLUGIN_ID); DatasetProperties datasetProperties = createProperties(inputFormatProvider); // If macros provided at runtime, dataset still needs to be created if (!context.datasetExists(config.getName())) { String tpfsName = config.getName(); context.createDataset(tpfsName, TimePartitionedFileSet.class.getName(), datasetProperties); } Schema schema = config.getSchema(); if (schema.getFields() != null) { String formatName = getInputFormatName(); FieldOperation operation = new FieldReadOperation("Read", String.format("Read from TimePartitionedFileSet in %s format.", formatName), EndPoint.of(context.getNamespace(), config.getName()), schema.getFields().stream().map(Schema.Field::getName).collect(Collectors.toList())); context.record(Collections.singletonList(operation)); } long duration = TimeParser.parseDuration(config.getDuration()); long delay = Strings.isNullOrEmpty(config.getDelay()) ? 0 : TimeParser.parseDuration(config.getDelay()); long endTime = context.getLogicalStartTime() - delay; long startTime = endTime - duration; Map<String, String> sourceArgs = Maps.newHashMap(datasetProperties.getProperties()); TimePartitionedFileSetArguments.setInputStartTime(sourceArgs, startTime); TimePartitionedFileSetArguments.setInputEndTime(sourceArgs, endTime); context.setInput(Input.ofDataset(config.getName(), sourceArgs)); }
protected Map<String, String> updateArgumentsIfNeeded(Map<String, String> arguments) { Long time = TimePartitionedFileSetArguments.getOutputPartitionTime(arguments); if (time != null) { // set the output path according to partition time if (FileSetArguments.getOutputPath(arguments) == null) { String outputPathFormat = TimePartitionedFileSetArguments.getOutputPathFormat(arguments); String path; if (Strings.isNullOrEmpty(outputPathFormat)) { path = String.format("%tF/%tH-%tM.%d", time, time, time, time); } else { SimpleDateFormat format = new SimpleDateFormat(outputPathFormat); String timeZoneID = TimePartitionedFileSetArguments.getOutputPathTimeZone(arguments); if (!Strings.isNullOrEmpty(timeZoneID)) { format.setTimeZone(TimeZone.getTimeZone(timeZoneID)); } path = format.format(new Date(time)); } arguments = Maps.newHashMap(arguments); FileSetArguments.setOutputPath(arguments, path); } // add the corresponding partition key to the arguments PartitionKey outputKey = TimePartitionedFileSetDataset.partitionKeyForTime(time); PartitionedFileSetArguments.setOutputPartitionKey(arguments, outputKey); } // delegate to super class for anything it needs to do return updateArgumentsIfNeeded(arguments, TimePartitionedFileSetDataset.PARTITIONING); } }
@Override @Nullable protected Collection<PartitionKey> computeInputKeys() { Long startTime = TimePartitionedFileSetArguments.getInputStartTime(getRuntimeArguments()); Long endTime = TimePartitionedFileSetArguments.getInputEndTime(getRuntimeArguments()); if (startTime == null && endTime == null) { // no times specified; perhaps a partition filter was specified. super will deal with that return super.computeInputKeys(); } if (startTime == null) { throw new DataSetException("Start time for input time range must be given as argument."); } if (endTime == null) { throw new DataSetException("End time for input time range must be given as argument."); } return getPartitionPathsByTime(startTime, endTime); }