@Override
public void prepareRun(BatchSourceContext context) throws DatasetManagementException, InstantiationException {
  config.validate();
  InputFormatProvider inputFormatProvider = context.newPluginInstance(FORMAT_PLUGIN_ID);
  DatasetProperties datasetProperties = createProperties(inputFormatProvider);

  // If macros were provided at runtime, the dataset still needs to be created
  if (!context.datasetExists(config.getName())) {
    String tpfsName = config.getName();
    context.createDataset(tpfsName, TimePartitionedFileSet.class.getName(), datasetProperties);
  }

  Schema schema = config.getSchema();
  if (schema.getFields() != null) {
    String formatName = getInputFormatName();
    FieldOperation operation = new FieldReadOperation(
      "Read", String.format("Read from TimePartitionedFileSet in %s format.", formatName),
      EndPoint.of(context.getNamespace(), config.getName()),
      schema.getFields().stream().map(Schema.Field::getName).collect(Collectors.toList()));
    context.record(Collections.singletonList(operation));
  }

  long duration = TimeParser.parseDuration(config.getDuration());
  long delay = Strings.isNullOrEmpty(config.getDelay()) ? 0 : TimeParser.parseDuration(config.getDelay());
  long endTime = context.getLogicalStartTime() - delay;
  long startTime = endTime - duration;

  Map<String, String> sourceArgs = Maps.newHashMap(datasetProperties.getProperties());
  TimePartitionedFileSetArguments.setInputStartTime(sourceArgs, startTime);
  TimePartitionedFileSetArguments.setInputEndTime(sourceArgs, endTime);
  context.setInput(Input.ofDataset(config.getName(), sourceArgs));
}
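A minimal, self-contained sketch (not from the source) of the read-window arithmetic above: with a duration of "1h" and a delay of "10m", a logical start time T yields the window from T - 70 min to T - 10 min.

import java.util.concurrent.TimeUnit;

public class ReadWindowSketch {
  public static void main(String[] args) {
    long logicalStartTime = System.currentTimeMillis(); // stands in for context.getLogicalStartTime()
    long duration = TimeUnit.HOURS.toMillis(1);         // the millis that TimeParser.parseDuration("1h") yields
    long delay = TimeUnit.MINUTES.toMillis(10);         // the millis that TimeParser.parseDuration("10m") yields
    long endTime = logicalStartTime - delay;            // stop "delay" before the run's logical time
    long startTime = endTime - duration;                // go back "duration" from there
    System.out.println("read window: " + startTime + " .. " + endTime);
  }
}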
@Override
public void run(JavaSparkExecutionContext sec) throws Exception {
  JavaSparkContext jsc = new JavaSparkContext();
  String input = sec.getRuntimeArguments().get("input");
  String output = sec.getRuntimeArguments().get("output");

  // read the dataset
  JavaPairRDD<Long, String> inputData = sec.fromDataset(input);
  // create a new RDD where each line of text is paired with its length
  JavaPairRDD<String, Integer> stringLengths = transformRDD(inputData);
  // write the string lengths to the dataset
  sec.saveAsDataset(stringLengths, output);

  String inputPartitionTime = sec.getRuntimeArguments().get("inputKey");
  String outputPartitionTime = sec.getRuntimeArguments().get("outputKey");
  // read and write the datasets again, this time with custom dataset arguments
  if (inputPartitionTime != null && outputPartitionTime != null) {
    Map<String, String> inputArgs = new HashMap<>();
    TimePartitionedFileSetArguments.setInputStartTime(inputArgs, Long.parseLong(inputPartitionTime) - 100);
    TimePartitionedFileSetArguments.setInputEndTime(inputArgs, Long.parseLong(inputPartitionTime) + 100);

    // read the dataset with the custom input arguments
    JavaPairRDD<Long, String> customPartitionData = sec.fromDataset(input, inputArgs);
    JavaPairRDD<String, Integer> customPartitionStringLengths = transformRDD(customPartitionData);

    // write the string lengths to the dataset with a custom output partition time
    Map<String, String> outputArgs = new HashMap<>();
    TimePartitionedFileSetArguments.setOutputPartitionTime(outputArgs, Long.parseLong(outputPartitionTime));
    sec.saveAsDataset(customPartitionStringLengths, output, outputArgs);
  }
}
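The transformRDD helper is referenced above but not shown. A hypothetical sketch consistent with the surrounding comments (the method name comes from the excerpt; the body here is an assumption):

// Hypothetical body: pair each line of text with its length, dropping the Long offset key.
// Requires: import org.apache.spark.api.java.JavaPairRDD; import scala.Tuple2;
private static JavaPairRDD<String, Integer> transformRDD(JavaPairRDD<Long, String> inputData) {
  return inputData.mapToPair(pair -> new Tuple2<>(pair._2(), pair._2().length()));
}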
Map<String, String> arguments = Maps.newHashMap();
TimePartitionedFileSetArguments.setInputStartTime(arguments, time + start * MINUTE);
TimePartitionedFileSetArguments.setInputEndTime(arguments, time + end * MINUTE);
final TimePartitionedFileSet tpfs = dsFrameworkUtil.getInstance(TPFS_INSTANCE, arguments);
TransactionAware txAwareDataset = (TransactionAware) tpfs;
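A sketch of how such an instance is then typically exercised, assuming DatasetFrameworkTestUtil exposes a transaction-executor helper (the helper's name and the loop body here are assumptions):

// run the read inside a transaction and list the partitions selected by the input time range
dsFrameworkUtil.newTransactionExecutor(txAwareDataset).execute(() -> {
  for (TimePartitionDetail partition : tpfs.getPartitionsByTime(time + start * MINUTE, time + end * MINUTE)) {
    System.out.println(partition.getTime() + " -> " + partition.getRelativePath());
  }
});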
TimePartitionedFileSetArguments.setInputEndTime(arguments, time8 + 30 * MINUTE);
testInputConfiguration(arguments, path8);

// configuring input must fail when only one of the two bounds is set
arguments.clear();
TimePartitionedFileSetArguments.setInputStartTime(arguments, time8 - 30 * MINUTE);
testInputConfigurationFailure(arguments, " with only a start time");
arguments.clear();
TimePartitionedFileSetArguments.setInputEndTime(arguments, time8 + 30 * MINUTE);
testInputConfigurationFailure(arguments, " with only an end time");
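Why the last case fails, sketched with the argument getters (assuming the standard TimePartitionedFileSetArguments getters): after arguments.clear(), only the end bound remains, and input can only be configured when both bounds are present.

Long startBound = TimePartitionedFileSetArguments.getInputStartTime(arguments); // null: cleared and never re-set
Long endBound = TimePartitionedFileSetArguments.getInputEndTime(arguments);     // time8 + 30 * MINUTE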
Map<String, String> inputArgs = new HashMap<>();
// (declaration and start bound assumed, mirroring the Spark example above: an end bound alone is not a valid input range)
TimePartitionedFileSetArguments.setInputStartTime(inputArgs, inputTime - 100);
TimePartitionedFileSetArguments.setInputEndTime(inputArgs, inputTime + 100);
Map<String, String> outputArgs = new HashMap<>();
TimePartitionedFileSetArguments.setOutputPartitionTime(outputArgs, outputTime);
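A sketch (with placeholder dataset names) of how these maps would then be scoped into a program's runtime arguments, following the same pattern as the test snippet below:

Map<String, String> runtimeArguments = new HashMap<>();
// "inputTPFS" and "outputTPFS" are placeholder dataset names
runtimeArguments.putAll(RuntimeArguments.addScope(Scope.DATASET, "inputTPFS", inputArgs));
runtimeArguments.putAll(RuntimeArguments.addScope(Scope.DATASET, "outputTPFS", outputArgs));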
Map<String, String> inputArgs = Maps.newHashMap();
// window [time - 5 min, time5 + 5 min]: read everything, write row "a"
TimePartitionedFileSetArguments.setInputStartTime(inputArgs, time - TimeUnit.MINUTES.toMillis(5));
TimePartitionedFileSetArguments.setInputEndTime(inputArgs, time5 + TimeUnit.MINUTES.toMillis(5));
runtimeArguments.putAll(RuntimeArguments.addScope(Scope.DATASET, TIME_PARTITIONED, inputArgs));
runtimeArguments.put(AppWithTimePartitionedFileSet.ROW_TO_WRITE, "a");

// shrink the window to [time - 5 min, time + 2 min]: only the earlier partitions are read, write row "b"
TimePartitionedFileSetArguments.setInputEndTime(inputArgs, time + TimeUnit.MINUTES.toMillis(2));
runtimeArguments.putAll(RuntimeArguments.addScope(Scope.DATASET, TIME_PARTITIONED, inputArgs));
runtimeArguments.put(AppWithTimePartitionedFileSet.ROW_TO_WRITE, "b");

// move the end before the start (time - 9 min): the window is empty, no partitions are read, write row "n"
TimePartitionedFileSetArguments.setInputEndTime(inputArgs, time - TimeUnit.MINUTES.toMillis(9));
runtimeArguments.putAll(RuntimeArguments.addScope(Scope.DATASET, TIME_PARTITIONED, inputArgs));
runtimeArguments.put(AppWithTimePartitionedFileSet.ROW_TO_WRITE, "n");
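What RuntimeArguments.addScope contributes above, illustrated with a placeholder key (the real TPFS argument keys are set internally by TimePartitionedFileSetArguments): each entry is prefixed with "dataset.<name>.", so it applies only to that one dataset.

// Requires: import java.util.Collections;
Map<String, String> scoped = RuntimeArguments.addScope(Scope.DATASET, TIME_PARTITIONED,
                                                       Collections.singletonMap("some.key", "v"));
// scoped now holds a single entry: "dataset." + TIME_PARTITIONED + ".some.key" -> "v"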