@Override
public int hashCode() {
  return Objects.hash(super.hashCode(), inputFields, outputFields);
}
}
@Override
public boolean equals(Object o) {
  if (this == o) {
    return true;
  }
  if (o == null || getClass() != o.getClass()) {
    return false;
  }
  if (!super.equals(o)) {
    return false;
  }
  FieldTransformOperation that = (FieldTransformOperation) o;
  return Objects.equals(inputFields, that.inputFields)
    && Objects.equals(outputFields, that.outputFields);
}
for (FieldOperation fieldOperation : fieldOperations) {
  Operation newOperation = null;
  String newOperationName = prefixedOperationName(stageName, fieldOperation.getName());
  Set<String> currentOperationOutputs = new LinkedHashSet<>();
  switch (fieldOperation.getType()) {
    case READ:
      FieldReadOperation read = (FieldReadOperation) fieldOperation;
      newOperation = new ReadOperation(newOperationName, read.getDescription(), read.getSource(),
                                       read.getOutputFields());
      currentOperationOutputs.addAll(read.getOutputFields());
      break;
    case TRANSFORM:
      FieldTransformOperation transform = (FieldTransformOperation) fieldOperation;
      List<InputField> inputFields = createInputFields(transform.getInputFields(), stageName, processedOperations);
      newOperation = new TransformOperation(newOperationName, transform.getDescription(), inputFields,
                                            transform.getOutputFields());
      currentOperationOutputs.addAll(transform.getOutputFields());
      break;
    case WRITE:
      FieldWriteOperation write = (FieldWriteOperation) fieldOperation;
      inputFields = createInputFields(write.getInputFields(), stageName, processedOperations);
      newOperation = new WriteOperation(newOperationName, write.getDescription(), write.getSink(), inputFields);
      break;
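// The helper methods used above (prefixedOperationName, createInputFields) are not part of this
// excerpt. As an illustration only, a prefixing helper might look like the sketch below; the
// "<stage>.<operation>" naming is an assumption, not a confirmed convention of the platform.
private static String prefixedOperationName(String stageName, String operationName) {
  return stageName + "." + operationName;
}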
switch (pipelineOperation.getType()) {
  case READ:
    FieldReadOperation read = (FieldReadOperation) pipelineOperation;
    updateInvalidOutputs(Collections.emptyList(), unusedOutputs, redundantOutputs);
    validInputsSoFar.addAll(read.getOutputFields());
    for (String field : read.getOutputFields()) {
      List<String> origins = unusedOutputs.computeIfAbsent(field, k -> new ArrayList<>());
      origins.add(pipelineOperation.getName());
    }
    break;
  case TRANSFORM:
    FieldTransformOperation transform = (FieldTransformOperation) pipelineOperation;
    validateInputs(pipelineOperation.getName(), transform.getInputFields(), validInputsSoFar);
    updateInvalidOutputs(transform.getInputFields(), unusedOutputs, redundantOutputs);
    validInputsSoFar.addAll(transform.getOutputFields());
    for (String field : transform.getOutputFields()) {
      List<String> origins = unusedOutputs.computeIfAbsent(field, k -> new ArrayList<>());
      origins.add(pipelineOperation.getName());
    }
    break;
  case WRITE:
    FieldWriteOperation write = (FieldWriteOperation) pipelineOperation;
    validateInputs(pipelineOperation.getName(), write.getInputFields(), validInputsSoFar);
    updateInvalidOutputs(write.getInputFields(), unusedOutputs, redundantOutputs);
    break;
}
FieldOperation joinOperation = new FieldTransformOperation("Join", JOIN_OPERATION_DESCRIPTION, joinInputs,
                                                           new ArrayList<>(joinOutputs));
operations.add(joinOperation);

FieldOperation identity = new FieldTransformOperation(operationName, IDENTITY_OPERATION_DESCRIPTION,
                                                      Collections.singletonList(stagedInputField),
                                                      outputFieldInfo.name);

FieldOperation transform = new FieldTransformOperation(operationName, RENAME_OPERATION_DESCRIPTION,
                                                       Collections.singletonList(stagedInputField),
                                                       outputFieldInfo.name);
@Override
public void prepareRun(BatchSourceContext context) throws DatasetManagementException {
  super.prepareRun(context);
  Schema schema = tableConfig.getSchema();
  if (schema != null && schema.getFields() != null) {
    FieldOperation operation = new FieldReadOperation("Read", "Read from Table dataset",
                                                      EndPoint.of(context.getNamespace(), tableConfig.getName()),
                                                      schema.getFields().stream().map(Schema.Field::getName)
                                                        .collect(Collectors.toList()));
    context.record(Collections.singletonList(operation));
  }
}
@Override
public void prepareRun(BatchSinkContext context) throws DatasetManagementException {
  super.prepareRun(context);
  String schemaString = tableSinkConfig.getSchemaStr();
  if (schemaString != null) {
    try {
      Schema schema = Schema.parseJson(schemaString);
      if (schema.getFields() != null) {
        FieldOperation operation = new FieldWriteOperation("Write", "Wrote to CDAP Table",
                                                           EndPoint.of(context.getNamespace(), tableSinkConfig.getName()),
                                                           schema.getFields().stream().map(Schema.Field::getName)
                                                             .collect(Collectors.toList()));
        context.record(Collections.singletonList(operation));
      }
    } catch (IOException e) {
      throw new IllegalStateException("Failed to parse schema.", e);
    }
  }
}
@Override
public void prepareRun(StageSubmitterContext context) throws Exception {
  super.prepareRun(context);
  List<String> inputFields = new ArrayList<>();
  List<String> outputFields = new ArrayList<>();
  Schema inputSchema = context.getInputSchema();
  if (SchemaValidator.canRecordLineage(inputSchema, "input")) {
    //noinspection ConstantConditions
    inputFields = inputSchema.getFields().stream().map(Schema.Field::getName).collect(Collectors.toList());
  }
  Schema outputSchema = context.getOutputSchema();
  if (SchemaValidator.canRecordLineage(outputSchema, "output")) {
    //noinspection ConstantConditions
    outputFields = outputSchema.getFields().stream().map(Schema.Field::getName).collect(Collectors.toList());
  }
  FieldOperation dataPrepOperation = new FieldTransformOperation("Python", config.script, inputFields, outputFields);
  context.record(Collections.singletonList(dataPrepOperation));
}
@Override
public int hashCode() {
  return Objects.hash(super.hashCode(), source, outputFields);
}
}
@Override
public boolean equals(Object o) {
  if (this == o) {
    return true;
  }
  if (o == null || getClass() != o.getClass()) {
    return false;
  }
  if (!super.equals(o)) {
    return false;
  }
  FieldWriteOperation that = (FieldWriteOperation) o;
  return Objects.equals(inputFields, that.inputFields)
    && Objects.equals(sink, that.sink);
}
@Override
public void prepareRun(BatchSourceContext context) throws Exception {
  InputFormatProvider inputFormatProvider = context.newPluginInstance(FORMAT_PLUGIN_ID);
  DatasetProperties datasetProperties = createProperties(inputFormatProvider);

  // Dataset must still be created if macros provided at configure time
  if (!context.datasetExists(config.getName())) {
    context.createDataset(config.getName(), PartitionedFileSet.class.getName(), datasetProperties);
  }

  PartitionedFileSet partitionedFileSet = context.getDataset(config.getName());
  SnapshotFileSet snapshotFileSet = new SnapshotFileSet(partitionedFileSet);

  Map<String, String> arguments = new HashMap<>(datasetProperties.getProperties());
  if (config.getFileProperties() != null) {
    arguments = GSON.fromJson(config.getFileProperties(), MAP_TYPE);
  }

  Schema schema = config.getSchema();
  if (schema.getFields() != null) {
    String formatName = getInputFormatName();
    FieldOperation operation = new FieldReadOperation("Read",
                                                      String.format("Read from SnapshotFile source in %s format.", formatName),
                                                      EndPoint.of(context.getNamespace(), config.getName()),
                                                      schema.getFields().stream().map(Schema.Field::getName).collect(Collectors.toList()));
    context.record(Collections.singletonList(operation));
  }

  context.setInput(Input.ofDataset(config.getName(), snapshotFileSet.getInputArguments(arguments)));
}
if (schema.getFields() != null) {
  FieldOperation operation = new FieldWriteOperation("Write", "Wrote to TPFS dataset",
                                                     EndPoint.of(context.getNamespace(), tpfsSinkConfig.name),
                                                     schema.getFields().stream().map(Schema.Field::getName)
                                                       .collect(Collectors.toList()));
  context.record(Collections.singletonList(operation));
}
@Override
public void prepareRun(StageSubmitterContext context) throws Exception {
  super.prepareRun(context);
  List<String> inputFields = new ArrayList<>();
  List<String> outputFields = new ArrayList<>();
  Schema inputSchema = context.getInputSchema();
  if (SchemaValidator.canRecordLineage(inputSchema, "input")) {
    //noinspection ConstantConditions
    inputFields = inputSchema.getFields().stream().map(Schema.Field::getName).collect(Collectors.toList());
  }
  Schema outputSchema = context.getOutputSchema();
  if (SchemaValidator.canRecordLineage(outputSchema, "output")) {
    //noinspection ConstantConditions
    outputFields = outputSchema.getFields().stream().map(Schema.Field::getName).collect(Collectors.toList());
  }
  FieldOperation dataPrepOperation = new FieldTransformOperation("JavaScript", config.script, inputFields, outputFields);
  context.record(Collections.singletonList(dataPrepOperation));
}
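// Minimal sketch (not from this codebase): the same record-at-prepareRun pattern applied to a
// hypothetical custom transform plugin. The operation name, description, and field names below
// are invented for illustration; only FieldTransformOperation and context.record(...) mirror the
// API usage shown in the excerpts above.
@Override
public void prepareRun(StageSubmitterContext context) throws Exception {
  super.prepareRun(context);
  FieldOperation rename = new FieldTransformOperation("Rename ts", "Renamed 'ts' to 'timestamp'.",
                                                      Collections.singletonList("ts"), "timestamp");
  context.record(Collections.singletonList(rename));
}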
@Override
public int hashCode() {
  return Objects.hash(super.hashCode(), inputFields, sink);
}
}
@Override
public boolean equals(Object o) {
  if (this == o) {
    return true;
  }
  if (o == null || getClass() != o.getClass()) {
    return false;
  }
  if (!super.equals(o)) {
    return false;
  }
  FieldReadOperation that = (FieldReadOperation) o;
  return Objects.equals(source, that.source)
    && Objects.equals(outputFields, that.outputFields);
}
@Override
public void prepareRun(BatchSourceContext context) throws DatasetManagementException, InstantiationException {
  config.validate();
  InputFormatProvider inputFormatProvider = context.newPluginInstance(FORMAT_PLUGIN_ID);
  DatasetProperties datasetProperties = createProperties(inputFormatProvider);

  // If macros provided at runtime, dataset still needs to be created
  if (!context.datasetExists(config.getName())) {
    String tpfsName = config.getName();
    context.createDataset(tpfsName, TimePartitionedFileSet.class.getName(), datasetProperties);
  }

  Schema schema = config.getSchema();
  if (schema.getFields() != null) {
    String formatName = getInputFormatName();
    FieldOperation operation = new FieldReadOperation("Read",
                                                      String.format("Read from TimePartitionedFileSet in %s format.", formatName),
                                                      EndPoint.of(context.getNamespace(), config.getName()),
                                                      schema.getFields().stream().map(Schema.Field::getName).collect(Collectors.toList()));
    context.record(Collections.singletonList(operation));
  }

  long duration = TimeParser.parseDuration(config.getDuration());
  long delay = Strings.isNullOrEmpty(config.getDelay()) ? 0 : TimeParser.parseDuration(config.getDelay());
  long endTime = context.getLogicalStartTime() - delay;
  long startTime = endTime - duration;

  Map<String, String> sourceArgs = Maps.newHashMap(datasetProperties.getProperties());
  TimePartitionedFileSetArguments.setInputStartTime(sourceArgs, startTime);
  TimePartitionedFileSetArguments.setInputEndTime(sourceArgs, endTime);
  context.setInput(Input.ofDataset(config.getName(), sourceArgs));
}
@Override
public void prepareRun(BatchAggregatorContext context) throws Exception {
  super.prepareRun(context);
  LinkedList<FieldOperation> fllOperations = new LinkedList<>();
  // in configurePipeline all the necessary checks have already been performed to set the output schema
  if (SchemaValidator.canRecordLineage(context.getOutputSchema(), "output")) {
    Schema inputSchema = context.getInputSchema();
    // for every function, record the field-level operation details
    for (GroupByConfig.FunctionInfo functionInfo : conf.getAggregates()) {
      Schema.Field outputSchemaField = getOutputSchemaField(functionInfo, inputSchema);
      String operationName = String.format("Group %s", functionInfo.getField());
      String description = String.format("Aggregate function applied: '%s'.", functionInfo.getFunction());
      FieldOperation operation = new FieldTransformOperation(operationName, description,
                                                             Collections.singletonList(functionInfo.getField()),
                                                             outputSchemaField.getName());
      fllOperations.add(operation);
    }
  }
  context.record(fllOperations);
}