private void addMergeOperation(Set<String> stageInputs, Map<String, Operation> processedOperations) { Set<String> sortedInputs = new TreeSet<>(stageInputs); String mergeOperationName = prefixedOperationName(Joiner.on(SEPARATOR).join(sortedInputs), "merge"); String mergeDescription = "Merged stages: " + Joiner.on(",").join(sortedInputs); if (processedOperations.containsKey(mergeOperationName)) { // it is possible that same stages act as an input to multiple stages. // we should still only add single merge operation for them return; } List<InputField> inputFields = new ArrayList<>(); for (String inputStage : sortedInputs) { List<String> parentStages = findParentStages(inputStage); for (String parentStage : parentStages) { Map<String, String> fieldOrigins = stageOutputsWithOrigins.get(parentStage); for (Map.Entry<String, String> fieldOrigin : fieldOrigins.entrySet()) { inputFields.add(InputField.of(fieldOrigin.getValue(), fieldOrigin.getKey())); } } } Set<String> outputs = new LinkedHashSet<>(); for (InputField inputField : inputFields) { outputs.add(inputField.getName()); } TransformOperation merge = new TransformOperation(mergeOperationName, mergeDescription, inputFields, new ArrayList<>(outputs)); processedOperations.put(merge.getName(), merge); }
/** * Checks whether the given field is used in the next operations or not * * @param nextOperation the next operation which should either be a {@link TransformOperation} or {@link * WriteOperation} * @param inputField the field whose usage needs to be checked * @return true if the field is used in the nextOperation */ private boolean containsInputField(Operation nextOperation, InputField inputField) { Set<InputField> inputFields = new HashSet<>(); if (OperationType.WRITE == nextOperation.getType()) { WriteOperation nextWrite = (WriteOperation) nextOperation; inputFields = new HashSet<>(nextWrite.getInputs()); } else if (OperationType.TRANSFORM == nextOperation.getType()) { TransformOperation nextTransform = (TransformOperation) nextOperation; inputFields = new HashSet<>(nextTransform.getInputs()); } // if the next operation inputFields does contains the given fieldName return true return inputFields.contains(inputField); }
// Look up the merge operation previously generated for this exact set of stage inputs.
String mergeOperationName = mergeOperationName(stageInputs);
Operation operation = processedOperations.get(mergeOperationName);
// NOTE(review): assumes the merge operation exists and is a TransformOperation — a missing
// or differently-typed entry would throw NPE/CCE here; confirm the caller always adds it first
List<String> outputs = ((TransformOperation) operation).getOutputs();
for (String field : fields) {
/**
 * Verifies that topological sorting rejects an operation that lists itself as the origin of
 * one of its own inputs.
 */
@Test(expected = IllegalArgumentException.class)
public void testSelfReferentialOperations() {
  List<InputField> selfReferencingInputs =
    Arrays.asList(InputField.of("read", "body"), InputField.of("parse", "name"));
  TransformOperation selfReferencing =
    new TransformOperation("parse", "parse", selfReferencingInputs, "name", "address");
  FieldLineageInfo.getTopologicallySortedOperations(Collections.singleton(selfReferencing));
}
/**
 * Converts an {@link Operation} into its {@link FieldOperationInfo} representation.
 *
 * @param operation the operation to convert; must be a READ, TRANSFORM, or WRITE operation
 * @return the {@link FieldOperationInfo} carrying the operation's name, description, inputs
 *   and outputs
 * @throws IllegalStateException if the operation type is not READ, TRANSFORM, or WRITE
 */
private FieldOperationInfo convertToFieldOperationInfo(Operation operation) {
  FieldOperationInput inputs;
  FieldOperationOutput outputs;
  switch (operation.getType()) {
    case READ:
      ReadOperation read = (ReadOperation) operation;
      inputs = FieldOperationInput.of(read.getSource());
      outputs = FieldOperationOutput.of(read.getOutputs());
      break;
    case TRANSFORM:
      TransformOperation transform = (TransformOperation) operation;
      inputs = FieldOperationInput.of(transform.getInputs());
      outputs = FieldOperationOutput.of(transform.getOutputs());
      break;
    case WRITE:
      WriteOperation write = (WriteOperation) operation;
      inputs = FieldOperationInput.of(write.getInputs());
      outputs = FieldOperationOutput.of(write.getDestination());
      break;
    default:
      // fail fast rather than silently constructing an info with null inputs/outputs
      throw new IllegalStateException("Unknown operation type: " + operation.getType());
  }
  return new FieldOperationInfo(operation.getName(), operation.getDescription(), inputs, outputs);
}
/**
 * Helper method to compute the outgoing connections by walking the operation graph forward
 * from {@code currentOperation} until WRITE operations (the sinks) are reached.
 *
 * @param currentOperation current operation which needs to evaluated
 * @param visitedOperations a {@link Set} containing all the operations which has been processed so
 * far; doubles as the result accumulator and guards against revisiting shared sub-graphs
 */
private void computeOutgoing(Operation currentOperation, Set<Operation> visitedOperations) {
  // mark this operation if not already done; add() returning false means it was seen before
  if (!visitedOperations.add(currentOperation)) {
    return;
  }
  // base condition: if the current operation is write we have reached the end
  if (currentOperation.getType() == OperationType.WRITE) {
    return;
  }
  // if this is a transform operation then traverse down to all the outgoing operation from this operation
  // expanding further the traversal and exploring the operations
  if (currentOperation.getType() == OperationType.TRANSFORM) {
    TransformOperation transform = (TransformOperation) currentOperation;
    // NOTE(review): assumes every transform has an entry in operationOutgoingConnections;
    // a missing entry would make the loop below throw an NPE — confirm construction invariant
    Set<Operation> operations = operationOutgoingConnections.get(transform.getName());
    for (Operation operation : operations) {
      computeOutgoing(operation, visitedOperations);
    }
  }
}
/**
 * Verifies that topological sorting detects the cycle parse -> normalize -> parse and throws.
 */
@Test(expected = IllegalArgumentException.class)
public void testCycle() {
  EndPoint source = EndPoint.of("ns", "file1");
  EndPoint destination = EndPoint.of("ns", "file2");
  ReadOperation read = new ReadOperation("read", "read", source, "offset", "body");
  TransformOperation parse = new TransformOperation(
    "parse", "parse",
    Arrays.asList(InputField.of("read", "body"), InputField.of("normalize", "name")),
    "name", "address");
  TransformOperation normalize = new TransformOperation(
    "normalize", "normalize",
    Collections.singletonList(InputField.of("parse", "name")),
    "name");
  WriteOperation write = new WriteOperation(
    "write", "writing to another file", destination,
    Arrays.asList(InputField.of("normalize", "name"), InputField.of("parse", "address")));
  FieldLineageInfo.getTopologicallySortedOperations(
    new HashSet<>(Arrays.asList(parse, read, normalize, write)));
}
/**
 * Converts an {@link Operation} into its {@link FieldOperationInfo} representation.
 *
 * @param operation the operation to convert; must be a READ, TRANSFORM, or WRITE operation
 * @return the {@link FieldOperationInfo} carrying the operation's name, description, inputs
 *   and outputs
 * @throws IllegalStateException if the operation type is not READ, TRANSFORM, or WRITE
 */
private FieldOperationInfo convertToFieldOperationInfo(Operation operation) {
  FieldOperationInput inputs;
  FieldOperationOutput outputs;
  switch (operation.getType()) {
    case READ:
      ReadOperation read = (ReadOperation) operation;
      inputs = FieldOperationInput.of(read.getSource());
      outputs = FieldOperationOutput.of(read.getOutputs());
      break;
    case TRANSFORM:
      TransformOperation transform = (TransformOperation) operation;
      inputs = FieldOperationInput.of(transform.getInputs());
      outputs = FieldOperationOutput.of(transform.getOutputs());
      break;
    case WRITE:
      WriteOperation write = (WriteOperation) operation;
      inputs = FieldOperationInput.of(write.getInputs());
      outputs = FieldOperationOutput.of(write.getDestination());
      break;
    default:
      // fail fast rather than silently constructing an info with null inputs/outputs
      throw new IllegalStateException("Unknown operation type: " + operation.getType());
  }
  return new FieldOperationInfo(operation.getName(), operation.getDescription(), inputs, outputs);
}
/**
 * Helper method to compute the outgoing connections by walking the operation graph forward
 * from {@code currentOperation} until WRITE operations (the sinks) are reached.
 *
 * @param currentOperation current operation which needs to evaluated
 * @param visitedOperations a {@link Set} containing all the operations which has been processed so
 * far; doubles as the result accumulator and guards against revisiting shared sub-graphs
 */
private void computeOutgoing(Operation currentOperation, Set<Operation> visitedOperations) {
  // mark this operation if not already done; add() returning false means it was seen before
  if (!visitedOperations.add(currentOperation)) {
    return;
  }
  // base condition: if the current operation is write we have reached the end
  if (currentOperation.getType() == OperationType.WRITE) {
    return;
  }
  // if this is a transform operation then traverse down to all the outgoing operation from this operation
  // expanding further the traversal and exploring the operations
  if (currentOperation.getType() == OperationType.TRANSFORM) {
    TransformOperation transform = (TransformOperation) currentOperation;
    // NOTE(review): assumes every transform has an entry in operationOutgoingConnections;
    // a missing entry would make the loop below throw an NPE — confirm construction invariant
    Set<Operation> operations = operationOutgoingConnections.get(transform.getName());
    for (Operation operation : operations) {
      computeOutgoing(operation, visitedOperations);
    }
  }
}
// "parse" splits the raw body produced by the "read" operation into first and last name fields
TransformOperation parse = new TransformOperation("parse", "parsing body",
  Collections.singletonList(InputField.of("read", "body")), "first_name", "last_name");
// "concat" then joins the two parsed name fields back into a single "name" field
TransformOperation concat = new TransformOperation("concat", "concatinating the fields",
  Arrays.asList(InputField.of("parse", "first_name"), InputField.of("parse", "last_name")), "name");
/** * Checks whether the given field is used in the next operations or not * * @param nextOperation the next operation which should either be a {@link TransformOperation} or {@link * WriteOperation} * @param inputField the field whose usage needs to be checked * @return true if the field is used in the nextOperation */ private boolean containsInputField(Operation nextOperation, InputField inputField) { Set<InputField> inputFields = new HashSet<>(); if (OperationType.WRITE == nextOperation.getType()) { WriteOperation nextWrite = (WriteOperation) nextOperation; inputFields = new HashSet<>(nextWrite.getInputs()); } else if (OperationType.TRANSFORM == nextOperation.getType()) { TransformOperation nextTransform = (TransformOperation) nextOperation; inputFields = new HashSet<>(nextTransform.getInputs()); } // if the next operation inputFields does contains the given fieldName return true return inputFields.contains(inputField); }
// Rewrite the stage-local input fields so they reference the renamed (stage-prefixed) origins
List<InputField> inputFields = createInputFields(transform.getInputFields(), stageName, processedOperations);
newOperation = new TransformOperation(newOperationName, transform.getDescription(), inputFields,
  transform.getOutputFields());
// track this transform's outputs so downstream stages can resolve their origins against them
currentOperationOutputs.addAll(transform.getOutputFields());
case TRANSFORM:
  TransformOperation transform = (TransformOperation) operation;
  // collect the distinct origin operation names referenced by this transform's inputs
  Set<String> origins = transform.getInputs().stream().map(InputField::getOrigin).collect(Collectors.toSet());
/**
 * Verifies that cycle detection still throws when the operations also reference origins
 * ("nop1", "nop2", "nop3") that do not exist in the supplied operation set.
 */
@Test(expected = IllegalArgumentException.class)
public void testCycleWithNonExistentOperationNames() {
  EndPoint source = EndPoint.of("ns", "file1");
  EndPoint destination = EndPoint.of("ns", "file2");
  ReadOperation read = new ReadOperation("read", "read", source, "offset", "body");
  TransformOperation parse = new TransformOperation(
    "parse", "parse",
    Arrays.asList(InputField.of("read", "body"), InputField.of("normalize", "name"),
                  InputField.of("nop1", "field1")),
    "name", "address");
  TransformOperation normalize = new TransformOperation(
    "normalize", "normalize",
    Arrays.asList(InputField.of("parse", "name"), InputField.of("nop2", "field2")),
    "name");
  WriteOperation write = new WriteOperation(
    "write", "writing to another file", destination,
    Arrays.asList(InputField.of("normalize", "name"), InputField.of("parse", "address"),
                  InputField.of("nop3", "field3")));
  FieldLineageInfo.getTopologicallySortedOperations(
    new HashSet<>(Arrays.asList(parse, read, normalize, write)));
}
case TRANSFORM:
  TransformOperation transform = (TransformOperation) operation;
  // collect the distinct origin operation names referenced by this transform's inputs
  Set<String> origins = transform.getInputs().stream().map(InputField::getOrigin).collect(Collectors.toSet());
// "parse" derives name and address fields from the raw body produced by the "read" operation
TransformOperation parse = new TransformOperation("parse", "parse descr",
  Collections.singletonList(InputField.of("read", "body")), "name", "address");
// "normalize" rewrites the address field produced by "parse"
TransformOperation normalize = new TransformOperation("normalize", "normalize descr",
  Collections.singletonList(InputField.of("parse", "address")), "address");
/**
 * Recursively traverse the graph backwards to calculate the incoming operations, i.e. every
 * operation reachable by following input-field origins until READ operations are hit.
 *
 * @param currentOperation the current operation from which the graph needs to explored
 * @param visitedOperations all the operations visited so far; doubles as the result
 * accumulator and prevents re-exploring shared sub-graphs
 */
private void getIncomingOperationsForFieldHelper(Operation currentOperation, Set<Operation> visitedOperations) {
  // add() returns false when the operation was already visited — stop exploring that branch
  if (!visitedOperations.add(currentOperation)) {
    return;
  }
  // reached the end of backward traversal
  if (currentOperation.getType() == OperationType.READ) {
    return;
  }
  // for transform we traverse backward in graph further through the inputs of the transform
  if (currentOperation.getType() == OperationType.TRANSFORM) {
    TransformOperation transform = (TransformOperation) currentOperation;
    for (InputField field : transform.getInputs()) {
      // NOTE(review): assumes every input origin has an entry in operationsMap; a missing
      // origin would pass null into the recursive call and NPE — confirm map completeness
      getIncomingOperationsForFieldHelper(operationsMap.get(field.getOrigin()), visitedOperations);
    }
  }
}
// "parse" derives name and address fields from the raw body produced by the "read" operation
TransformOperation parse = new TransformOperation("parse", "parse descr",
  Collections.singletonList(InputField.of("read", "body")), "name", "address");
// "normalize" rewrites the address field produced by "parse"
TransformOperation normalize = new TransformOperation("normalize", "normalize descr",
  Collections.singletonList(InputField.of("parse", "address")), "address");
/**
 * Recursively traverse the graph backwards to calculate the incoming operations, i.e. every
 * operation reachable by following input-field origins until READ operations are hit.
 *
 * @param currentOperation the current operation from which the graph needs to explored
 * @param visitedOperations all the operations visited so far; doubles as the result
 * accumulator and prevents re-exploring shared sub-graphs
 */
private void getIncomingOperationsForFieldHelper(Operation currentOperation, Set<Operation> visitedOperations) {
  // add() returns false when the operation was already visited — stop exploring that branch
  if (!visitedOperations.add(currentOperation)) {
    return;
  }
  // reached the end of backward traversal
  if (currentOperation.getType() == OperationType.READ) {
    return;
  }
  // for transform we traverse backward in graph further through the inputs of the transform
  if (currentOperation.getType() == OperationType.TRANSFORM) {
    TransformOperation transform = (TransformOperation) currentOperation;
    for (InputField field : transform.getInputs()) {
      // NOTE(review): assumes every input origin has an entry in operationsMap; a missing
      // origin would pass null into the recursive call and NPE — confirm map completeness
      getIncomingOperationsForFieldHelper(operationsMap.get(field.getOrigin()), visitedOperations);
    }
  }
}
@Test
public void testValidOperations() {
  // source read produces the "offset" and "body" fields from endpoint1
  ReadOperation read = new ReadOperation("read", "some read", EndPoint.of("endpoint1"), "offset", "body");
  // "parse" derives name and address from the body field produced by the read
  TransformOperation parse = new TransformOperation("parse", "parse body",
    Collections.singletonList(InputField.of("read", "body")), "name", "address");