/**
 * Builds the incoming summary for every written field: each destination endpoint field
 * is mapped to the set of endpoint fields it was derived from, discovered by walking
 * the operation graph backwards from each write.
 *
 * @return map from destination endpoint field to its originating endpoint fields
 */
private Map<EndPointField, Set<EndPointField>> computeIncomingSummary() {
  if (writeOperations == null) {
    // Lazily derive the write operations (and validate the graph) on first use.
    computeAndValidateFieldLineageInfo(this.operations);
  }
  Map<EndPointField, Set<EndPointField>> summary = new HashMap<>();
  for (WriteOperation writeOperation : writeOperations) {
    for (InputField inputField : writeOperation.getInputs()) {
      EndPointField destinationField =
        new EndPointField(writeOperation.getDestination(), inputField.getName());
      // Trace each written field back through the operation that produced it.
      computeIncomingSummaryHelper(destinationField, operationsMap.get(inputField.getOrigin()),
                                   writeOperation, summary);
    }
  }
  return summary;
}
/**
 * Creates an instance of an input field.
 *
 * @param origin the name of the operation which created this input field
 * @param name the name associated with the input field
 * @return the {@link InputField}
 */
public static InputField of(String origin, String name) { return new InputField(origin, name); }
private void addMergeOperation(Set<String> stageInputs, Map<String, Operation> processedOperations) { Set<String> sortedInputs = new TreeSet<>(stageInputs); String mergeOperationName = prefixedOperationName(Joiner.on(SEPARATOR).join(sortedInputs), "merge"); String mergeDescription = "Merged stages: " + Joiner.on(",").join(sortedInputs); if (processedOperations.containsKey(mergeOperationName)) { // it is possible that same stages act as an input to multiple stages. // we should still only add single merge operation for them return; } List<InputField> inputFields = new ArrayList<>(); for (String inputStage : sortedInputs) { List<String> parentStages = findParentStages(inputStage); for (String parentStage : parentStages) { Map<String, String> fieldOrigins = stageOutputsWithOrigins.get(parentStage); for (Map.Entry<String, String> fieldOrigin : fieldOrigins.entrySet()) { inputFields.add(InputField.of(fieldOrigin.getValue(), fieldOrigin.getKey())); } } } Set<String> outputs = new LinkedHashSet<>(); for (InputField inputField : inputFields) { outputs.add(inputField.getName()); } TransformOperation merge = new TransformOperation(mergeOperationName, mergeDescription, inputFields, new ArrayList<>(outputs)); processedOperations.put(merge.getName(), merge); }
/**
 * Collects, for each destination endpoint, the set of field names written to it.
 *
 * @return map from destination endpoint to the names of the fields written there
 */
private Map<EndPoint, Set<String>> computeDestinationFields() {
  if (writeOperations == null) {
    // Write operations are derived lazily; compute and validate them first.
    computeAndValidateFieldLineageInfo(this.operations);
  }
  Map<EndPoint, Set<String>> destinationFields = new HashMap<>();
  for (WriteOperation writeOperation : this.writeOperations) {
    Set<String> fieldNames =
      destinationFields.computeIfAbsent(writeOperation.getDestination(), destination -> new HashSet<>());
    for (InputField inputField : writeOperation.getInputs()) {
      fieldNames.add(inputField.getName());
    }
  }
  return destinationFields;
}
/** * Recursively traverse the graph to calculate the incoming operation. * * @param currentOperation the current operation from which the graph needs to explored * @param visitedOperations all the operations visited so far */ private void getIncomingOperationsForFieldHelper(Operation currentOperation, Set<Operation> visitedOperations) { if (!visitedOperations.add(currentOperation)) { return; } // reached the end of backward traversal if (currentOperation.getType() == OperationType.READ) { return; } // for transform we traverse backward in graph further through the inputs of the transform if (currentOperation.getType() == OperationType.TRANSFORM) { TransformOperation transform = (TransformOperation) currentOperation; for (InputField field : transform.getInputs()) { getIncomingOperationsForFieldHelper(operationsMap.get(field.getOrigin()), visitedOperations); } } }
/**
 * Computes the set of field names written to each destination endpoint.
 *
 * @return map from destination endpoint to written field names
 */
private Map<EndPoint, Set<String>> computeDestinationFields() {
  if (writeOperations == null) {
    // Derive and validate the write operations before summarizing them.
    computeAndValidateFieldLineageInfo(this.operations);
  }
  Map<EndPoint, Set<String>> destinationFields = new HashMap<>();
  for (WriteOperation write : this.writeOperations) {
    EndPoint destination = write.getDestination();
    Set<String> names = destinationFields.get(destination);
    if (names == null) {
      names = new HashSet<>();
      destinationFields.put(destination, names);
    }
    for (InputField input : write.getInputs()) {
      names.add(input.getName());
    }
  }
  return destinationFields;
}
// A transform that lists itself as an origin forms a trivial cycle; topological
// sorting must reject it with IllegalArgumentException.
@Test(expected = IllegalArgumentException.class)
public void testSelfReferentialOperations() {
  List<InputField> parseInputs =
    Arrays.asList(InputField.of("read", "body"), InputField.of("parse", "name"));
  TransformOperation parse =
    new TransformOperation("parse", "parse", parseInputs, "name", "address");
  FieldLineageInfo.getTopologicallySortedOperations(Collections.singleton(parse));
}
/**
 * Computes the incoming summary: maps every field written to a destination to the
 * set of source endpoint fields it originated from.
 *
 * @return map from written endpoint field to the endpoint fields it derives from
 */
private Map<EndPointField, Set<EndPointField>> computeIncomingSummary() {
  if (writeOperations == null) {
    // Write operations have not been derived yet; compute and validate them now.
    computeAndValidateFieldLineageInfo(this.operations);
  }
  Map<EndPointField, Set<EndPointField>> summary = new HashMap<>();
  for (WriteOperation write : writeOperations) {
    EndPoint destination = write.getDestination();
    List<InputField> inputs = write.getInputs();
    for (InputField input : inputs) {
      EndPointField written = new EndPointField(destination, input.getName());
      // Walk backwards from the operation that produced this input field.
      computeIncomingSummaryHelper(written, operationsMap.get(input.getOrigin()), write, summary);
    }
  }
  return summary;
}
/** * Recursively traverse the graph to calculate the incoming operation. * * @param currentOperation the current operation from which the graph needs to explored * @param visitedOperations all the operations visited so far */ private void getIncomingOperationsForFieldHelper(Operation currentOperation, Set<Operation> visitedOperations) { if (!visitedOperations.add(currentOperation)) { return; } // reached the end of backward traversal if (currentOperation.getType() == OperationType.READ) { return; } // for transform we traverse backward in graph further through the inputs of the transform if (currentOperation.getType() == OperationType.TRANSFORM) { TransformOperation transform = (TransformOperation) currentOperation; for (InputField field : transform.getInputs()) { getIncomingOperationsForFieldHelper(operationsMap.get(field.getOrigin()), visitedOperations); } } }
// The parse <-> normalize cycle must be detected even when operations also
// reference origins ("nop1"/"nop2"/"nop3") that do not exist in the set.
@Test(expected = IllegalArgumentException.class)
public void testCycleWithNonExistentOperationNames() {
  EndPoint readEndPoint = EndPoint.of("ns", "file1");
  EndPoint writeEndPoint = EndPoint.of("ns", "file2");

  ReadOperation read = new ReadOperation("read", "read", readEndPoint, "offset", "body");
  TransformOperation parse =
    new TransformOperation("parse", "parse",
                           Arrays.asList(InputField.of("read", "body"),
                                         InputField.of("normalize", "name"),
                                         InputField.of("nop1", "field1")),
                           "name", "address");
  TransformOperation normalize =
    new TransformOperation("normalize", "normalize",
                           Arrays.asList(InputField.of("parse", "name"),
                                         InputField.of("nop2", "field2")),
                           "name");
  WriteOperation write =
    new WriteOperation("write", "writing to another file", writeEndPoint,
                       Arrays.asList(InputField.of("normalize", "name"),
                                     InputField.of("parse", "address"),
                                     InputField.of("nop3", "field3")));

  List<Operation> operations = new ArrayList<>();
  operations.add(parse);
  operations.add(read);
  operations.add(normalize);
  operations.add(write);
  FieldLineageInfo.getTopologicallySortedOperations(new HashSet<>(operations));
}
// parse depends on normalize and normalize depends on parse, so the operation
// graph contains a cycle and topological sorting must throw.
@Test(expected = IllegalArgumentException.class)
public void testCycle() {
  EndPoint readEndPoint = EndPoint.of("ns", "file1");
  EndPoint writeEndPoint = EndPoint.of("ns", "file2");

  ReadOperation read = new ReadOperation("read", "read", readEndPoint, "offset", "body");
  TransformOperation parse =
    new TransformOperation("parse", "parse",
                           Arrays.asList(InputField.of("read", "body"),
                                         InputField.of("normalize", "name")),
                           "name", "address");
  TransformOperation normalize =
    new TransformOperation("normalize", "normalize",
                           Collections.singletonList(InputField.of("parse", "name")),
                           "name");
  WriteOperation write =
    new WriteOperation("write", "writing to another file", writeEndPoint,
                       Arrays.asList(InputField.of("normalize", "name"),
                                     InputField.of("parse", "address")));

  List<Operation> operations = new ArrayList<>();
  operations.add(parse);
  operations.add(read);
  operations.add(normalize);
  operations.add(write);
  FieldLineageInfo.getTopologicallySortedOperations(new HashSet<>(operations));
}
Collections.singletonList(InputField.of("read", "body")), "first_name", "last_name"); Arrays.asList(InputField.of("parse", "first_name"), InputField.of("parse", "last_name")), "name"); writeInput.add(InputField.of("read", "offset")); writeInput.add(InputField.of("concat", "name")); writeInput.add(InputField.of("read", "file_name"));
EndPoint source = read.getSource(); for (InputField inputField : inputFields) { if (inputField.getOrigin().equals(currentOperation.getName())) { sourceEndPointFields.add(new EndPointField(source, inputField.getName())); TransformOperation transform = (TransformOperation) currentOperation; for (InputField inputField : transform.getInputs()) { computeIncomingSummaryHelper(field, operationsMap.get(inputField.getOrigin()), currentOperation, summary);
WriteOperation write = new WriteOperation("write", "writing file", endPoint2, InputField.of("read", "offset"), InputField.of("parse", "name"), InputField.of("parse", "address"), InputField.of("parse", "zip")); Collections.singletonList(InputField.of("read", "offset")), "offset"); write = new WriteOperation("write", "writing file", endPoint2, InputField.of("normalize", "offset"), InputField.of("parse", "name"), InputField.of("parse", "address"), InputField.of("parse", "zip"));
EndPoint source = read.getSource(); for (InputField inputField : inputFields) { if (inputField.getOrigin().equals(currentOperation.getName())) { sourceEndPointFields.add(new EndPointField(source, inputField.getName())); TransformOperation transform = (TransformOperation) currentOperation; for (InputField inputField : transform.getInputs()) { computeIncomingSummaryHelper(field, operationsMap.get(inputField.getOrigin()), currentOperation, summary);