private void addMergeOperation(Set<String> stageInputs, Map<String, Operation> processedOperations) { Set<String> sortedInputs = new TreeSet<>(stageInputs); String mergeOperationName = prefixedOperationName(Joiner.on(SEPARATOR).join(sortedInputs), "merge"); String mergeDescription = "Merged stages: " + Joiner.on(",").join(sortedInputs); if (processedOperations.containsKey(mergeOperationName)) { // it is possible that same stages act as an input to multiple stages. // we should still only add single merge operation for them return; } List<InputField> inputFields = new ArrayList<>(); for (String inputStage : sortedInputs) { List<String> parentStages = findParentStages(inputStage); for (String parentStage : parentStages) { Map<String, String> fieldOrigins = stageOutputsWithOrigins.get(parentStage); for (Map.Entry<String, String> fieldOrigin : fieldOrigins.entrySet()) { inputFields.add(InputField.of(fieldOrigin.getValue(), fieldOrigin.getKey())); } } } Set<String> outputs = new LinkedHashSet<>(); for (InputField inputField : inputFields) { outputs.add(inputField.getName()); } TransformOperation merge = new TransformOperation(mergeOperationName, mergeDescription, inputFields, new ArrayList<>(outputs)); processedOperations.put(merge.getName(), merge); }
/**
 * An operation that lists one of its own output fields ("parse.name") as an input forms a
 * self-loop; topological sort must reject it with {@link IllegalArgumentException}.
 */
@Test(expected = IllegalArgumentException.class)
public void testSelfReferentialOperations() {
  List<InputField> inputs = Arrays.asList(InputField.of("read", "body"), InputField.of("parse", "name"));
  TransformOperation parse = new TransformOperation("parse", "parse", inputs, "name", "address");
  FieldLineageInfo.getTopologicallySortedOperations(Collections.singleton(parse));
}
/**
 * A parse &lt;-&gt; normalize cycle must still be detected when the operations also reference
 * input origins ("nop1", "nop2", "nop3") that do not exist in the operation set.
 */
@Test(expected = IllegalArgumentException.class)
public void testCycleWithNonExistentOperationNames() {
  ReadOperation read =
      new ReadOperation("read", "read", EndPoint.of("ns", "file1"), "offset", "body");
  TransformOperation parse = new TransformOperation(
      "parse", "parse",
      Arrays.asList(
          InputField.of("read", "body"),
          InputField.of("normalize", "name"),
          InputField.of("nop1", "field1")),
      "name", "address");
  TransformOperation normalize = new TransformOperation(
      "normalize", "normalize",
      Arrays.asList(InputField.of("parse", "name"), InputField.of("nop2", "field2")),
      "name");
  WriteOperation write = new WriteOperation(
      "write", "writing to another file", EndPoint.of("ns", "file2"),
      Arrays.asList(
          InputField.of("normalize", "name"),
          InputField.of("parse", "address"),
          InputField.of("nop3", "field3")));

  FieldLineageInfo.getTopologicallySortedOperations(
      new HashSet<>(Arrays.asList(parse, read, normalize, write)));
}
/**
 * parse consumes normalize.name while normalize consumes parse.name — a two-operation cycle
 * that topological sort must reject with {@link IllegalArgumentException}.
 */
@Test(expected = IllegalArgumentException.class)
public void testCycle() {
  ReadOperation read =
      new ReadOperation("read", "read", EndPoint.of("ns", "file1"), "offset", "body");
  TransformOperation parse = new TransformOperation(
      "parse", "parse",
      Arrays.asList(InputField.of("read", "body"), InputField.of("normalize", "name")),
      "name", "address");
  TransformOperation normalize = new TransformOperation(
      "normalize", "normalize",
      Collections.singletonList(InputField.of("parse", "name")),
      "name");
  WriteOperation write = new WriteOperation(
      "write", "writing to another file", EndPoint.of("ns", "file2"),
      Arrays.asList(InputField.of("normalize", "name"), InputField.of("parse", "address")));

  FieldLineageInfo.getTopologicallySortedOperations(
      new HashSet<>(Arrays.asList(parse, read, normalize, write)));
}
Collections.singletonList(InputField.of("read", "body")), "first_name", "last_name"); Arrays.asList(InputField.of("parse", "first_name"), InputField.of("parse", "last_name")), "name"); writeInput.add(InputField.of("read", "offset")); writeInput.add(InputField.of("concat", "name")); writeInput.add(InputField.of("read", "file_name"));
WriteOperation write = new WriteOperation("write", "writing file", endPoint2, InputField.of("read", "offset"), InputField.of("parse", "name"), InputField.of("parse", "address"), InputField.of("parse", "zip")); Collections.singletonList(InputField.of("read", "offset")), "offset"); write = new WriteOperation("write", "writing file", endPoint2, InputField.of("normalize", "offset"), InputField.of("parse", "name"), InputField.of("parse", "address"), InputField.of("parse", "zip"));
ReadOperation read = new ReadOperation("read", "some read", EndPoint.of("endpoint1"), "offset", "body"); TransformOperation parse = new TransformOperation("parse", "parse body", Collections.singletonList(InputField.of("read", "body")), "name", "address"); WriteOperation write = new WriteOperation("write", "write data", EndPoint.of("ns", "endpoint2"), Arrays.asList(InputField.of("read", "offset"), InputField.of("parse", "name"), InputField.of("parse", "body"))); Arrays.asList(InputField.of("read", "offset"), InputField.of("parse", "name"), InputField.of("parse", "body"))); operations.add(anotherWrite); operations.add(parse);
InputField inputField = InputField.of(readOperation.getName(), sourceField.getField()); if (containsInputField(outgoingOperation, inputField)) { computeOutgoing(outgoingOperation, visitedOperations);
ReadOperation read = new ReadOperation("read", "some read", EndPoint.of("endpoint1"), "offset", "body"); TransformOperation parse = new TransformOperation("parse", "parse body", Collections.singletonList(InputField.of("read", "body")), "name", "address"); WriteOperation write = new WriteOperation("write", "write data", EndPoint.of("ns", "endpoint2"), Arrays.asList(InputField.of("read", "offset"), InputField.of("parse", "name"), InputField.of("parse", "body"))); Arrays.asList(InputField.of("read", "offset"), InputField.of("parse", "name"), InputField.of("parse", "body"))); Arrays.asList(InputField.of("invalid", "body"), InputField.of("anotherinvalid", "body")), "name", "address");
InputField inputField = InputField.of(readOperation.getName(), sourceField.getField()); if (containsInputField(outgoingOperation, inputField)) { computeOutgoing(outgoingOperation, visitedOperations);
Collections.singletonList(InputField.of("read", "body")), "name", "address"); TransformOperation normalize = new TransformOperation("normalize", "normalize descr", Collections.singletonList(InputField.of("parse", "address")), "address"); List<InputField> writeInputs = new ArrayList<>(); writeInputs.add(InputField.of("parse", "name")); writeInputs.add(InputField.of("normalize", "address")); WriteOperation write = new WriteOperation("write", "write descr", EndPoint.of("ns", "output"), writeInputs);
@Test public void testDisjointBranches() { // read1 -----> write1 // read2 -----> write2 ReadOperation read1 = new ReadOperation("read1", "read descr", EndPoint.of("ns", "input1"), "offset", "body"); WriteOperation write1 = new WriteOperation("write1", "write descr", EndPoint.of("ns", "output"), InputField.of("read1", "offset")); ReadOperation read2 = new ReadOperation("read2", "read descr", EndPoint.of("ns", "input2"), "offset", "body"); WriteOperation write2 = new WriteOperation("write2", "write descr", EndPoint.of("ns", "output"), InputField.of("read2", "offset")); Set<Operation> operations = new LinkedHashSet<>(); operations.add(write1); operations.add(write2); operations.add(read2); operations.add(read1); List<Operation> topologicallySortedOperations = FieldLineageInfo.getTopologicallySortedOperations(operations); assertBefore(topologicallySortedOperations, read1, write1); assertBefore(topologicallySortedOperations, read2, write2); }
Collections.singletonList(InputField.of("read", "body")), "name", "address"); TransformOperation normalize = new TransformOperation("normalize", "normalize descr", Collections.singletonList(InputField.of("parse", "address")), "address"); List<InputField> writeInputs = new ArrayList<>(); writeInputs.add(InputField.of("read", "offset")); writeInputs.add(InputField.of("parse", "name")); writeInputs.add(InputField.of("normalize", "address")); WriteOperation write = new WriteOperation("write", "write descr", EndPoint.of("ns", "output"), writeInputs);
ReadOperation read = new ReadOperation("read", "some read", EndPoint.of("ns", "endpoint1"), "offset", "body"); TransformOperation parse = new TransformOperation("parse", "parse body", Collections.singletonList(InputField.of("read", "body")), "name", "address"); WriteOperation write = new WriteOperation("write", "write data", EndPoint.of("ns", "endpoint2"), Arrays.asList(InputField.of("read", "offset"), InputField.of("parse", "name"), InputField.of("parse", "address"))); operations2.add(parse); TransformOperation normalize = new TransformOperation("normalize", "normalize address", Collections.singletonList(InputField.of("parse", "address")), "address"); operations2.add(normalize); WriteOperation anotherWrite = new WriteOperation("anotherwrite", "write data", EndPoint.of("ns", "endpoint2"), Arrays.asList(InputField.of("read", "offset"), InputField.of("parse", "name"), InputField.of("normalize", "address"))); operations2.add(anotherWrite); FieldLineageInfo info2 = new FieldLineageInfo(operations2);
Collections.singletonList(InputField.of("read", "body")), "id", "name", "address", "zip"); WriteOperation infoWrite = new WriteOperation("infoWrite", "writing info", info, Arrays.asList(InputField.of("parse", "id"), InputField.of("parse", "name"))); WriteOperation locationWrite = new WriteOperation("locationWrite", "writing location", location, Arrays.asList(InputField.of("parse", "address"), InputField.of("parse", "zip")));
Arrays.asList(InputField.of("read1", "offset"), InputField.of("read2", "offset"), InputField.of("read1", "body"), InputField.of("read2", "body")), "offset", "body"); Collections.singletonList(InputField.of("merge", "body")), "name", "address"); Arrays.asList(InputField.of("merge", "offset"), InputField.of("parse", "name"), InputField.of("parse", "address")));
inputFields.add(InputField.of("read", "offset")); inputFields.add(InputField.of("parse", "name")); inputFields.add(InputField.of("parse", "address")); inputFields.add(InputField.of("parse", "zip")); FieldOperationOutput.of(Arrays.asList("offset", "body")))); FieldOperationInput input = FieldOperationInput.of(Collections.singletonList(InputField.of("read", "offset"))); FieldOperationOutput output = FieldOperationOutput.of(Collections.singletonList("offset")); fieldOperationInfos.add(new FieldOperationInfo("normalize", "normalizing offset", input, output)); inputFields.add(InputField.of("normalize", "offset")); inputFields.add(InputField.of("parse", "name")); inputFields.add(InputField.of("parse", "address")); inputFields.add(InputField.of("parse", "zip"));
ReadOperation read = new ReadOperation("read", "some read", EndPoint.of("ns1", "endpoint1"), "offset", "body"); WriteOperation write = new WriteOperation("write", "some write", EndPoint.of("ns", "endpoint3"), InputField.of("read", "body")); "offset", "body"); WriteOperation anotherWrite = new WriteOperation("anotherWrite", "another write", EndPoint.of("ns", "endpoint3"), InputField.of("anotherRead", "body")); operations.add(anotherRead); operations.add(anotherWrite);