@GET @Path("/namespaces/{namespace-id}/datasets/{dataset-id}/lineage/fields/{field-name}/operations") public void datasetFieldLineageDetails(HttpRequest request, HttpResponder responder, @PathParam("namespace-id") String namespaceId, @PathParam("dataset-id") String datasetId, @PathParam("field-name") String field, @QueryParam("direction") @DefaultValue("both") String directionStr, @QueryParam("start") String startStr, @QueryParam("end") String endStr) throws BadRequestException { TimeRange range = parseRange(startStr, endStr); Constants.FieldLineage.Direction direction = parseDirection(directionStr); EndPointField endPointField = new EndPointField(EndPoint.of(namespaceId, datasetId), field); FieldLineageDetails details = fieldLineageAdmin.getOperationDetails(direction, endPointField, range.getStart(), range.getEnd()); responder.sendJson(HttpResponseStatus.OK, GSON.toJson(details)); }
@GET @Path("/namespaces/{namespace-id}/datasets/{dataset-id}/lineage/fields/{field-name}/operations") public void datasetFieldLineageDetails(HttpRequest request, HttpResponder responder, @PathParam("namespace-id") String namespaceId, @PathParam("dataset-id") String datasetId, @PathParam("field-name") String field, @QueryParam("direction") @DefaultValue("both") String directionStr, @QueryParam("start") String startStr, @QueryParam("end") String endStr) throws BadRequestException { TimeRange range = parseRange(startStr, endStr); Constants.FieldLineage.Direction direction = parseDirection(directionStr); EndPointField endPointField = new EndPointField(EndPoint.of(namespaceId, datasetId), field); FieldLineageDetails details = fieldLineageAdmin.getOperationDetails(direction, endPointField, range.getStart(), range.getEnd()); responder.sendJson(HttpResponseStatus.OK, GSON.toJson(details)); }
@GET @Path("/namespaces/{namespace-id}/datasets/{dataset-id}/lineage/fields") public void datasetFields(HttpRequest request, HttpResponder responder, @PathParam("namespace-id") String namespaceId, @PathParam("dataset-id") String datasetId, @QueryParam("start") String startStr, @QueryParam("end") String endStr, @QueryParam("prefix") String prefix, @QueryParam("includeCurrent") boolean includeCurrent) throws BadRequestException, IOException { TimeRange range = parseRange(startStr, endStr); Set<Field> result = fieldLineageAdmin.getFields(EndPoint.of(namespaceId, datasetId), range.getStart(), range.getEnd(), prefix, includeCurrent); // CDAP-14168: From 5.1 this endpoint supports returning a Set of Field object rather Set of String. For backward // compatibility in 5.1 the default behavior is to return a Set of String (field names). This default behavior // can be overridden by passing the query parameter 'includeCurrent' set to 'true' which will return set of // Field object. if (includeCurrent) { responder.sendJson(HttpResponseStatus.OK, GSON.toJson(result)); } else { responder.sendJson(HttpResponseStatus.OK, GSON.toJson(result.stream().map(Field::getName).collect(Collectors.toSet()))); } }
@GET @Path("/namespaces/{namespace-id}/datasets/{dataset-id}/lineage/fields/{field-name}") public void datasetFieldLineageSummary(HttpRequest request, HttpResponder responder, @PathParam("namespace-id") String namespaceId, @PathParam("dataset-id") String datasetId, @PathParam("field-name") String field, @QueryParam("direction") String directionStr, @QueryParam("start") String startStr, @QueryParam("end") String endStr) throws BadRequestException { TimeRange range = parseRange(startStr, endStr); Constants.FieldLineage.Direction direction = parseDirection(directionStr); EndPointField endPointField = new EndPointField(EndPoint.of(namespaceId, datasetId), field); FieldLineageSummary summary = fieldLineageAdmin.getSummary(direction, endPointField, range.getStart(), range.getEnd()); responder.sendJson(HttpResponseStatus.OK, GSON.toJson(summary)); }
@GET @Path("/namespaces/{namespace-id}/datasets/{dataset-id}/lineage/fields") public void datasetFields(HttpRequest request, HttpResponder responder, @PathParam("namespace-id") String namespaceId, @PathParam("dataset-id") String datasetId, @QueryParam("start") String startStr, @QueryParam("end") String endStr, @QueryParam("prefix") String prefix, @QueryParam("includeCurrent") boolean includeCurrent) throws BadRequestException, IOException { TimeRange range = parseRange(startStr, endStr); Set<Field> result = fieldLineageAdmin.getFields(EndPoint.of(namespaceId, datasetId), range.getStart(), range.getEnd(), prefix, includeCurrent); // CDAP-14168: From 5.1 this endpoint supports returning a Set of Field object rather Set of String. For backward // compatibility in 5.1 the default behavior is to return a Set of String (field names). This default behavior // can be overridden by passing the query parameter 'includeCurrent' set to 'true' which will return set of // Field object. if (includeCurrent) { responder.sendJson(HttpResponseStatus.OK, GSON.toJson(result)); } else { responder.sendJson(HttpResponseStatus.OK, GSON.toJson(result.stream().map(Field::getName).collect(Collectors.toSet()))); } }
@GET @Path("/namespaces/{namespace-id}/datasets/{dataset-id}/lineage/fields/{field-name}") public void datasetFieldLineageSummary(HttpRequest request, HttpResponder responder, @PathParam("namespace-id") String namespaceId, @PathParam("dataset-id") String datasetId, @PathParam("field-name") String field, @QueryParam("direction") String directionStr, @QueryParam("start") String startStr, @QueryParam("end") String endStr) throws BadRequestException { TimeRange range = parseRange(startStr, endStr); Constants.FieldLineage.Direction direction = parseDirection(directionStr); EndPointField endPointField = new EndPointField(EndPoint.of(namespaceId, datasetId), field); FieldLineageSummary summary = fieldLineageAdmin.getSummary(direction, endPointField, range.getStart(), range.getEnd()); responder.sendJson(HttpResponseStatus.OK, GSON.toJson(summary)); }
@Override
public void prepareRun(BatchSourceContext context) throws DatasetManagementException {
  super.prepareRun(context);
  Schema schema = tableConfig.getSchema();
  if (schema != null && schema.getFields() != null) {
    FieldOperation operation = new FieldReadOperation("Read", "Read from Table dataset",
                                                      EndPoint.of(context.getNamespace(), tableConfig.getName()),
                                                      schema.getFields().stream().map(Schema.Field::getName)
                                                        .collect(Collectors.toList()));
    context.record(Collections.singletonList(operation));
  }
}
@Override
public void prepareRun(BatchSinkContext context) throws DatasetManagementException {
  super.prepareRun(context);
  String schemaString = tableSinkConfig.getSchemaStr();
  if (schemaString != null) {
    try {
      Schema schema = Schema.parseJson(schemaString);
      if (schema.getFields() != null) {
        FieldOperation operation = new FieldWriteOperation("Write", "Wrote to CDAP Table",
                                                           EndPoint.of(context.getNamespace(), tableSinkConfig.getName()),
                                                           schema.getFields().stream().map(Schema.Field::getName)
                                                             .collect(Collectors.toList()));
        context.record(Collections.singletonList(operation));
      }
    } catch (IOException e) {
      throw new IllegalStateException("Failed to parse schema.", e);
    }
  }
}
private Set<EndPointField> summary() {
  Set<EndPointField> endPointFields = new HashSet<>();
  EndPoint endPoint1 = EndPoint.of("ns", "file");
  EndPoint endPoint2 = EndPoint.of("ns", "anotherfile");
  endPointFields.add(new EndPointField(endPoint1, "a"));
  endPointFields.add(new EndPointField(endPoint1, "b"));
  endPointFields.add(new EndPointField(endPoint1, "c"));
  endPointFields.add(new EndPointField(endPoint2, "x"));
  endPointFields.add(new EndPointField(endPoint2, "y"));
  endPointFields.add(new EndPointField(endPoint2, "z"));
  return endPointFields;
}
@Test
public void testDisjointBranches() {
  // read1 -----> write1
  // read2 -----> write2
  ReadOperation read1 = new ReadOperation("read1", "read descr", EndPoint.of("ns", "input1"), "offset", "body");
  WriteOperation write1 = new WriteOperation("write1", "write descr", EndPoint.of("ns", "output"),
                                             InputField.of("read1", "offset"));
  ReadOperation read2 = new ReadOperation("read2", "read descr", EndPoint.of("ns", "input2"), "offset", "body");
  WriteOperation write2 = new WriteOperation("write2", "write descr", EndPoint.of("ns", "output"),
                                             InputField.of("read2", "offset"));

  Set<Operation> operations = new LinkedHashSet<>();
  operations.add(write1);
  operations.add(write2);
  operations.add(read2);
  operations.add(read1);

  List<Operation> topologicallySortedOperations = FieldLineageInfo.getTopologicallySortedOperations(operations);
  assertBefore(topologicallySortedOperations, read1, write1);
  assertBefore(topologicallySortedOperations, read2, write2);
}
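// The assertBefore helper used above is not part of this excerpt. A minimal sketch,
// assuming it simply verifies the relative order of two operations in the sorted list:
private void assertBefore(List<Operation> operations, Operation first, Operation second) {
  int firstIndex = operations.indexOf(first);
  int secondIndex = operations.indexOf(second);
  Assert.assertTrue(firstIndex >= 0);
  Assert.assertTrue(secondIndex >= 0);
  Assert.assertTrue(firstIndex < secondIndex);
}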
@Override
public void prepareRun(BatchSourceContext context) throws Exception {
  InputFormatProvider inputFormatProvider = context.newPluginInstance(FORMAT_PLUGIN_ID);
  DatasetProperties datasetProperties = createProperties(inputFormatProvider);

  // The dataset must still be created if macros were provided at configure time.
  if (!context.datasetExists(config.getName())) {
    context.createDataset(config.getName(), PartitionedFileSet.class.getName(), datasetProperties);
  }

  PartitionedFileSet partitionedFileSet = context.getDataset(config.getName());
  SnapshotFileSet snapshotFileSet = new SnapshotFileSet(partitionedFileSet);

  Map<String, String> arguments = new HashMap<>(datasetProperties.getProperties());
  if (config.getFileProperties() != null) {
    arguments = GSON.fromJson(config.getFileProperties(), MAP_TYPE);
  }

  Schema schema = config.getSchema();
  if (schema.getFields() != null) {
    String formatName = getInputFormatName();
    FieldOperation operation = new FieldReadOperation("Read",
                                                      String.format("Read from SnapshotFile source in %s format.",
                                                                    formatName),
                                                      EndPoint.of(context.getNamespace(), config.getName()),
                                                      schema.getFields().stream().map(Schema.Field::getName)
                                                        .collect(Collectors.toList()));
    context.record(Collections.singletonList(operation));
  }

  context.setInput(Input.ofDataset(config.getName(), snapshotFileSet.getInputArguments(arguments)));
}
@Test(expected = IllegalArgumentException.class)
public void testCycle() {
  EndPoint readEndPoint = EndPoint.of("ns", "file1");
  EndPoint writeEndPoint = EndPoint.of("ns", "file2");

  // 'parse' consumes the "name" output of 'normalize', while 'normalize' consumes the
  // "name" output of 'parse', so the operations form a cycle and cannot be sorted.
  ReadOperation read = new ReadOperation("read", "read", readEndPoint, "offset", "body");
  TransformOperation parse = new TransformOperation("parse", "parse",
                                                    Arrays.asList(InputField.of("read", "body"),
                                                                  InputField.of("normalize", "name")),
                                                    "name", "address");
  TransformOperation normalize = new TransformOperation("normalize", "normalize",
                                                        Collections.singletonList(InputField.of("parse", "name")),
                                                        "name");
  WriteOperation write = new WriteOperation("write", "writing to another file", writeEndPoint,
                                            Arrays.asList(InputField.of("normalize", "name"),
                                                          InputField.of("parse", "address")));

  List<Operation> operations = new ArrayList<>();
  operations.add(parse);
  operations.add(read);
  operations.add(normalize);
  operations.add(write);
  FieldLineageInfo.getTopologicallySortedOperations(new HashSet<>(operations));
}
@Override
public void prepareRun(BatchSourceContext context) throws DatasetManagementException, InstantiationException {
  config.validate();
  InputFormatProvider inputFormatProvider = context.newPluginInstance(FORMAT_PLUGIN_ID);
  DatasetProperties datasetProperties = createProperties(inputFormatProvider);

  // If macros were provided at runtime, the dataset still needs to be created.
  if (!context.datasetExists(config.getName())) {
    String tpfsName = config.getName();
    context.createDataset(tpfsName, TimePartitionedFileSet.class.getName(), datasetProperties);
  }

  Schema schema = config.getSchema();
  if (schema.getFields() != null) {
    String formatName = getInputFormatName();
    FieldOperation operation = new FieldReadOperation("Read",
                                                      String.format("Read from TimePartitionedFileSet in %s format.",
                                                                    formatName),
                                                      EndPoint.of(context.getNamespace(), config.getName()),
                                                      schema.getFields().stream().map(Schema.Field::getName)
                                                        .collect(Collectors.toList()));
    context.record(Collections.singletonList(operation));
  }

  // Read the partition window [logicalStartTime - delay - duration, logicalStartTime - delay).
  long duration = TimeParser.parseDuration(config.getDuration());
  long delay = Strings.isNullOrEmpty(config.getDelay()) ? 0 : TimeParser.parseDuration(config.getDelay());
  long endTime = context.getLogicalStartTime() - delay;
  long startTime = endTime - duration;

  Map<String, String> sourceArgs = Maps.newHashMap(datasetProperties.getProperties());
  TimePartitionedFileSetArguments.setInputStartTime(sourceArgs, startTime);
  TimePartitionedFileSetArguments.setInputEndTime(sourceArgs, endTime);
  context.setInput(Input.ofDataset(config.getName(), sourceArgs));
}
@Test(expected = IllegalArgumentException.class)
public void testCycleWithNonExistentOperationNames() {
  EndPoint readEndPoint = EndPoint.of("ns", "file1");
  EndPoint writeEndPoint = EndPoint.of("ns", "file2");

  ReadOperation read = new ReadOperation("read", "read", readEndPoint, "offset", "body");
  TransformOperation parse = new TransformOperation("parse", "parse",
                                                    Arrays.asList(InputField.of("read", "body"),
                                                                  InputField.of("normalize", "name"),
                                                                  InputField.of("nop1", "field1")),
                                                    "name", "address");
  TransformOperation normalize = new TransformOperation("normalize", "normalize",
                                                        Arrays.asList(InputField.of("parse", "name"),
                                                                      InputField.of("nop2", "field2")),
                                                        "name");
  WriteOperation write = new WriteOperation("write", "writing to another file", writeEndPoint,
                                            Arrays.asList(InputField.of("normalize", "name"),
                                                          InputField.of("parse", "address"),
                                                          InputField.of("nop3", "field3")));

  List<Operation> operations = new ArrayList<>();
  operations.add(parse);
  operations.add(read);
  operations.add(normalize);
  operations.add(write);
  FieldLineageInfo.getTopologicallySortedOperations(new HashSet<>(operations));
}
@Test
public void testValidOperations() {
  ReadOperation read = new ReadOperation("read", "some read", EndPoint.of("endpoint1"), "offset", "body");
  TransformOperation parse = new TransformOperation("parse", "parse body",
                                                    Collections.singletonList(InputField.of("read", "body")),
                                                    "name", "address");
  // Both truncated argument lists below are completed with InputField.of("parse", "address"),
  // matching the outputs of the 'parse' transform above.
  WriteOperation write = new WriteOperation("write", "write data", EndPoint.of("ns", "endpoint2"),
                                            Arrays.asList(InputField.of("read", "offset"),
                                                          InputField.of("parse", "name"),
                                                          InputField.of("parse", "address")));
  // A second write with the same name "write" but a different endpoint.
  WriteOperation anotherWrite = new WriteOperation("write", "write data", EndPoint.of("myns", "endpoint2"),
                                                   Arrays.asList(InputField.of("read", "offset"),
                                                                 InputField.of("parse", "name"),
                                                                 InputField.of("parse", "address")));
  // The assertions from the original test are not included in this excerpt.
}
// Excerpt reconstructed: the original fragment referenced 'operations' without declaring it,
// and ended before 'anotherWrite' was added.
ReadOperation read = new ReadOperation("read", "some read", EndPoint.of("ns1", "endpoint1"), "offset", "body");
WriteOperation write = new WriteOperation("write", "some write", EndPoint.of("ns", "endpoint3"),
                                          InputField.of("read", "body"));

List<Operation> operations = new ArrayList<>();
operations.add(read);
operations.add(write);
final FieldLineageInfo info1 = new FieldLineageInfo(operations);

ReadOperation anotherRead = new ReadOperation("anotherRead", "another read", EndPoint.of("ns1", "endpoint2"),
                                              "offset", "body");
WriteOperation anotherWrite = new WriteOperation("anotherWrite", "another write", EndPoint.of("ns", "endpoint3"),
                                                 InputField.of("anotherRead", "body"));
operations.add(anotherRead);
operations.add(anotherWrite);

EndPoint source1 = EndPoint.of("ns1", "endpoint1");
EndPoint source2 = EndPoint.of("ns1", "endpoint2");
EndPoint destination = EndPoint.of("ns", "endpoint3");
@Test
public void testFields() throws Exception {
  FieldLineageAdmin fieldLineageAdmin = new FieldLineageAdmin(new FakeFieldLineageReader(getFieldNames(),
                                                                                         Collections.emptySet(),
                                                                                         Collections.emptySet()),
                                                              metadataAdmin);
  EndPoint endPoint = EndPoint.of("ns", "file");

  // Test all fields.
  Assert.assertEquals(getFields(getFieldNames()),
                      fieldLineageAdmin.getFields(endPoint, 0, Long.MAX_VALUE, null, false));

  // Test fields prefixed with the string "add".
  Assert.assertEquals(new HashSet<>(Arrays.asList(new Field("address", true),
                                                  new Field("address_original", true))),
                      fieldLineageAdmin.getFields(endPoint, 0, Long.MAX_VALUE, "add", false));
}
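// getFieldNames and getFields are not part of this excerpt. A minimal sketch, assuming
// getFieldNames returns the field names known to the fake reader (the exact set beyond
// "address" and "address_original" is illustrative) and getFields wraps each name in a
// Field marked as having lineage:
private Set<String> getFieldNames() {
  return new HashSet<>(Arrays.asList("name", "address", "address_original", "offset", "body"));
}

private Set<Field> getFields(Set<String> fieldNames) {
  return fieldNames.stream().map(name -> new Field(name, true)).collect(Collectors.toSet());
}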
@Test
public void testSummary() {
  FieldLineageAdmin fieldLineageAdmin = new FieldLineageAdmin(new FakeFieldLineageReader(Collections.emptySet(),
                                                                                         summary(),
                                                                                         Collections.emptySet()),
                                                              metadataAdmin);
  EndPoint endPoint = EndPoint.of("ns", "file");

  DatasetField datasetField = new DatasetField(new DatasetId("ns", "file"),
                                               new HashSet<>(Arrays.asList("a", "b", "c")));
  DatasetField anotherDatasetField = new DatasetField(new DatasetId("ns", "anotherfile"),
                                                      new HashSet<>(Arrays.asList("x", "y", "z")));
  Set<DatasetField> expected = new HashSet<>();
  expected.add(datasetField);
  expected.add(anotherDatasetField);

  // The input arguments to getSummary below do not matter, since the returned data is mocked.
  FieldLineageSummary summary = fieldLineageAdmin.getSummary(Constants.FieldLineage.Direction.INCOMING,
                                                             new EndPointField(endPoint, "somefield"),
                                                             0, Long.MAX_VALUE);
  Assert.assertEquals(expected, summary.getIncoming());
  Assert.assertNull(summary.getOutgoing());

  summary = fieldLineageAdmin.getSummary(Constants.FieldLineage.Direction.OUTGOING,
                                         new EndPointField(endPoint, "somefield"), 0, Long.MAX_VALUE);
  Assert.assertEquals(expected, summary.getOutgoing());
  Assert.assertNull(summary.getIncoming());

  summary = fieldLineageAdmin.getSummary(Constants.FieldLineage.Direction.BOTH,
                                         new EndPointField(endPoint, "somefield"), 0, Long.MAX_VALUE);
  Assert.assertEquals(expected, summary.getOutgoing());
  Assert.assertEquals(expected, summary.getIncoming());
}