/**
 * Builds the data set of instances from the text file at {@code pathFileData}.
 *
 * <p>The header is loaded first when {@code attributes} has not been set yet. Empty lines,
 * lines starting with {@code %}, and lines starting with {@code @attribute},
 * {@code @relation} or {@code @data} are dropped; each remaining line is mapped by a
 * {@link DataInstanceBuilder}. The builder receives the relation name through its
 * configuration and the attributes through a broadcast set.
 *
 * @param env execution environment used to create the sources
 * @return the mapped data set, one element per retained line
 */
private DataSet<DataInstance> loadDataSet(ExecutionEnvironment env) {
  if (attributes == null) {
    this.loadHeader(env);
  }

  // Configuration handed to the builder; carries the relation name.
  Configuration builderConfig = new Configuration();
  builderConfig.setString(DataFlinkLoader.RELATION_NAME, this.relationName);

  DataSet<Attributes> attributesBroadcast = env.fromElements(attributes);
  DataSource<String> rawLines = env.readTextFile(pathFileData);

  // NOTE(review): the prefix checks are case-sensitive and do not trim leading
  // whitespace - confirm that this matches how the header loader reads the same file.
  DataSet<String> instanceLines = rawLines
      .filter(text -> !text.isEmpty())
      .filter(text -> !text.startsWith("%"))
      .filter(text -> !text.startsWith("@attribute"))
      .filter(text -> !text.startsWith("@relation"))
      .filter(text -> !text.startsWith("@data"));

  // The broadcast set name is suffixed with the relation name.
  String broadcastName = DataFlinkLoader.ATTRIBUTES_NAME + "_" + this.relationName;

  return instanceLines
      .map(new DataInstanceBuilder(isNormalize()))
      .withParameters(builderConfig)
      .withBroadcastSet(attributesBroadcast, broadcastName);
}
/**
 * Builds a logical graph from the vertices and edges that belong to the
 * starting graph and to no other graph of the given collection.
 *
 * @param collection input collection
 * @return logical graph holding the elements exclusive to the starting graph
 */
@Override
public LogicalGraph execute(GraphCollection collection) {
  // Ids of the graph heads selected by ByDifferentId(startId); broadcast to
  // both NotInGraphsBroadcast filters below.
  DataSet<GradoopId> otherGraphIds = collection.getGraphHeads()
    .filter(new ByDifferentId<GraphHead>(startId))
    .map(new Id<GraphHead>());

  // Vertices: restrict to the starting graph, then drop those in other graphs.
  DataSet<Vertex> startVertices = collection.getVertices()
    .filter(new InGraph<Vertex>(startId));
  DataSet<Vertex> exclusiveVertices = startVertices
    .filter(new NotInGraphsBroadcast<Vertex>())
    .withBroadcastSet(otherGraphIds, NotInGraphsBroadcast.GRAPH_IDS);

  // Edges: same two-step selection as for the vertices.
  DataSet<Edge> startEdges = collection.getEdges()
    .filter(new InGraph<Edge>(startId));
  DataSet<Edge> exclusiveEdges = startEdges
    .filter(new NotInGraphsBroadcast<Edge>())
    .withBroadcastSet(otherGraphIds, NotInGraphsBroadcast.GRAPH_IDS);

  return collection.getConfig().getLogicalGraphFactory()
    .fromDataSets(exclusiveVertices, exclusiveEdges);
}
/**
 * Builds a logical graph from the vertices and edges that belong to the
 * starting graph and to no other graph of the given collection.
 *
 * @param collection input collection
 * @return logical graph holding the elements exclusive to the starting graph
 */
@Override
public LogicalGraph execute(GraphCollection collection) {
  // Ids of the graph heads selected by ByDifferentId(startId); broadcast to
  // both NotInGraphsBroadcast filters below.
  DataSet<GradoopId> otherGraphIds = collection.getGraphHeads()
    .filter(new ByDifferentId<GraphHead>(startId))
    .map(new Id<GraphHead>());

  // Vertices: restrict to the starting graph, then drop those in other graphs.
  DataSet<Vertex> startVertices = collection.getVertices()
    .filter(new InGraph<Vertex>(startId));
  DataSet<Vertex> exclusiveVertices = startVertices
    .filter(new NotInGraphsBroadcast<Vertex>())
    .withBroadcastSet(otherGraphIds, NotInGraphsBroadcast.GRAPH_IDS);

  // Edges: same two-step selection as for the vertices.
  DataSet<Edge> startEdges = collection.getEdges()
    .filter(new InGraph<Edge>(startId));
  DataSet<Edge> exclusiveEdges = startEdges
    .filter(new NotInGraphsBroadcast<Edge>())
    .withBroadcastSet(otherGraphIds, NotInGraphsBroadcast.GRAPH_IDS);

  return collection.getConfig().getLogicalGraphFactory()
    .fromDataSets(exclusiveVertices, exclusiveEdges);
}
}
public static void main(String[] args) throws Exception { // parse parameters ParameterTool params = ParameterTool.fromArgs(args); String input = params.getRequired("input"); // obtain an execution environment ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment(); // read messageId, sender, and reply-to fields from the input data set DataSet<Tuple3<String, String, String>> mails = env.readCsvFile(input) .lineDelimiter(MBoxParser.MAIL_RECORD_DELIM) .fieldDelimiter(MBoxParser.MAIL_FIELD_DELIM) // messageId at position 0, sender at 2, reply-to at 5 .includeFields("101001") .types(String.class, String.class, String.class); // extract email addresses and filter out mails from bots DataSet<Tuple3<String, String, String>> addressMails = mails .map(new EmailExtractor()) .filter(new ExcludeEmailFilter("git@git.apache.org")) .filter(new ExcludeEmailFilter("jira@apache.org")); // construct reply connections by joining on messageId and reply-To DataSet<Tuple2<String, String>> replyConnections = addressMails .join(addressMails).where(2).equalTo(0).projectFirst(1).projectSecond(1); // count reply connections for each pair of email addresses replyConnections .groupBy(0, 1).reduceGroup(new ConnectionCounter()) .print(); }