/**
 * Assembles the pipe chain that turns a stream of keys into bloom filter parts.
 *
 * <p>The value of {@code keyBytesField} is fed to {@code GetIndices}, which emits the fields
 * {@code split}, {@code index} and {@code hash_num}. Those tuples are passed through
 * {@code Unique.FilterPartialDuplicates}, grouped by {@code split}, and every group's
 * {@code index}/{@code hash_num} values are aggregated by {@code CreateBloomFilterFromIndices}.
 * The three location/id arguments are recorded as properties on the step config of the final
 * pipe, which becomes the tail of this assembly.
 *
 * @param keys               pipe supplying the keys
 * @param bloomFilterID      stored under {@code BloomProps.TARGET_BLOOM_FILTER_ID}
 * @param approxCountPartsDir stored under {@code BloomProps.BLOOM_KEYS_COUNTS_DIR}
 * @param bloomPartsDir      stored under {@code BloomProps.BLOOM_FILTER_PARTS_DIR}
 * @param keyBytesField      name of the field in {@code keys} handed to {@code GetIndices}
 * @param hashFactory        factory passed to {@code GetIndices}
 * @throws IOException declared by the signature
 */
public CreateBloomFilter(Pipe keys, String bloomFilterID, String approxCountPartsDir, String bloomPartsDir, String keyBytesField, HashFunctionFactory hashFactory) throws IOException {
  super(keys);

  // Stage 1: derive (split, index, hash_num) from each key.
  Pipe indices = new Each(
      keys,
      new Fields(keyBytesField),
      new GetIndices(hashFactory),
      new Fields("split", "index", "hash_num"));

  // Stage 2: run the triples through the partial-duplicate filter.
  Pipe filteredIndices = new Each(
      indices,
      new Fields("split", "index", "hash_num"),
      new Unique.FilterPartialDuplicates());

  // Stage 3: one group per split, aggregated into a bloom filter part.
  Pipe groupedBySplit = new GroupBy(filteredIndices, new Fields("split"));
  Pipe filterParts = new Every(
      groupedBySplit,
      new Fields("index", "hash_num"),
      new CreateBloomFilterFromIndices(),
      Fields.ALL);

  // Record the output locations and target id on the step configuration.
  ConfigDef stepConfig = filterParts.getStepConfigDef();
  stepConfig.setProperty(BloomProps.BLOOM_FILTER_PARTS_DIR, bloomPartsDir);
  stepConfig.setProperty(BloomProps.BLOOM_KEYS_COUNTS_DIR, approxCountPartsDir);
  stepConfig.setProperty(BloomProps.TARGET_BLOOM_FILTER_ID, bloomFilterID);

  setTails(filterParts);
}
/**
 * Runs a flow that uses {@code Fields.SWAP} as the output selector of two {@code Each} pipes:
 * first to replace {@code line} with the parsed {@code ip}, then (after counting per
 * {@code ip}) to replace {@code ip} with {@code ipaddress}. The sink writes
 * {@code count} and {@code ipaddress}; the result is checked with
 * {@code validateLength( flow, 8, 2, pattern )}.
 */
@Test
public void testSwap() throws Exception
  {
  // Assembly: line -> ip (swap), group by ip, count, ip -> ipaddress (swap).
  Pipe head = new Pipe( "test" );
  Pipe parsed = new Each( head, new Fields( "line" ), new RegexParser( new Fields( "ip" ), "^[^ ]*" ), Fields.SWAP );
  Pipe grouped = new GroupBy( parsed, new Fields( "ip" ) );
  Pipe counted = new Every( grouped, new Fields( "ip" ), new Count( new Fields( "count" ) ) );
  Pipe renamed = new Each( counted, new Fields( "ip" ), new Identity( new Fields( "ipaddress" ) ), Fields.SWAP );

  Tap source = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileApache );
  Tap sink = getPlatform().getTextFile(
    new Fields( "offset", "line" ),
    new Fields( "count", "ipaddress" ),
    getOutputPath( "swap" ),
    SinkMode.REPLACE );

  Flow flow = getPlatform().getFlowConnector().connect( source, sink, renamed );
  flow.complete();

  // Two whitespace-separated numbers followed by a dotted quad.
  Pattern expectedLine = Pattern.compile( "^\\d+\\s\\d+\\s[\\d]{1,3}\\.[\\d]{1,3}\\.[\\d]{1,3}\\.[\\d]{1,3}$" );
  validateLength( flow, 8, 2, expectedLine );
  }
/**
 * Runs a flow that uses {@code Fields.SWAP} as the output selector of two {@code Each} pipes:
 * first to replace {@code line} with the parsed {@code ip}, then (after counting per
 * {@code ip}) to replace {@code ip} with {@code ipaddress}. The sink writes
 * {@code count} and {@code ipaddress}; the result is checked with
 * {@code validateLength( flow, 8, 2, pattern )}.
 */
@Test
public void testSwap() throws Exception
  {
  // Assembly: line -> ip (swap), group by ip, count, ip -> ipaddress (swap).
  Pipe head = new Pipe( "test" );
  Pipe parsed = new Each( head, new Fields( "line" ), new RegexParser( new Fields( "ip" ), "^[^ ]*" ), Fields.SWAP );
  Pipe grouped = new GroupBy( parsed, new Fields( "ip" ) );
  Pipe counted = new Every( grouped, new Fields( "ip" ), new Count( new Fields( "count" ) ) );
  Pipe renamed = new Each( counted, new Fields( "ip" ), new Identity( new Fields( "ipaddress" ) ), Fields.SWAP );

  Tap source = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileApache );
  Tap sink = getPlatform().getTextFile(
    new Fields( "offset", "line" ),
    new Fields( "count", "ipaddress" ),
    getOutputPath( "swap" ),
    SinkMode.REPLACE );

  Flow flow = getPlatform().getFlowConnector().connect( source, sink, renamed );
  flow.complete();

  // Two whitespace-separated numbers followed by a dotted quad.
  Pattern expectedLine = Pattern.compile( "^\\d+\\s\\d+\\s[\\d]{1,3}\\.[\\d]{1,3}\\.[\\d]{1,3}\\.[\\d]{1,3}$" );
  validateLength( flow, 8, 2, expectedLine );
  }
/**
 * Builds a single-tail assembly headed by a pipe named after the flow's first source:
 * the leading non-space run of {@code line} is parsed into {@code ip}, tuples are grouped
 * by {@code ip}, and a {@code Count} aggregator is applied with {@code ip} and
 * {@code count} selected as output.
 *
 * @param context supplies the flow whose first source name becomes the head pipe name
 * @return a one-element list holding the tail pipe
 */
@Override
public List<Pipe> resolveTails( Context context )
  {
  String firstSourceName = (String) context.getFlow().getSourceNames().get( 0 );

  Pipe head = new Pipe( firstSourceName );
  Pipe ips = new Each( head, new Fields( "line" ), new RegexParser( new Fields( "ip" ), "^[^ ]*" ), new Fields( "ip" ) );
  Pipe groupedByIp = new GroupBy( ips, new Fields( "ip" ) );
  Pipe counted = new Every( groupedByIp, new Count(), new Fields( "ip", "count" ) );

  return Arrays.asList( counted );
  }
}; // closes the enclosing anonymous class, whose opening lies outside this chunk
@Test public void testCoGroupWithResultGroupFields() throws Exception { Tap sourceLower = new Hfs( new TextLine( new Fields( "offset", "line" ) ), "foo" ); Tap sourceUpper = new Hfs( new TextLine( new Fields( "offset", "line" ) ), "bar" ); Map sources = new HashMap(); sources.put( "lower", sourceLower ); sources.put( "upper", sourceUpper ); Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " ); // using null pos so all fields are written Tap sink = new Hfs( new TextLine(), "/complex/cogroup/", SinkMode.REPLACE ); Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter ); Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), splitter ); Pipe splice = new CoGroup( pipeLower, new Fields( "num" ), pipeUpper, new Fields( "num" ), new Fields( "num1", "lhs", "num2", "rhs" ), new Fields( "somenum", "somenum2" ) ); splice = new Every( splice, new First( new Fields( "value" ) ), new Fields( "somenum", "value" ) ); Flow countFlow = getPlatform().getFlowConnector().connect( sources, sink, splice ); }
/**
 * Builds a single-tail assembly headed by a pipe named after the flow's first source:
 * the leading non-space run of {@code line} is parsed into {@code ip}, tuples are grouped
 * by {@code ip}, and a {@code Count} aggregator is applied with {@code ip} and
 * {@code count} selected as output.
 *
 * @param context supplies the flow whose first source name becomes the head pipe name
 * @return a one-element list holding the tail pipe
 */
@Override
public List<Pipe> resolveTails( Context context )
  {
  String firstSourceName = (String) context.getFlow().getSourceNames().get( 0 );

  Pipe head = new Pipe( firstSourceName );
  Pipe ips = new Each( head, new Fields( "line" ), new RegexParser( new Fields( "ip" ), "^[^ ]*" ), new Fields( "ip" ) );
  Pipe groupedByIp = new GroupBy( ips, new Fields( "ip" ) );
  Pipe counted = new Every( groupedByIp, new Count(), new Fields( "ip", "count" ) );

  return Arrays.asList( counted );
  }
}; // closes the enclosing anonymous class, whose opening lies outside this chunk
@Test public void testNone() throws Exception { Tap source = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileApache ); Tap sink = getPlatform().getTextFile( new Fields( "offset", "line" ), new Fields( "count", "ip" ), getOutputPath( "none" ), SinkMode.REPLACE ); Pipe pipe = new Pipe( "test" ); Function parser = new RegexParser( new Fields( "ip" ), "^[^ ]*" ); pipe = new Each( pipe, new Fields( "line" ), parser, Fields.ALL ); pipe = new Each( pipe, new Fields( "line" ), new NoOp(), Fields.SWAP ); // declares Fields.NONE pipe = new GroupBy( pipe, new Fields( "ip" ) ); pipe = new Every( pipe, new Fields( "ip" ), new Count( new Fields( "count" ) ) ); pipe = new Each( pipe, Fields.NONE, new Insert( new Fields( "ipaddress" ), "1.2.3.4" ), Fields.ALL ); Flow flow = getPlatform().getFlowConnector().connect( source, sink, pipe ); flow.complete(); validateLength( flow, 8, 2, Pattern.compile( "^\\d+\\s\\d+\\s[\\d]{1,3}\\.[\\d]{1,3}\\.[\\d]{1,3}\\.[\\d]{1,3}$" ) ); }
/**
 * Hands {@code verify(...)} an assembly whose {@code Every} uses {@code third} as its
 * output selector, a name that is not among the source fields ({@code first},
 * {@code second}).
 *
 * NOTE(review): presumably {@code verify} expects the planner to reject this assembly;
 * confirm against the helper's definition.
 */
@Test
public void testEveryOutResolver() throws Exception
  {
  Tap source = getPlatform().getTabDelimitedFile( new Fields( "first", "second" ), "input/path", SinkMode.KEEP );
  Tap sink = getPlatform().getTabDelimitedFile( new Fields( "third", "fourth" ), "output/path", SinkMode.REPLACE );

  Pipe head = new Pipe( "test" );
  Pipe grouped = new GroupBy( head, new Fields( "first" ) );
  Pipe counted = new Every( grouped, new Fields( "second" ), new Count(), new Fields( "third" ) );

  verify( source, sink, counted );
  }
@Test public void testCoGroupWithResultGroupFields() throws Exception { Tap sourceLower = new Hfs( new TextLine( new Fields( "offset", "line" ) ), "foo" ); Tap sourceUpper = new Hfs( new TextLine( new Fields( "offset", "line" ) ), "bar" ); Map sources = new HashMap(); sources.put( "lower", sourceLower ); sources.put( "upper", sourceUpper ); Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " ); // using null pos so all fields are written Tap sink = new Hfs( new TextLine(), "/complex/cogroup/", SinkMode.REPLACE ); Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter ); Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), splitter ); Pipe splice = new CoGroup( pipeLower, new Fields( "num" ), pipeUpper, new Fields( "num" ), new Fields( "num1", "lhs", "num2", "rhs" ), new Fields( "somenum", "somenum2" ) ); splice = new Every( splice, new First( new Fields( "value" ) ), new Fields( "somenum", "value" ) ); Flow countFlow = getPlatform().getFlowConnector().connect( sources, sink, splice ); }
/**
 * Hands {@code verify(...)} an assembly whose {@code Every} uses {@code third} as its
 * output selector, a name that is not among the source fields ({@code first},
 * {@code second}).
 *
 * NOTE(review): presumably {@code verify} expects the planner to reject this assembly;
 * confirm against the helper's definition.
 */
@Test
public void testEveryOutResolver() throws Exception
  {
  Tap source = getPlatform().getTabDelimitedFile( new Fields( "first", "second" ), "input/path", SinkMode.KEEP );
  Tap sink = getPlatform().getTabDelimitedFile( new Fields( "third", "fourth" ), "output/path", SinkMode.REPLACE );

  Pipe head = new Pipe( "test" );
  Pipe grouped = new GroupBy( head, new Fields( "first" ) );
  Pipe counted = new Every( grouped, new Fields( "second" ), new Count(), new Fields( "third" ) );

  verify( source, sink, counted );
  }
@Test public void testCoGroupWithResultGroupFieldsDefault() throws Exception { Tap sourceLower = new Hfs( new TextLine( new Fields( "offset", "line" ) ), "foo" ); Tap sourceUpper = new Hfs( new TextLine( new Fields( "offset", "line" ) ), "bar" ); Map sources = new HashMap(); sources.put( "lower", sourceLower ); sources.put( "upper", sourceUpper ); Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " ); // using null pos so all fields are written Tap sink = new Hfs( new TextLine(), "/complex/cogroup/", SinkMode.REPLACE ); Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter ); Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), splitter ); Pipe splice = new CoGroup( pipeLower, new Fields( "num" ), pipeUpper, new Fields( "num" ), new Fields( "num1", "lhs", "num2", "rhs" ) ); splice = new Every( splice, new First( new Fields( "value" ) ), new Fields( "num1", "value" ) ); Flow countFlow = getPlatform().getFlowConnector().connect( sources, sink, splice ); }
/**
 * Hands {@code verify(...)} an assembly whose {@code Every} uses {@code third} as its
 * argument selector, a name that is not among the source fields ({@code first},
 * {@code second}).
 *
 * NOTE(review): presumably {@code verify} expects the planner to reject this assembly;
 * confirm against the helper's definition.
 */
@Test
public void testEveryArgResolver() throws Exception
  {
  Tap source = getPlatform().getTabDelimitedFile( new Fields( "first", "second" ), "input/path", SinkMode.KEEP );
  Tap sink = getPlatform().getTabDelimitedFile( new Fields( "third", "fourth" ), "output/path", SinkMode.REPLACE );

  Pipe head = new Pipe( "test" );
  Pipe grouped = new GroupBy( head, new Fields( "first" ) );
  Pipe counted = new Every( grouped, new Fields( "third" ), new Count() );

  verify( source, sink, counted );
  }
@Test public void testCoGroupWithResultGroupFieldsDefault() throws Exception { Tap sourceLower = new Hfs( new TextLine( new Fields( "offset", "line" ) ), "foo" ); Tap sourceUpper = new Hfs( new TextLine( new Fields( "offset", "line" ) ), "bar" ); Map sources = new HashMap(); sources.put( "lower", sourceLower ); sources.put( "upper", sourceUpper ); Function splitter = new RegexSplitter( new Fields( "num", "char" ), " " ); // using null pos so all fields are written Tap sink = new Hfs( new TextLine(), "/complex/cogroup/", SinkMode.REPLACE ); Pipe pipeLower = new Each( new Pipe( "lower" ), new Fields( "line" ), splitter ); Pipe pipeUpper = new Each( new Pipe( "upper" ), new Fields( "line" ), splitter ); Pipe splice = new CoGroup( pipeLower, new Fields( "num" ), pipeUpper, new Fields( "num" ), new Fields( "num1", "lhs", "num2", "rhs" ) ); splice = new Every( splice, new First( new Fields( "value" ) ), new Fields( "num1", "value" ) ); Flow countFlow = getPlatform().getFlowConnector().connect( sources, sink, splice ); }
/**
 * Hands {@code verify(...)} an assembly whose {@code Every} uses {@code third} as its
 * argument selector, a name that is not among the source fields ({@code first},
 * {@code second}).
 *
 * NOTE(review): presumably {@code verify} expects the planner to reject this assembly;
 * confirm against the helper's definition.
 */
@Test
public void testEveryArgResolver() throws Exception
  {
  Tap source = getPlatform().getTabDelimitedFile( new Fields( "first", "second" ), "input/path", SinkMode.KEEP );
  Tap sink = getPlatform().getTabDelimitedFile( new Fields( "third", "fourth" ), "output/path", SinkMode.REPLACE );

  Pipe head = new Pipe( "test" );
  Pipe grouped = new GroupBy( head, new Fields( "first" ) );
  Pipe counted = new Every( grouped, new Fields( "third" ), new Count() );

  verify( source, sink, counted );
  }
/**
 * Runs a flow in which two chained {@code Every} counters ({@code count1},
 * {@code count2}) are followed by an {@code Each} that evaluates
 * {@code count1 + count2} into {@code sum}. The result is checked with
 * {@code validateLength( flow, 8, null )}.
 */
@Test
public void testChainEndingWithEach() throws Exception
  {
  getPlatform().copyFromLocal( inputFileApache );

  Tap source = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileApache );
  Tap sink = getPlatform().getTextFile( getOutputPath( "chaineach" ), SinkMode.REPLACE );

  Pipe head = new Pipe( "test" );
  Pipe ips = new Each( head, new Fields( "line" ), new RegexParser( new Fields( "ip" ), "^[^ ]*" ), new Fields( "ip" ) );
  Pipe grouped = new GroupBy( ips, new Fields( "ip" ) );

  // Two aggregators chained on the same grouping.
  Pipe firstCount = new Every( grouped, new Count( new Fields( "count1" ) ) );
  Pipe secondCount = new Every( firstCount, new Count( new Fields( "count2" ) ) );

  // The chain ends with an Each that combines both counts.
  ExpressionFunction sumCounts = new ExpressionFunction( new Fields( "sum" ), "count1 + count2", int.class );
  Pipe tail = new Each( secondCount, new Fields( "count1", "count2" ), sumCounts, Fields.ALL );

  Flow flow = getPlatform().getFlowConnector().connect( source, sink, tail );
  flow.complete();

  validateLength( flow, 8, null );
  }
/**
 * Asserts that pipes built on top of a {@code Pipe} named {@code "count"} report that
 * same name: first a {@code GroupBy} + {@code Every} chain, then an {@code Each}.
 */
@Test
public void testName()
  {
  Pipe count = new Pipe( "count" );

  Pipe grouped = new GroupBy( count, new Fields( 1 ) );
  Pipe aggregated = new Every( grouped, new Fields( 1 ), new Count(), new Fields( 0, 1 ) );

  assertEquals( "not equal: count.getName()", "count", count.getName() );
  assertEquals( "not equal: pipe.getName()", "count", aggregated.getName() );

  Pipe split = new Each( count, new Fields( 1 ), new RegexSplitter( Fields.size( 2 ) ) );

  assertEquals( "not equal: pipe.getName()", "count", split.getName() );
  }
/**
 * Runs a flow that parses {@code ip} out of {@code line}, groups by {@code ip}, and
 * chains four {@code Every} counters ({@code count1} .. {@code count4}) on that
 * grouping. The result is checked with {@code validateLength( flow, 8, 5 )}.
 */
@Test
public void testSimpleChain() throws Exception
  {
  getPlatform().copyFromLocal( inputFileApache );

  Tap source = getPlatform().getTextFile( new Fields( "offset", "line" ), inputFileApache );

  Pipe pipe = new Pipe( "test" );
  pipe = new Each( pipe, new Fields( "line" ), new RegexParser( new Fields( "ip" ), "^[^ ]*" ), new Fields( "ip" ) );
  pipe = new GroupBy( pipe, new Fields( "ip" ) );

  // Chain one Count aggregator per name, in this order.
  String[] countFields = {"count1", "count2", "count3", "count4"};

  for( String countField : countFields )
    {
    pipe = new Every( pipe, new Count( new Fields( countField ) ) );
    }

  Tap sink = getPlatform().getTabDelimitedFile( Fields.ALL, getOutputPath( "simplechain" ), SinkMode.REPLACE );

  Flow flow = getPlatform().getFlowConnector().connect( source, sink, pipe );
  flow.complete();

  validateLength( flow, 8, 5 );
  }
/**
 * Asserts that pipes built on top of a {@code Pipe} named {@code "count"} report that
 * same name: first a {@code GroupBy} + {@code Every} chain, then an {@code Each}.
 */
@Test
public void testName()
  {
  Pipe count = new Pipe( "count" );

  Pipe grouped = new GroupBy( count, new Fields( 1 ) );
  Pipe aggregated = new Every( grouped, new Fields( 1 ), new Count(), new Fields( 0, 1 ) );

  assertEquals( "not equal: count.getName()", "count", count.getName() );
  assertEquals( "not equal: pipe.getName()", "count", aggregated.getName() );

  Pipe split = new Each( count, new Fields( 1 ), new RegexSplitter( Fields.size( 2 ) ) );

  assertEquals( "not equal: pipe.getName()", "count", split.getName() );
  }