private Pipe getBloomFilterPipe(Pipe largePipe, Fields largeJoinFields, Pipe smallPipe, Fields smallJoinFields) throws IOException { String bloomJobID = UUID.randomUUID().toString(); Path bloomTempDir = FileSystemHelper.getRandomTemporaryPath("/tmp/bloom_tmp/"); String bloomPartsDir = bloomTempDir + "/parts"; String bloomFinalFilter = bloomTempDir + "/filter.bloomfilter"; String approxCountPartsDir = bloomTempDir + "/approx_distinct_keys_parts/"; Pipe filterPipe; smallPipe = new Each(smallPipe, smallJoinFields, new GetSerializedTuple()); smallPipe = new CreateBloomFilter(smallPipe, bloomJobID, approxCountPartsDir, bloomPartsDir, "serialized-tuple-key"); // This is a bit of a hack to: // 1) Force a dependency on the operations performed on RHS above (can't continue until they're done) // 2) Bind RHS to the flow, which wouldn't happen otherwise. // Note that RHS has no output, so there shouldn't be any danger in doing this. filterPipe = new NaiveMerge(largePipe.getName(), largePipe, smallPipe); // Load the bloom filter into memory and apply it to the LHS. filterPipe = new Each(filterPipe, largeJoinFields, new BloomJoinFilter(bloomJobID, false)); ConfigDef config = filterPipe.getStepConfigDef(); // tell BloomAssemblyStrategy which bloom filter to expect config.setProperty(BloomProps.SOURCE_BLOOM_FILTER_ID, bloomJobID); config.setProperty(BloomProps.REQUIRED_BLOOM_FILTER_PATH, bloomFinalFilter); return filterPipe; }
@Test public void testSubAssemblyConfigDef() throws IOException { getPlatform().copyFromLocal( inputFileNums20 ); Tap source = getPlatform().getTextFile( new Fields( "line" ), inputFileNums20 ); Pipe pipe = new Pipe( "test" ); pipe = new ConfigSubAssembly( pipe, getPlatform().isDAG() ); pipe.getConfigDef().setProperty( Mode.DEFAULT, "default", "pipe-default" ); // steps on above value pipe.getStepConfigDef().setProperty( Mode.DEFAULT, "default", "process-default" ); pipe.getConfigDef().setProperty( Mode.DEFAULT, "replace", "pipe-default" ); pipe.getConfigDef().setProperty( Mode.REPLACE, "replace", "pipe-replace" ); pipe.getNodeConfigDef().setProperty( Mode.REPLACE, "default-node", "node-replace" ); pipe.getStepConfigDef().setProperty( Mode.DEFAULT, "replace", "process-default" ); pipe.getStepConfigDef().setProperty( Mode.REPLACE, "replace", "process-replace" ); pipe.getStepConfigDef().setProperty( Mode.DEFAULT, "default-node", "process-default" ); Tap sink = getPlatform().getTextFile( getOutputPath( "subassembly-configdef" ), SinkMode.REPLACE ); Flow flow = getPlatform().getFlowConnector().connect( source, sink, pipe ); flow.complete(); assertTrue( flow.resourceExists( sink ) ); } }
@Test public void testSubAssemblyConfigDef() throws IOException { getPlatform().copyFromLocal( inputFileNums20 ); Tap source = getPlatform().getTextFile( new Fields( "line" ), inputFileNums20 ); Pipe pipe = new Pipe( "test" ); pipe = new ConfigSubAssembly( pipe, getPlatform().isDAG() ); pipe.getConfigDef().setProperty( Mode.DEFAULT, "default", "pipe-default" ); // steps on above value pipe.getStepConfigDef().setProperty( Mode.DEFAULT, "default", "process-default" ); pipe.getConfigDef().setProperty( Mode.DEFAULT, "replace", "pipe-default" ); pipe.getConfigDef().setProperty( Mode.REPLACE, "replace", "pipe-replace" ); pipe.getNodeConfigDef().setProperty( Mode.REPLACE, "default-node", "node-replace" ); pipe.getStepConfigDef().setProperty( Mode.DEFAULT, "replace", "process-default" ); pipe.getStepConfigDef().setProperty( Mode.REPLACE, "replace", "process-replace" ); pipe.getStepConfigDef().setProperty( Mode.DEFAULT, "default-node", "process-default" ); Tap sink = getPlatform().getTextFile( getOutputPath( "subassembly-configdef" ), SinkMode.REPLACE ); Flow flow = getPlatform().getFlowConnector().connect( source, sink, pipe ); flow.complete(); assertTrue( flow.resourceExists( sink ) ); } }
@Test public void testPipeConfigDef() throws IOException { getPlatform().copyFromLocal( inputFileNums20 ); Tap source = getPlatform().getTextFile( new Fields( "line" ), inputFileNums20 ); Pipe pipe = new Pipe( "test" ); pipe = new Each( pipe, new IterateInsert( new Fields( "value" ), getPlatform().isDAG() ), Fields.ALL ); pipe.getConfigDef().setProperty( Mode.DEFAULT, "default", "pipe-default" ); // steps on above value pipe.getStepConfigDef().setProperty( Mode.DEFAULT, "default", "process-default" ); pipe.getConfigDef().setProperty( Mode.DEFAULT, "replace", "pipe-default" ); pipe.getConfigDef().setProperty( Mode.REPLACE, "replace", "pipe-replace" ); pipe.getNodeConfigDef().setProperty( Mode.REPLACE, "default-node", "node-replace" ); pipe.getStepConfigDef().setProperty( Mode.DEFAULT, "replace", "process-default" ); pipe.getStepConfigDef().setProperty( Mode.REPLACE, "replace", "process-replace" ); pipe.getStepConfigDef().setProperty( Mode.DEFAULT, "default-node", "process-default" ); Tap sink = getPlatform().getTextFile( getOutputPath( "configdef" ), SinkMode.REPLACE ); Flow flow = getPlatform().getFlowConnector().connect( source, sink, pipe ); flow.complete(); assertTrue( flow.resourceExists( sink ) ); }
/**
 * Builds the sub-assembly that turns a pipe of keys into bloom filter parts.
 *
 * <p>Stages, in order: {@code GetIndices} over {@code keyBytesField} (declaring
 * {@code split}, {@code index}, {@code hash_num}), {@code Unique.FilterPartialDuplicates} over
 * those three fields, a {@code GroupBy} on {@code split}, and {@code CreateBloomFilterFromIndices}
 * over {@code index} and {@code hash_num}. The last stage becomes the tail of this assembly and
 * its step config carries the three bloom properties.
 *
 * @param keys                incoming pipe holding the keys
 * @param bloomFilterID       value stored under {@code BloomProps.TARGET_BLOOM_FILTER_ID}
 * @param approxCountPartsDir value stored under {@code BloomProps.BLOOM_KEYS_COUNTS_DIR}
 * @param bloomPartsDir       value stored under {@code BloomProps.BLOOM_FILTER_PARTS_DIR}
 * @param keyBytesField       name of the field passed to {@code GetIndices}
 * @param hashFactory         factory handed to {@code GetIndices}
 * @throws IOException declared by this constructor; the raising call is not visible from here
 */
public CreateBloomFilter(Pipe keys, String bloomFilterID, String approxCountPartsDir, String bloomPartsDir, String keyBytesField, HashFunctionFactory hashFactory) throws IOException {
  super(keys);

  // Expand each key into (split, index, hash_num) tuples.
  Pipe indexed = new Each(
      keys,
      new Fields(keyBytesField),
      new GetIndices(hashFactory),
      new Fields("split", "index", "hash_num"));

  // Filter duplicate (split, index, hash_num) tuples ahead of the grouping.
  Pipe deduplicated = new Each(
      indexed,
      new Fields("split", "index", "hash_num"),
      new Unique.FilterPartialDuplicates());

  // Group by split and aggregate the indices of each group.
  Pipe groupedBySplit = new GroupBy(deduplicated, new Fields("split"));
  Pipe filterParts = new Every(
      groupedBySplit,
      new Fields("index", "hash_num"),
      new CreateBloomFilterFromIndices(),
      Fields.ALL);

  // Attach the bloom properties to the step config of the final stage.
  ConfigDef stepConfig = filterParts.getStepConfigDef();
  stepConfig.setProperty(BloomProps.BLOOM_FILTER_PARTS_DIR, bloomPartsDir);
  stepConfig.setProperty(BloomProps.BLOOM_KEYS_COUNTS_DIR, approxCountPartsDir);
  stepConfig.setProperty(BloomProps.TARGET_BLOOM_FILTER_ID, bloomFilterID);

  setTails(filterParts);
}
@Test public void testPipeConfigDef() throws IOException { getPlatform().copyFromLocal( inputFileNums20 ); Tap source = getPlatform().getTextFile( new Fields( "line" ), inputFileNums20 ); Pipe pipe = new Pipe( "test" ); pipe = new Each( pipe, new IterateInsert( new Fields( "value" ), getPlatform().isDAG() ), Fields.ALL ); pipe.getConfigDef().setProperty( Mode.DEFAULT, "default", "pipe-default" ); // steps on above value pipe.getStepConfigDef().setProperty( Mode.DEFAULT, "default", "process-default" ); pipe.getConfigDef().setProperty( Mode.DEFAULT, "replace", "pipe-default" ); pipe.getConfigDef().setProperty( Mode.REPLACE, "replace", "pipe-replace" ); pipe.getNodeConfigDef().setProperty( Mode.REPLACE, "default-node", "node-replace" ); pipe.getStepConfigDef().setProperty( Mode.DEFAULT, "replace", "process-default" ); pipe.getStepConfigDef().setProperty( Mode.REPLACE, "replace", "process-replace" ); pipe.getStepConfigDef().setProperty( Mode.DEFAULT, "default-node", "process-default" ); Tap sink = getPlatform().getTextFile( getOutputPath( "configdef" ), SinkMode.REPLACE ); Flow flow = getPlatform().getFlowConnector().connect( source, sink, pipe ); flow.complete(); assertTrue( flow.resourceExists( sink ) ); }